Internal change
PiperOrigin-RevId: 245658213
This commit is contained in:
parent
84c5a4551e
commit
932874df5f
@ -766,6 +766,22 @@ class DatasetBaseIterator : public IteratorBase {
|
|||||||
return model::MakeUnknownNode(std::move(args));
|
return model::MakeUnknownNode(std::move(args));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// When modeling is enabled, this method disables autotuning for the given
|
||||||
|
// iterator (and the transitive closure of its inputs).
|
||||||
|
void DisableAutotune(IteratorContext* ctx, IteratorBase* iterator) {
|
||||||
|
if (iterator->node_) {
|
||||||
|
iterator->node_->set_autotune(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// When modeling is enabled, this method enables autotuning for the given
|
||||||
|
// iterator (and the transitive closure of its inputs).
|
||||||
|
void EnableAutotune(IteratorContext* ctx, IteratorBase* iterator) {
|
||||||
|
if (iterator->node_) {
|
||||||
|
iterator->node_->set_autotune(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// When modeling is enabled, this method records the fact that this iterator
|
// When modeling is enabled, this method records the fact that this iterator
|
||||||
// has dequeued an element from an internal buffer.
|
// has dequeued an element from an internal buffer.
|
||||||
void RecordBufferDequeue(IteratorContext* ctx,
|
void RecordBufferDequeue(IteratorContext* ctx,
|
||||||
|
@ -41,7 +41,8 @@ namespace {
|
|||||||
// The formula used for computing the probability is derived by modeling the
|
// The formula used for computing the probability is derived by modeling the
|
||||||
// problem as an M/M/1/K queue
|
// problem as an M/M/1/K queue
|
||||||
// (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue).
|
// (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue).
|
||||||
int64 ComputeWaitTime(int64 output_time, int64 input_time, int64 buffer_size) {
|
double ComputeWaitTime(double output_time, double input_time,
|
||||||
|
int64 buffer_size) {
|
||||||
if (output_time == 0 || input_time == 0) {
|
if (output_time == 0 || input_time == 0) {
|
||||||
return output_time;
|
return output_time;
|
||||||
}
|
}
|
||||||
@ -75,34 +76,40 @@ class InterleaveMany : public Node {
|
|||||||
Args{id_, name_, std::move(output)});
|
Args{id_, name_, std::move(output)});
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
// The output time is the sum of the self processing time and the average
|
||||||
|
// output time of inputs comprising the interleave "cycle".
|
||||||
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (inputs_.size() <= 1) {
|
if (inputs_.size() <= 1) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
int64 delta = NanosPerElementLocked() * (inputs_.size() - 1);
|
double delta = SelfProcessingTimeLocked() * (inputs_.size() - 1);
|
||||||
input_times->back() += delta;
|
input_times->back() += delta;
|
||||||
auto cleanup = gtl::MakeCleanup(
|
auto cleanup = gtl::MakeCleanup(
|
||||||
[input_times, delta]() { input_times->back() -= delta; });
|
[input_times, delta]() { input_times->back() -= delta; });
|
||||||
int64 output_time =
|
double output_time = (OutputTimeForInputs(input_times) -
|
||||||
static_cast<double>(OutputTimeForInputs(input_times) -
|
inputs_.front()->OutputTime(input_times)) /
|
||||||
inputs_.front()->OutputTime(input_times)) /
|
static_cast<double>(inputs_.size() - 1);
|
||||||
static_cast<double>(inputs_.size() - 1);
|
return SelfProcessingTimeLocked() + output_time;
|
||||||
return NanosPerElementLocked() + output_time;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
// The processing time is the sum of the self processing time and the average
|
||||||
|
// processing time of inputs comprising the interleave "cycle".
|
||||||
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (inputs_.size() <= 1) {
|
if (inputs_.size() <= 1) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
int64 processing_time =
|
double processing_time =
|
||||||
static_cast<double>(ProcessingTimeForInputs() -
|
(ProcessingTimeForInputs() - inputs_.front()->TotalProcessingTime()) /
|
||||||
inputs_.front()->ProcessingTime()) /
|
|
||||||
static_cast<double>(inputs_.size() - 1);
|
static_cast<double>(inputs_.size() - 1);
|
||||||
return NanosPerElementLocked() + processing_time;
|
return SelfProcessingTimeLocked() + processing_time;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// The first input of AsyncInterleaveMany corresponds to the input dataset whose
|
||||||
|
// elements are used to create the (derived) input datasets whose elements are
|
||||||
|
// interleaved as output.
|
||||||
|
//
|
||||||
// TODO(jsimsa): model the first input
|
// TODO(jsimsa): model the first input
|
||||||
class AsyncInterleaveMany : public Node {
|
class AsyncInterleaveMany : public Node {
|
||||||
public:
|
public:
|
||||||
@ -127,14 +134,19 @@ class AsyncInterleaveMany : public Node {
|
|||||||
Args{id_, name_, std::move(output)}, parameters);
|
Args{id_, name_, std::move(output)}, parameters);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
// The output time is estimated using `ComputeWaitTime(output_time,
|
||||||
|
// input_time, parallelism)`, where `output_time` is the sum of the
|
||||||
|
// self-processing time and the average output time of inputs comprising the
|
||||||
|
// interleave "cycle", `input_time` is specified through `input_times` and
|
||||||
|
// `buffer_size` is derived from parallelism.
|
||||||
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (inputs_.size() <= 1) {
|
if (inputs_.size() <= 1) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
int64 old_input_time = input_times->back();
|
double old_input_time = input_times->back();
|
||||||
int64 new_input_time = static_cast<double>(NanosPerElementLocked()) *
|
double new_input_time =
|
||||||
static_cast<double>(inputs_.size() - 1);
|
SelfProcessingTimeLocked() * static_cast<double>(inputs_.size() - 1);
|
||||||
input_times->push_back(new_input_time);
|
input_times->push_back(new_input_time);
|
||||||
auto cleanup =
|
auto cleanup =
|
||||||
gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
|
gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
|
||||||
@ -143,23 +155,23 @@ class AsyncInterleaveMany : public Node {
|
|||||||
parallelism = std::min(static_cast<int>(parallelism),
|
parallelism = std::min(static_cast<int>(parallelism),
|
||||||
static_cast<int>((*parameter)->value));
|
static_cast<int>((*parameter)->value));
|
||||||
}
|
}
|
||||||
int64 output_time =
|
double output_time = (OutputTimeForInputs(input_times) -
|
||||||
static_cast<double>(OutputTimeForInputs(input_times) -
|
inputs_.front()->OutputTime(input_times)) /
|
||||||
inputs_.front()->OutputTime(input_times)) /
|
static_cast<double>(num_inputs() - 1) / parallelism;
|
||||||
static_cast<double>(inputs_.size() - 1) / parallelism;
|
return ComputeWaitTime(SelfProcessingTimeLocked() + output_time,
|
||||||
return ComputeWaitTime(NanosPerElementLocked() + output_time,
|
|
||||||
old_input_time, parallelism);
|
old_input_time, parallelism);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
// The processing time is the sum of the self processing time and the average
|
||||||
|
// processing time of inputs comprising the interleave "cycle".
|
||||||
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (inputs_.size() <= 1) {
|
if (inputs_.size() <= 1) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
int64 processing_time =
|
double processing_time =
|
||||||
ProcessingTimeForInputs() - inputs_.front()->ProcessingTime();
|
ProcessingTimeForInputs() - inputs_.front()->TotalProcessingTime();
|
||||||
return NanosPerElementLocked() +
|
return SelfProcessingTimeLocked() +
|
||||||
static_cast<double>(processing_time) /
|
processing_time / static_cast<double>(num_inputs() - 1);
|
||||||
static_cast<double>(inputs_.size() - 1);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -176,22 +188,27 @@ class KnownRatio : public Node {
|
|||||||
ratio_);
|
ratio_);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
// The output time is the sum of the self processing time and the product of
|
||||||
|
// `ratio_` and the sum of output times of inputs.
|
||||||
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (ratio_ == 0) {
|
if (ratio_ == 0) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
int64 old_input_time = input_times->back();
|
double old_input_time = input_times->back();
|
||||||
input_times->back() += static_cast<int64>(
|
input_times->back() +=
|
||||||
static_cast<double>(old_input_time + NanosPerElementLocked()) / ratio_);
|
(old_input_time + SelfProcessingTimeLocked()) / ratio_;
|
||||||
auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
|
auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
|
||||||
input_times->back() = old_input_time;
|
input_times->back() = old_input_time;
|
||||||
});
|
});
|
||||||
return NanosPerElementLocked() + ratio_ * OutputTimeForInputs(input_times);
|
return SelfProcessingTimeLocked() +
|
||||||
|
ratio_ * OutputTimeForInputs(input_times);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
// The processing time is the sum of the self processing time and the product
|
||||||
return NanosPerElementLocked() + ratio_ * ProcessingTimeForInputs();
|
// of `ratio_` and the sum of processing times of inputs.
|
||||||
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
|
return SelfProcessingTimeLocked() + ratio_ * ProcessingTimeForInputs();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -221,31 +238,35 @@ class AsyncKnownRatio : public Node {
|
|||||||
Args{id_, name_, std::move(output)}, ratio_, parameters);
|
Args{id_, name_, std::move(output)}, ratio_, parameters);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
// The output time is estimated using `ComputeWaitTime(output_time,
|
||||||
|
// input_time, parallelism)`, where `output_time` is the sum of the self
|
||||||
|
// processing time and the product of `ratio_` and the sum of output times of
|
||||||
|
// inputs, `input_time` is specified through `input_times` and `buffer_size`
|
||||||
|
// is derived from parallelism.
|
||||||
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
double parallelism = 1.0;
|
double parallelism = 1.0;
|
||||||
if (auto* parameter = gtl::FindOrNull(parameters_, "parallelism")) {
|
if (auto* parameter = gtl::FindOrNull(parameters_, "parallelism")) {
|
||||||
parallelism = (*parameter)->value;
|
parallelism = (*parameter)->value;
|
||||||
}
|
}
|
||||||
if (ratio_ == 0.0) {
|
if (ratio_ == 0.0) {
|
||||||
int64 output_time =
|
double output_time = SelfProcessingTimeLocked() / parallelism;
|
||||||
static_cast<double>(NanosPerElementLocked()) / parallelism;
|
|
||||||
return ComputeWaitTime(output_time, input_times->back(), parallelism);
|
return ComputeWaitTime(output_time, input_times->back(), parallelism);
|
||||||
}
|
}
|
||||||
int64 old_input_time = input_times->back();
|
double old_input_time = input_times->back();
|
||||||
int64 new_input_time = static_cast<int64>(
|
double new_input_time = SelfProcessingTimeLocked() / ratio_ / parallelism;
|
||||||
static_cast<double>(NanosPerElementLocked()) / ratio_ / parallelism);
|
|
||||||
input_times->push_back(new_input_time);
|
input_times->push_back(new_input_time);
|
||||||
auto cleanup =
|
auto cleanup =
|
||||||
gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
|
gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
|
||||||
int64 output_time = static_cast<int64>(
|
double output_time = SelfProcessingTimeLocked() / parallelism +
|
||||||
static_cast<double>(NanosPerElementLocked()) / parallelism +
|
ratio_ * OutputTimeForInputs(input_times);
|
||||||
ratio_ * OutputTimeForInputs(input_times));
|
|
||||||
return ComputeWaitTime(output_time, old_input_time, parallelism);
|
return ComputeWaitTime(output_time, old_input_time, parallelism);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
// The processing time is the sum of the self processing time and the product
|
||||||
return NanosPerElementLocked() + ratio_ * ProcessingTimeForInputs();
|
// of `ratio_` and the sum of processing times of inputs.
|
||||||
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
|
return SelfProcessingTimeLocked() + ratio_ * ProcessingTimeForInputs();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -264,40 +285,40 @@ class UnknownRatio : public Node {
|
|||||||
return std::make_shared<UnknownRatio>(Args{id_, name_, std::move(output)});
|
return std::make_shared<UnknownRatio>(Args{id_, name_, std::move(output)});
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
// The output time is the sum of the self processing time and the product of
|
||||||
|
// the ratio estimate and the sum of output times of inputs.
|
||||||
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (num_elements_ == 0 || inputs_.empty() ||
|
if (num_elements_ == 0 || inputs_.empty() ||
|
||||||
inputs_.front()->num_elements() == 0) {
|
inputs_.front()->num_elements() == 0) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
// TODO(jsimsa): The current implementation assumes that the number of input
|
// TODO(jsimsa): The current implementation assumes that the number of input
|
||||||
// elements consumed per output is the same across all inputs.
|
// elements consumed per output is the same across all inputs.
|
||||||
std::shared_ptr<Node> input = inputs_.front();
|
std::shared_ptr<Node> input = inputs_.front();
|
||||||
double ratio = static_cast<double>(input->num_elements()) /
|
double ratio = static_cast<double>(input->num_elements()) /
|
||||||
static_cast<double>(num_elements_);
|
static_cast<double>(num_elements_);
|
||||||
int64 old_input_time = input_times->back();
|
double old_input_time = input_times->back();
|
||||||
input_times->back() =
|
input_times->back() = (old_input_time + SelfProcessingTimeLocked()) / ratio;
|
||||||
static_cast<double>(old_input_time + NanosPerElementLocked()) / ratio;
|
|
||||||
auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
|
auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
|
||||||
input_times->back() = old_input_time;
|
input_times->back() = old_input_time;
|
||||||
});
|
});
|
||||||
return NanosPerElementLocked() +
|
return SelfProcessingTimeLocked() +
|
||||||
static_cast<int64>(
|
ratio * OutputTimeForInputs(input_times);
|
||||||
ratio * static_cast<double>(OutputTimeForInputs(input_times)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
// The processing time is the sum of the self processing time and the product
|
||||||
|
// of the ratio estimate and the sum of processing times of inputs.
|
||||||
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
if (inputs_.empty() || num_elements_ == 0) {
|
if (inputs_.empty() || num_elements_ == 0) {
|
||||||
return NanosPerElementLocked();
|
return SelfProcessingTimeLocked();
|
||||||
}
|
}
|
||||||
// TODO(jsimsa): The current implementation that the number of input
|
// TODO(jsimsa): The current implementation assumes that the number of input
|
||||||
// elements consumed per output is the same across all inputs.
|
// elements consumed per output is the same across all inputs.
|
||||||
std::shared_ptr<Node> input = inputs_.front();
|
std::shared_ptr<Node> input = inputs_.front();
|
||||||
double ratio = static_cast<double>(input->num_elements()) /
|
double ratio = static_cast<double>(input->num_elements()) /
|
||||||
static_cast<double>(num_elements_);
|
static_cast<double>(num_elements_);
|
||||||
return NanosPerElementLocked() +
|
return SelfProcessingTimeLocked() + ratio * ProcessingTimeForInputs();
|
||||||
static_cast<int64>(ratio *
|
|
||||||
static_cast<double>(ProcessingTimeForInputs()));
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -313,12 +334,14 @@ class Unknown : public Node {
|
|||||||
return std::make_shared<Unknown>(Args{id_, name_, std::move(output)});
|
return std::make_shared<Unknown>(Args{id_, name_, std::move(output)});
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
// The output time is the sum of output times of inputs.
|
||||||
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
return OutputTimeForInputs(input_times);
|
return OutputTimeForInputs(input_times);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
// The processing time is the sum of processing times of inputs.
|
||||||
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
return ProcessingTimeForInputs();
|
return ProcessingTimeForInputs();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -415,7 +438,7 @@ void Model::Optimize(int64 cpu_budget) {
|
|||||||
snapshot = output_->Snapshot(nullptr);
|
snapshot = output_->Snapshot(nullptr);
|
||||||
}
|
}
|
||||||
VLOG(2) << "Starting optimization of tunable parameters";
|
VLOG(2) << "Starting optimization of tunable parameters";
|
||||||
const int64 processing_time = ProcessingTime(snapshot);
|
const int64 processing_time = TotalProcessingTime(snapshot);
|
||||||
auto parameters = CollectTunableParameters(snapshot);
|
auto parameters = CollectTunableParameters(snapshot);
|
||||||
for (auto& pair : parameters) {
|
for (auto& pair : parameters) {
|
||||||
pair.second->value = 1;
|
pair.second->value = 1;
|
||||||
@ -441,13 +464,6 @@ void Model::Optimize(int64 cpu_budget) {
|
|||||||
pair.second->value++;
|
pair.second->value++;
|
||||||
int64 new_output_time = OutputTime(snapshot);
|
int64 new_output_time = OutputTime(snapshot);
|
||||||
int64 delta = output_time - new_output_time;
|
int64 delta = output_time - new_output_time;
|
||||||
if (delta < 0) {
|
|
||||||
VLOG(3) << "Increasing the parallelism of tunable parameter "
|
|
||||||
<< pair.first << " resulted in slowdown (before=" << output_time
|
|
||||||
<< ", after=" << new_output_time
|
|
||||||
<< "). This should never happen because the latency "
|
|
||||||
"should be monotonic w.r.t. to parallelism.";
|
|
||||||
}
|
|
||||||
if (delta > best_delta) {
|
if (delta > best_delta) {
|
||||||
best_delta = delta;
|
best_delta = delta;
|
||||||
best_parameter = pair.second.get();
|
best_parameter = pair.second.get();
|
||||||
@ -455,11 +471,10 @@ void Model::Optimize(int64 cpu_budget) {
|
|||||||
pair.second->value--;
|
pair.second->value--;
|
||||||
}
|
}
|
||||||
if (!best_parameter) {
|
if (!best_parameter) {
|
||||||
// This should never happen because we are using a model snapshot and
|
|
||||||
// the output time is monotonically decreasing w.r.t. parallelism.
|
|
||||||
LOG(WARNING) << "Failed to find a tunable parameter that would "
|
LOG(WARNING) << "Failed to find a tunable parameter that would "
|
||||||
"decrease the output time, aborting the current "
|
"decrease the output time. This means that the "
|
||||||
"optimization attempt.";
|
"autotuning optimization got stuck in a local maximum. "
|
||||||
|
"The optimization attempt will be aborted.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
best_parameter->value++;
|
best_parameter->value++;
|
||||||
@ -537,12 +552,18 @@ std::map<string, std::shared_ptr<Parameter>> Model::CollectTunableParameters(
|
|||||||
}
|
}
|
||||||
|
|
||||||
int64 Model::OutputTime(std::shared_ptr<Node> node) {
|
int64 Model::OutputTime(std::shared_ptr<Node> node) {
|
||||||
std::vector<int64> input_times(1, 0);
|
std::vector<double> input_times(1, 0);
|
||||||
|
// TODO(jsimsa): Now that we are accounting for buffer size in wait time
|
||||||
|
// computation, assuming that the input is infinitely fast will result in
|
||||||
|
// inaccurate estimates of the output latency.
|
||||||
|
//
|
||||||
|
// We should compute the output latency as a fix-point of the following
|
||||||
|
// equation: `output_time = node->OutputTime(input_times(1, output_time))`.
|
||||||
return node->OutputTime(&input_times);
|
return node->OutputTime(&input_times);
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 Model::ProcessingTime(std::shared_ptr<Node> node) {
|
int64 Model::TotalProcessingTime(std::shared_ptr<Node> node) {
|
||||||
return node->ProcessingTime();
|
return node->TotalProcessingTime();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace model
|
} // namespace model
|
||||||
|
@ -138,6 +138,12 @@ class Node {
|
|||||||
processing_time_ += delta;
|
processing_time_ += delta;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns an indication whether autotuning is enabled for this node.
|
||||||
|
bool autotune() const LOCKS_EXCLUDED(mu_) {
|
||||||
|
tf_shared_lock l(mu_);
|
||||||
|
return autotune_;
|
||||||
|
}
|
||||||
|
|
||||||
// Returns the number of bytes stored in this node's buffer.
|
// Returns the number of bytes stored in this node's buffer.
|
||||||
int64 buffered_bytes() const LOCKS_EXCLUDED(mu_) {
|
int64 buffered_bytes() const LOCKS_EXCLUDED(mu_) {
|
||||||
tf_shared_lock l(mu_);
|
tf_shared_lock l(mu_);
|
||||||
@ -215,11 +221,20 @@ class Node {
|
|||||||
inputs_.remove(input);
|
inputs_.remove(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sets the value that determines whether autotuning is enabled for this node.
|
||||||
|
void set_autotune(bool autotune) LOCKS_EXCLUDED(mu_) {
|
||||||
|
mutex_lock l(mu_);
|
||||||
|
autotune_ = autotune;
|
||||||
|
}
|
||||||
|
|
||||||
// Collects tunable parameters in the subtree rooted in this node.
|
// Collects tunable parameters in the subtree rooted in this node.
|
||||||
void CollectTunableParameters(
|
void CollectTunableParameters(
|
||||||
std::map<string, std::shared_ptr<Parameter>>* parameters) const
|
std::map<string, std::shared_ptr<Parameter>>* parameters) const
|
||||||
LOCKS_EXCLUDED(mu_) {
|
LOCKS_EXCLUDED(mu_) {
|
||||||
tf_shared_lock l(mu_);
|
tf_shared_lock l(mu_);
|
||||||
|
if (!autotune_) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
for (auto& pair : parameters_) {
|
for (auto& pair : parameters_) {
|
||||||
if (pair.second->state->tunable) {
|
if (pair.second->state->tunable) {
|
||||||
parameters->insert(std::make_pair(long_name(), pair.second));
|
parameters->insert(std::make_pair(long_name(), pair.second));
|
||||||
@ -230,17 +245,31 @@ class Node {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the per-element output time for this node.
|
// Returns a human-readable representation of this node.
|
||||||
int64 OutputTime(std::vector<int64>* input_times) const LOCKS_EXCLUDED(mu_) {
|
string DebugString() const LOCKS_EXCLUDED(mu_) {
|
||||||
tf_shared_lock l(mu_);
|
tf_shared_lock l(mu_);
|
||||||
return OutputTimeLocked(input_times);
|
string result;
|
||||||
|
strings::StrAppend(&result, long_name(), ":\n");
|
||||||
|
strings::StrAppend(&result, " autotune=", autotune_, "\n");
|
||||||
|
strings::StrAppend(&result, " buffered_bytes=", buffered_bytes_, "\n");
|
||||||
|
strings::StrAppend(&result, " processing_time=", processing_time_, "\n");
|
||||||
|
strings::StrAppend(&result, " num_elements=", num_elements_, "\n");
|
||||||
|
string inputs;
|
||||||
|
for (auto& input : inputs_) {
|
||||||
|
strings::StrAppend(&inputs, input->long_name(), ",");
|
||||||
|
}
|
||||||
|
strings::StrAppend(&result, " inputs={", inputs, "}\n");
|
||||||
|
for (auto& input : inputs_) {
|
||||||
|
strings::StrAppend(&result, input->DebugString());
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the per-element processing time spent in the subtree rooted in
|
// Returns the per-element output time for this node.
|
||||||
// this node.
|
double OutputTime(std::vector<double>* input_times) const
|
||||||
int64 ProcessingTime() const LOCKS_EXCLUDED(mu_) {
|
LOCKS_EXCLUDED(mu_) {
|
||||||
tf_shared_lock l(mu_);
|
tf_shared_lock l(mu_);
|
||||||
return ProcessingTimeLocked();
|
return OutputTimeLocked(input_times);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns a copy of this node, making a deep copy of its inputs and a
|
// Returns a copy of this node, making a deep copy of its inputs and a
|
||||||
@ -254,6 +283,7 @@ class Node {
|
|||||||
std::shared_ptr<Node> result = Clone(output);
|
std::shared_ptr<Node> result = Clone(output);
|
||||||
{
|
{
|
||||||
mutex_lock l2(result->mu_);
|
mutex_lock l2(result->mu_);
|
||||||
|
result->autotune_ = autotune_;
|
||||||
result->buffered_bytes_ = buffered_bytes_;
|
result->buffered_bytes_ = buffered_bytes_;
|
||||||
result->processing_time_ = processing_time_;
|
result->processing_time_ = processing_time_;
|
||||||
result->num_elements_ = num_elements_;
|
result->num_elements_ = num_elements_;
|
||||||
@ -265,57 +295,90 @@ class Node {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the per-element CPU time spent in the subtree rooted in this node.
|
||||||
|
double TotalProcessingTime() const LOCKS_EXCLUDED(mu_) {
|
||||||
|
tf_shared_lock l(mu_);
|
||||||
|
return TotalProcessingTimeLocked();
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
// Returns the number of inputs.
|
||||||
|
int64 num_inputs() const SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
|
int64 num_inputs = 0;
|
||||||
|
for (auto& input : inputs_) {
|
||||||
|
// Inputs for which autotuning is disabled are excluded.
|
||||||
|
if (input->autotune()) {
|
||||||
|
++num_inputs;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return num_inputs;
|
||||||
|
}
|
||||||
|
|
||||||
// Creates a clone of this node.
|
// Creates a clone of this node.
|
||||||
virtual std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const
|
virtual std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const
|
||||||
SHARED_LOCKS_REQUIRED(mu_) = 0;
|
SHARED_LOCKS_REQUIRED(mu_) = 0;
|
||||||
|
|
||||||
// Returns the per-element processing time spent in this node.
|
|
||||||
int64 NanosPerElementLocked() const SHARED_LOCKS_REQUIRED(mu_) {
|
|
||||||
if (num_elements_ == 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return static_cast<int64>(static_cast<double>(processing_time_) /
|
|
||||||
static_cast<double>(num_elements_));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the sum of per-element output time for the inputs of this node.
|
// Returns the sum of per-element output time for the inputs of this node.
|
||||||
int64 OutputTimeForInputs(std::vector<int64>* input_times) const
|
double OutputTimeForInputs(std::vector<double>* input_times) const
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
int64 sum = 0;
|
double sum = 0;
|
||||||
for (auto& input : inputs_) {
|
for (auto& input : inputs_) {
|
||||||
sum += input->OutputTime(input_times);
|
// Inputs for which autotuning is disabled are excluded.
|
||||||
|
if (input->autotune()) {
|
||||||
|
sum += input->OutputTime(input_times);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the per-element output time for this node.
|
// Returns the per-element output time for this node.
|
||||||
virtual int64 OutputTimeLocked(std::vector<int64>* input_times) const
|
virtual double OutputTimeLocked(std::vector<double>* input_times) const
|
||||||
SHARED_LOCKS_REQUIRED(mu_) = 0;
|
SHARED_LOCKS_REQUIRED(mu_) = 0;
|
||||||
|
|
||||||
// Returns the sum of per-element processing time for the inputs of this node.
|
// Returns the sum of the per-element total processing times (i.e. of the
|
||||||
|
// subtrees rooted in each input) for the inputs of this node. Inputs for which
|
||||||
|
// autotuning is disabled do not contribute to the sum.
|
||||||
//
|
//
|
||||||
// TODO(jsimsa): use processing time history as a prior for future inputs
|
// TODO(jsimsa): use processing time history as a prior for future inputs
|
||||||
int64 ProcessingTimeForInputs() const SHARED_LOCKS_REQUIRED(mu_) {
|
double ProcessingTimeForInputs() const SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
int64 sum = 0;
|
// Accumulate in `double` so that fractional per-element times are not
|
||||||
|
// truncated before being returned.
|
||||||
|
double sum = 0;
|
||||||
for (auto& input : inputs_) {
|
for (auto& input : inputs_) {
|
||||||
sum += input->ProcessingTime();
|
// Inputs for which autotuning is disabled are excluded.
|
||||||
|
if (input->autotune()) {
|
||||||
|
// `TotalProcessingTime()` acquires the input's own lock and covers the
|
||||||
|
// input's entire subtree.
|
||||||
|
sum += input->TotalProcessingTime();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the per-element processing time spent in the subtree rooted in
|
// Returns the per-element processing time spent in this node.
|
||||||
// this node.
|
double SelfProcessingTimeLocked() const SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
virtual int64 ProcessingTimeLocked() const SHARED_LOCKS_REQUIRED(mu_) = 0;
|
if (num_elements_ == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return static_cast<double>(processing_time_) /
|
||||||
|
static_cast<double>(num_elements_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the per-element CPU time spent in the subtree rooted in this node.
|
||||||
|
virtual double TotalProcessingTimeLocked() const
|
||||||
|
SHARED_LOCKS_REQUIRED(mu_) = 0;
|
||||||
|
|
||||||
mutable mutex mu_;
|
mutable mutex mu_;
|
||||||
const int64 id_;
|
const int64 id_;
|
||||||
const string name_;
|
const string name_;
|
||||||
|
|
||||||
|
// Indicates whether the subtree rooted in this node should be included in
|
||||||
|
// autotuning. In particular, if this is `false`, then the subtree is excluded
|
||||||
|
// from computation of output time and processing time.
|
||||||
|
bool autotune_ GUARDED_BY(mu_) = true;
|
||||||
int64 buffered_bytes_ GUARDED_BY(mu_) = 0;
|
int64 buffered_bytes_ GUARDED_BY(mu_) = 0;
|
||||||
int64 processing_time_ GUARDED_BY(mu_) = 0;
|
int64 processing_time_ GUARDED_BY(mu_) = 0;
|
||||||
int64 num_elements_ GUARDED_BY(mu_) = 0;
|
int64 num_elements_ GUARDED_BY(mu_) = 0;
|
||||||
std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
|
std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
|
||||||
std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
|
std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
|
||||||
|
|
||||||
|
// Inputs of this node. These can represent an iterator created from the input
|
||||||
|
// dataset but also other input iterators (e.g. created by the user-defined
|
||||||
|
// functions of `flat_map` or `interleave`).
|
||||||
std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
|
std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
|
||||||
|
|
||||||
// The reference to the output node is not owned so that deletion of a
|
// The reference to the output node is not owned so that deletion of a
|
||||||
@ -421,7 +484,7 @@ class Model {
|
|||||||
int64 OutputTime(std::shared_ptr<Node> node);
|
int64 OutputTime(std::shared_ptr<Node> node);
|
||||||
|
|
||||||
// Collects the processing time for the given node.
|
// Collects the processing time for the given node.
|
||||||
int64 ProcessingTime(std::shared_ptr<Node> node);
|
int64 TotalProcessingTime(std::shared_ptr<Node> node);
|
||||||
|
|
||||||
// Used for coordination between different input pipeline threads. Exclusive
|
// Used for coordination between different input pipeline threads. Exclusive
|
||||||
// access is required only when adding or removing nodes. Concurrent access to
|
// access is required only when adding or removing nodes. Concurrent access to
|
||||||
|
@ -25,11 +25,11 @@ namespace model {
|
|||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
class AsyncInterleaveManyTest
|
class AsyncInterleaveManyTest
|
||||||
: public ::testing::TestWithParam<std::tuple<int64, int64>> {};
|
: public ::testing::TestWithParam<std::tuple<int64, double>> {};
|
||||||
|
|
||||||
TEST_P(AsyncInterleaveManyTest, Model) {
|
TEST_P(AsyncInterleaveManyTest, Model) {
|
||||||
const int64 parallelism = std::get<0>(GetParam());
|
const int64 parallelism = std::get<0>(GetParam());
|
||||||
const int64 input_time = std::get<1>(GetParam());
|
const double input_time = std::get<1>(GetParam());
|
||||||
std::shared_ptr<Node> async_interleave_many =
|
std::shared_ptr<Node> async_interleave_many =
|
||||||
model::MakeAsyncInterleaveManyNode(
|
model::MakeAsyncInterleaveManyNode(
|
||||||
{0, "async_interleave_many", nullptr},
|
{0, "async_interleave_many", nullptr},
|
||||||
@ -55,29 +55,29 @@ TEST_P(AsyncInterleaveManyTest, Model) {
|
|||||||
auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() {
|
auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() {
|
||||||
async_interleave_many->remove_input(source2);
|
async_interleave_many->remove_input(source2);
|
||||||
});
|
});
|
||||||
std::vector<int64> input_times(1, input_time);
|
std::vector<double> input_times(1, input_time);
|
||||||
async_interleave_many->add_processing_time(100);
|
async_interleave_many->add_processing_time(100);
|
||||||
EXPECT_EQ(async_interleave_many->processing_time(), 100);
|
EXPECT_EQ(async_interleave_many->processing_time(), 100);
|
||||||
EXPECT_EQ(async_interleave_many->ProcessingTime(), 0);
|
EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(async_interleave_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(async_interleave_many->OutputTime(&input_times), 0);
|
||||||
async_interleave_many->record_element();
|
async_interleave_many->record_element();
|
||||||
EXPECT_EQ(async_interleave_many->num_elements(), 1);
|
EXPECT_EQ(async_interleave_many->num_elements(), 1);
|
||||||
EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
|
EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 100);
|
||||||
EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
|
EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
|
||||||
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
||||||
source1->add_processing_time(200);
|
source1->add_processing_time(200);
|
||||||
source2->add_processing_time(300);
|
source2->add_processing_time(300);
|
||||||
EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
|
EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 100);
|
||||||
EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
|
EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
|
||||||
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(async_interleave_many->ProcessingTime(), 100 + 250);
|
EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 100 + 250);
|
||||||
EXPECT_LE(async_interleave_many->OutputTime(&input_times),
|
EXPECT_LE(async_interleave_many->OutputTime(&input_times),
|
||||||
100 + 250 / parallelism);
|
100 + 250 / parallelism);
|
||||||
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
||||||
async_interleave_many->record_element();
|
async_interleave_many->record_element();
|
||||||
EXPECT_EQ(async_interleave_many->ProcessingTime(), 50 + 250);
|
EXPECT_EQ(async_interleave_many->TotalProcessingTime(), 50 + 250);
|
||||||
EXPECT_LE(async_interleave_many->OutputTime(&input_times),
|
EXPECT_LE(async_interleave_many->OutputTime(&input_times),
|
||||||
50 + 250 / parallelism);
|
50 + 250 / parallelism);
|
||||||
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
|
||||||
@ -89,11 +89,11 @@ INSTANTIATE_TEST_SUITE_P(Test, AsyncInterleaveManyTest,
|
|||||||
200)));
|
200)));
|
||||||
|
|
||||||
class AsyncKnownRatioTest
|
class AsyncKnownRatioTest
|
||||||
: public ::testing::TestWithParam<std::tuple<int64, int64, int64>> {};
|
: public ::testing::TestWithParam<std::tuple<int64, double, int64>> {};
|
||||||
|
|
||||||
TEST_P(AsyncKnownRatioTest, Model) {
|
TEST_P(AsyncKnownRatioTest, Model) {
|
||||||
const int64 parallelism = std::get<0>(GetParam());
|
const int64 parallelism = std::get<0>(GetParam());
|
||||||
const int64 input_time = std::get<1>(GetParam());
|
const double input_time = std::get<1>(GetParam());
|
||||||
const int64 num_inputs_per_output = std::get<2>(GetParam());
|
const int64 num_inputs_per_output = std::get<2>(GetParam());
|
||||||
std::shared_ptr<Node> async_known_many = model::MakeAsyncKnownRatioNode(
|
std::shared_ptr<Node> async_known_many = model::MakeAsyncKnownRatioNode(
|
||||||
{0, "async_known_many", nullptr}, num_inputs_per_output,
|
{0, "async_known_many", nullptr}, num_inputs_per_output,
|
||||||
@ -107,50 +107,51 @@ TEST_P(AsyncKnownRatioTest, Model) {
|
|||||||
std::shared_ptr<Node> source2 =
|
std::shared_ptr<Node> source2 =
|
||||||
model::MakeSourceNode({2, "source2", async_known_many});
|
model::MakeSourceNode({2, "source2", async_known_many});
|
||||||
async_known_many->add_input(source2);
|
async_known_many->add_input(source2);
|
||||||
std::vector<int64> input_times(1, input_time);
|
std::vector<double> input_times(1, input_time);
|
||||||
source1->add_processing_time(100);
|
source1->add_processing_time(100);
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(), 0);
|
EXPECT_EQ(async_known_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
|
||||||
source2->add_processing_time(200);
|
source2->add_processing_time(200);
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(), 0);
|
EXPECT_EQ(async_known_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(), num_inputs_per_output * 100);
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
|
num_inputs_per_output * 100);
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * 100);
|
num_inputs_per_output * 100);
|
||||||
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(),
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (100 + 200));
|
num_inputs_per_output * (100 + 200));
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (100 + 200));
|
num_inputs_per_output * (100 + 200));
|
||||||
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(),
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 200));
|
num_inputs_per_output * (50 + 200));
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 200));
|
num_inputs_per_output * (50 + 200));
|
||||||
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(),
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 100));
|
num_inputs_per_output * (50 + 100));
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100));
|
num_inputs_per_output * (50 + 100));
|
||||||
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
||||||
async_known_many->add_processing_time(128);
|
async_known_many->add_processing_time(128);
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(),
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 100));
|
num_inputs_per_output * (50 + 100));
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100));
|
num_inputs_per_output * (50 + 100));
|
||||||
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
||||||
async_known_many->record_element();
|
async_known_many->record_element();
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(),
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 100) + 128);
|
num_inputs_per_output * (50 + 100) + 128);
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100) + 128 / parallelism);
|
num_inputs_per_output * (50 + 100) + 128 / parallelism);
|
||||||
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
|
||||||
async_known_many->record_element();
|
async_known_many->record_element();
|
||||||
EXPECT_EQ(async_known_many->ProcessingTime(),
|
EXPECT_EQ(async_known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 100) + 64);
|
num_inputs_per_output * (50 + 100) + 64);
|
||||||
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
EXPECT_LE(async_known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100) + 64 / parallelism);
|
num_inputs_per_output * (50 + 100) + 64 / parallelism);
|
||||||
@ -174,25 +175,25 @@ TEST(InterleaveManyTest, Model) {
|
|||||||
std::shared_ptr<Node> source2 =
|
std::shared_ptr<Node> source2 =
|
||||||
model::MakeSourceNode({2, "source2", interleave_many});
|
model::MakeSourceNode({2, "source2", interleave_many});
|
||||||
interleave_many->add_input(source2);
|
interleave_many->add_input(source2);
|
||||||
std::vector<int64> input_times(1, 0);
|
std::vector<double> input_times(1, 0);
|
||||||
interleave_many->add_processing_time(100);
|
interleave_many->add_processing_time(100);
|
||||||
EXPECT_EQ(interleave_many->processing_time(), 100);
|
EXPECT_EQ(interleave_many->processing_time(), 100);
|
||||||
EXPECT_EQ(interleave_many->ProcessingTime(), 0);
|
EXPECT_EQ(interleave_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(interleave_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(interleave_many->OutputTime(&input_times), 0);
|
||||||
interleave_many->record_element();
|
interleave_many->record_element();
|
||||||
EXPECT_EQ(interleave_many->num_elements(), 1);
|
EXPECT_EQ(interleave_many->num_elements(), 1);
|
||||||
EXPECT_EQ(interleave_many->ProcessingTime(), 100);
|
EXPECT_EQ(interleave_many->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
|
EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
|
||||||
source1->add_processing_time(200);
|
source1->add_processing_time(200);
|
||||||
source2->add_processing_time(300);
|
source2->add_processing_time(300);
|
||||||
EXPECT_EQ(interleave_many->ProcessingTime(), 100);
|
EXPECT_EQ(interleave_many->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
|
EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(interleave_many->ProcessingTime(), 350);
|
EXPECT_EQ(interleave_many->TotalProcessingTime(), 350);
|
||||||
EXPECT_EQ(interleave_many->OutputTime(&input_times), 350);
|
EXPECT_EQ(interleave_many->OutputTime(&input_times), 350);
|
||||||
interleave_many->record_element();
|
interleave_many->record_element();
|
||||||
EXPECT_EQ(interleave_many->ProcessingTime(), 300);
|
EXPECT_EQ(interleave_many->TotalProcessingTime(), 300);
|
||||||
EXPECT_EQ(interleave_many->OutputTime(&input_times), 300);
|
EXPECT_EQ(interleave_many->OutputTime(&input_times), 300);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,39 +209,43 @@ TEST_P(KnownRatioTest, Model) {
|
|||||||
std::shared_ptr<Node> source2 =
|
std::shared_ptr<Node> source2 =
|
||||||
model::MakeSourceNode({2, "source2", known_many});
|
model::MakeSourceNode({2, "source2", known_many});
|
||||||
known_many->add_input(source2);
|
known_many->add_input(source2);
|
||||||
std::vector<int64> input_times(1, 0);
|
std::vector<double> input_times(1, 0);
|
||||||
source1->add_processing_time(100);
|
source1->add_processing_time(100);
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), 0);
|
EXPECT_EQ(known_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(known_many->OutputTime(&input_times), 0);
|
||||||
source2->add_processing_time(200);
|
source2->add_processing_time(200);
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), 0);
|
EXPECT_EQ(known_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(known_many->OutputTime(&input_times), 0);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * 100);
|
EXPECT_EQ(known_many->TotalProcessingTime(), num_inputs_per_output * 100);
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times), num_inputs_per_output * 100);
|
EXPECT_EQ(known_many->OutputTime(&input_times), num_inputs_per_output * 100);
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (100 + 200));
|
EXPECT_EQ(known_many->TotalProcessingTime(),
|
||||||
|
num_inputs_per_output * (100 + 200));
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times),
|
EXPECT_EQ(known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (100 + 200));
|
num_inputs_per_output * (100 + 200));
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 200));
|
EXPECT_EQ(known_many->TotalProcessingTime(),
|
||||||
|
num_inputs_per_output * (50 + 200));
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times),
|
EXPECT_EQ(known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 200));
|
num_inputs_per_output * (50 + 200));
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
|
EXPECT_EQ(known_many->TotalProcessingTime(),
|
||||||
|
num_inputs_per_output * (50 + 100));
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times),
|
EXPECT_EQ(known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100));
|
num_inputs_per_output * (50 + 100));
|
||||||
known_many->add_processing_time(128);
|
known_many->add_processing_time(128);
|
||||||
EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
|
EXPECT_EQ(known_many->TotalProcessingTime(),
|
||||||
|
num_inputs_per_output * (50 + 100));
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times),
|
EXPECT_EQ(known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100));
|
num_inputs_per_output * (50 + 100));
|
||||||
known_many->record_element();
|
known_many->record_element();
|
||||||
EXPECT_EQ(known_many->ProcessingTime(),
|
EXPECT_EQ(known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 100) + 128);
|
num_inputs_per_output * (50 + 100) + 128);
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times),
|
EXPECT_EQ(known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100) + 128);
|
num_inputs_per_output * (50 + 100) + 128);
|
||||||
known_many->record_element();
|
known_many->record_element();
|
||||||
EXPECT_EQ(known_many->ProcessingTime(),
|
EXPECT_EQ(known_many->TotalProcessingTime(),
|
||||||
num_inputs_per_output * (50 + 100) + 64);
|
num_inputs_per_output * (50 + 100) + 64);
|
||||||
EXPECT_EQ(known_many->OutputTime(&input_times),
|
EXPECT_EQ(known_many->OutputTime(&input_times),
|
||||||
num_inputs_per_output * (50 + 100) + 64);
|
num_inputs_per_output * (50 + 100) + 64);
|
||||||
@ -250,18 +255,18 @@ INSTANTIATE_TEST_SUITE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4));
|
|||||||
|
|
||||||
TEST(SourceTest, Model) {
|
TEST(SourceTest, Model) {
|
||||||
std::shared_ptr<Node> source = model::MakeSourceNode({0, "source", nullptr});
|
std::shared_ptr<Node> source = model::MakeSourceNode({0, "source", nullptr});
|
||||||
std::vector<int64> input_times(1, 0);
|
std::vector<double> input_times(1, 0);
|
||||||
source->add_processing_time(100);
|
source->add_processing_time(100);
|
||||||
EXPECT_EQ(source->processing_time(), 100);
|
EXPECT_EQ(source->processing_time(), 100);
|
||||||
EXPECT_EQ(source->ProcessingTime(), 0);
|
EXPECT_EQ(source->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(source->OutputTime(&input_times), 0);
|
EXPECT_EQ(source->OutputTime(&input_times), 0);
|
||||||
source->record_element();
|
source->record_element();
|
||||||
EXPECT_EQ(source->num_elements(), 1);
|
EXPECT_EQ(source->num_elements(), 1);
|
||||||
EXPECT_EQ(source->ProcessingTime(), 100);
|
EXPECT_EQ(source->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(source->OutputTime(&input_times), 100);
|
EXPECT_EQ(source->OutputTime(&input_times), 100);
|
||||||
source->record_element();
|
source->record_element();
|
||||||
EXPECT_EQ(source->num_elements(), 2);
|
EXPECT_EQ(source->num_elements(), 2);
|
||||||
EXPECT_EQ(source->ProcessingTime(), 50);
|
EXPECT_EQ(source->TotalProcessingTime(), 50);
|
||||||
EXPECT_EQ(source->OutputTime(&input_times), 50);
|
EXPECT_EQ(source->OutputTime(&input_times), 50);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -274,25 +279,25 @@ TEST(UnknownRatioTest, Model) {
|
|||||||
std::shared_ptr<Node> source2 =
|
std::shared_ptr<Node> source2 =
|
||||||
model::MakeSourceNode({2, "source2", unknown_many});
|
model::MakeSourceNode({2, "source2", unknown_many});
|
||||||
unknown_many->add_input(source2);
|
unknown_many->add_input(source2);
|
||||||
std::vector<int64> input_times(1, 0);
|
std::vector<double> input_times(1, 0);
|
||||||
unknown_many->add_processing_time(100);
|
unknown_many->add_processing_time(100);
|
||||||
EXPECT_EQ(unknown_many->processing_time(), 100);
|
EXPECT_EQ(unknown_many->processing_time(), 100);
|
||||||
EXPECT_EQ(unknown_many->ProcessingTime(), 0);
|
EXPECT_EQ(unknown_many->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(unknown_many->OutputTime(&input_times), 0);
|
EXPECT_EQ(unknown_many->OutputTime(&input_times), 0);
|
||||||
unknown_many->record_element();
|
unknown_many->record_element();
|
||||||
EXPECT_EQ(unknown_many->num_elements(), 1);
|
EXPECT_EQ(unknown_many->num_elements(), 1);
|
||||||
EXPECT_EQ(unknown_many->ProcessingTime(), 100);
|
EXPECT_EQ(unknown_many->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
|
EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
|
||||||
source1->add_processing_time(100);
|
source1->add_processing_time(100);
|
||||||
source2->add_processing_time(200);
|
source2->add_processing_time(200);
|
||||||
EXPECT_EQ(unknown_many->ProcessingTime(), 100);
|
EXPECT_EQ(unknown_many->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
|
EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(unknown_many->ProcessingTime(), 400);
|
EXPECT_EQ(unknown_many->TotalProcessingTime(), 400);
|
||||||
EXPECT_EQ(unknown_many->OutputTime(&input_times), 400);
|
EXPECT_EQ(unknown_many->OutputTime(&input_times), 400);
|
||||||
unknown_many->record_element();
|
unknown_many->record_element();
|
||||||
EXPECT_EQ(unknown_many->ProcessingTime(), 200);
|
EXPECT_EQ(unknown_many->TotalProcessingTime(), 200);
|
||||||
EXPECT_EQ(unknown_many->OutputTime(&input_times), 200);
|
EXPECT_EQ(unknown_many->OutputTime(&input_times), 200);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -305,36 +310,36 @@ TEST(UnknownTest, Model) {
|
|||||||
std::shared_ptr<Node> source2 =
|
std::shared_ptr<Node> source2 =
|
||||||
model::MakeSourceNode({2, "source2", unknown});
|
model::MakeSourceNode({2, "source2", unknown});
|
||||||
unknown->add_input(source2);
|
unknown->add_input(source2);
|
||||||
std::vector<int64> input_times(1, 0);
|
std::vector<double> input_times(1, 0);
|
||||||
source1->add_processing_time(100);
|
source1->add_processing_time(100);
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 0);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 0);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 0);
|
||||||
source2->add_processing_time(100);
|
source2->add_processing_time(100);
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 0);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 0);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 0);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 0);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 100);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 200);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 200);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 200);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 200);
|
||||||
source1->record_element();
|
source1->record_element();
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 150);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 150);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 150);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 150);
|
||||||
source2->record_element();
|
source2->record_element();
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 100);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
||||||
// Unknown node processing time should not affect its ProcessingTime() or
|
// Unknown node processing time should not affect its TotalProcessingTime() or
|
||||||
// OutputTime().
|
// OutputTime().
|
||||||
unknown->add_processing_time(100);
|
unknown->add_processing_time(100);
|
||||||
EXPECT_EQ(unknown->processing_time(), 100);
|
EXPECT_EQ(unknown->processing_time(), 100);
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 100);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
||||||
// Unknown node number of elements should not affect its ProcessingTime() or
|
// Unknown node number of elements should not affect its TotalProcessingTime()
|
||||||
// OutputTime().
|
// or OutputTime().
|
||||||
unknown->record_element();
|
unknown->record_element();
|
||||||
EXPECT_EQ(unknown->num_elements(), 1);
|
EXPECT_EQ(unknown->num_elements(), 1);
|
||||||
EXPECT_EQ(unknown->ProcessingTime(), 100);
|
EXPECT_EQ(unknown->TotalProcessingTime(), 100);
|
||||||
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
EXPECT_EQ(unknown->OutputTime(&input_times), 100);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -350,12 +355,12 @@ class TestNode : public model::Node {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 OutputTimeLocked(std::vector<int64>* input_times) const override
|
double OutputTimeLocked(std::vector<double>* input_times) const override
|
||||||
SHARED_LOCKS_REQUIRED(mu_) {
|
SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
double TotalProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -252,7 +252,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
return model::MakeAsyncInterleaveManyNode(
|
return model::MakeAsyncInterleaveManyNode(
|
||||||
std::move(args),
|
std::move(args),
|
||||||
{model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
|
{model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
|
||||||
/*max=*/port::NumSchedulableCPUs())});
|
/*max=*/dataset()->cycle_length_)});
|
||||||
}
|
}
|
||||||
|
|
||||||
Status SaveInternal(IteratorStateWriter* writer) override {
|
Status SaveInternal(IteratorStateWriter* writer) override {
|
||||||
@ -462,6 +462,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
if (!future_elements_.empty()) {
|
if (!future_elements_.empty()) {
|
||||||
current_elements_[idx] = std::move(future_elements_.back());
|
current_elements_[idx] = std::move(future_elements_.back());
|
||||||
future_elements_.pop_back();
|
future_elements_.pop_back();
|
||||||
|
if (current_elements_[idx]->iterator) {
|
||||||
|
EnableAutotune(ctx.get(),
|
||||||
|
current_elements_[idx]->iterator.get());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
current_elements_[idx] = MakeElement(ctx);
|
current_elements_[idx] = MakeElement(ctx);
|
||||||
if (!current_elements_[idx]) {
|
if (!current_elements_[idx]) {
|
||||||
@ -480,9 +484,21 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
if (num_results > 0) {
|
if (num_results > 0) {
|
||||||
num_calls_++;
|
num_calls_++;
|
||||||
element->in_use = true;
|
element->in_use = true;
|
||||||
thread_pool_->Schedule(
|
thread_pool_->Schedule(std::bind(
|
||||||
std::bind(&ParallelInterleaveIterator::FetchResults, this,
|
&ParallelInterleaveIterator::FetchResults, this, ctx,
|
||||||
ctx, std::move(element), num_results));
|
std::move(element), num_results,
|
||||||
|
[this, ctx]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
|
||||||
|
--num_calls_;
|
||||||
|
const auto& stats_aggregator = ctx->stats_aggregator();
|
||||||
|
if (stats_aggregator) {
|
||||||
|
stats_aggregator->AddScalar(
|
||||||
|
stats_utils::ThreadUtilizationScalarName(
|
||||||
|
dataset()->node_name()),
|
||||||
|
static_cast<float>(num_calls_) /
|
||||||
|
static_cast<float>(num_parallel_calls_->value),
|
||||||
|
num_elements());
|
||||||
|
}
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -518,7 +534,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
// Fetches up to `dataset()->block_length_` results from `element`.
|
// Fetches up to `dataset()->block_length_` results from `element`.
|
||||||
void FetchResults(const std::shared_ptr<IteratorContext>& ctx,
|
void FetchResults(const std::shared_ptr<IteratorContext>& ctx,
|
||||||
const std::shared_ptr<Element>& element,
|
const std::shared_ptr<Element>& element,
|
||||||
int64 num_results) LOCKS_EXCLUDED(*mu_) {
|
int64 num_results, std::function<void()> done)
|
||||||
|
LOCKS_EXCLUDED(*mu_) {
|
||||||
RecordStart(ctx.get());
|
RecordStart(ctx.get());
|
||||||
auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
|
auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
|
||||||
bool end_of_input = false;
|
bool end_of_input = false;
|
||||||
@ -546,15 +563,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
element->inputs.clear();
|
element->inputs.clear();
|
||||||
--num_open_;
|
--num_open_;
|
||||||
}
|
}
|
||||||
--num_calls_;
|
done();
|
||||||
const auto& stats_aggregator = ctx->stats_aggregator();
|
|
||||||
if (stats_aggregator) {
|
|
||||||
stats_aggregator->AddScalar(
|
|
||||||
stats_utils::ThreadUtilizationScalarName(dataset()->node_name()),
|
|
||||||
static_cast<float>(num_calls_) /
|
|
||||||
static_cast<float>(num_parallel_calls_->value),
|
|
||||||
num_elements());
|
|
||||||
}
|
|
||||||
cond_var_->notify_all();
|
cond_var_->notify_all();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -566,9 +575,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
RecordStart(ctx.get());
|
RecordStart(ctx.get());
|
||||||
auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
|
auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
|
||||||
auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
|
auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
|
||||||
// TODO(jsimsa): Autotune the buffer size.
|
// TODO(jsimsa): Autotune the number of iterators to prefetch.
|
||||||
return num_calls_ >= num_parallel_calls_->value ||
|
return future_elements_.size() >= 2 * dataset()->cycle_length_;
|
||||||
future_elements_.size() >= 2 * dataset()->cycle_length_;
|
|
||||||
};
|
};
|
||||||
while (true) {
|
while (true) {
|
||||||
mutex_lock l(*mu_);
|
mutex_lock l(*mu_);
|
||||||
@ -595,20 +603,11 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
|
|||||||
if (!element->iterator) {
|
if (!element->iterator) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
++num_calls_;
|
DisableAutotune(ctx.get(), element->iterator.get());
|
||||||
element->in_use = true;
|
element->in_use = true;
|
||||||
thread_pool_->Schedule(
|
thread_pool_->Schedule(
|
||||||
std::bind(&ParallelInterleaveIterator::FetchResults, this, ctx,
|
std::bind(&ParallelInterleaveIterator::FetchResults, this, ctx,
|
||||||
std::move(element), dataset()->block_length_));
|
std::move(element), dataset()->block_length_, [] {}));
|
||||||
}
|
|
||||||
const auto& stats_aggregator = ctx->stats_aggregator();
|
|
||||||
if (stats_aggregator) {
|
|
||||||
stats_aggregator->AddScalar(
|
|
||||||
stats_utils::ThreadUtilizationScalarName(
|
|
||||||
dataset()->node_name()),
|
|
||||||
static_cast<float>(num_calls_) /
|
|
||||||
static_cast<float>(num_parallel_calls_->value),
|
|
||||||
num_elements());
|
|
||||||
}
|
}
|
||||||
cond_var_->notify_all();
|
cond_var_->notify_all();
|
||||||
}
|
}
|
||||||
|
@ -15,7 +15,6 @@ py_test(
|
|||||||
"//tensorflow/python:client_testlib",
|
"//tensorflow/python:client_testlib",
|
||||||
"//tensorflow/python:math_ops",
|
"//tensorflow/python:math_ops",
|
||||||
"//tensorflow/python:session",
|
"//tensorflow/python:session",
|
||||||
"//tensorflow/python/data/experimental/ops:batching",
|
|
||||||
"//tensorflow/python/data/experimental/ops:optimization",
|
"//tensorflow/python/data/experimental/ops:optimization",
|
||||||
"//tensorflow/python/data/ops:dataset_ops",
|
"//tensorflow/python/data/ops:dataset_ops",
|
||||||
"//third_party/py/numpy",
|
"//third_party/py/numpy",
|
||||||
|
@ -22,7 +22,6 @@ import time
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from tensorflow.python.client import session
|
from tensorflow.python.client import session
|
||||||
from tensorflow.python.data.experimental.ops import batching
|
|
||||||
from tensorflow.python.data.experimental.ops import optimization
|
from tensorflow.python.data.experimental.ops import optimization
|
||||||
from tensorflow.python.data.ops import dataset_ops
|
from tensorflow.python.data.ops import dataset_ops
|
||||||
from tensorflow.python.ops import math_ops
|
from tensorflow.python.ops import math_ops
|
||||||
@ -78,13 +77,12 @@ class AutotuneBenchmark(test.Benchmark):
|
|||||||
dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
|
dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
|
||||||
np.random.rand(4 * k,
|
np.random.rand(4 * k,
|
||||||
1))).repeat()
|
1))).repeat()
|
||||||
dataset = dataset.apply(
|
dataset = dataset.map(
|
||||||
batching.map_and_batch(
|
math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
|
||||||
math_ops.matmul,
|
dataset = dataset.batch(batch_size=batch_size)
|
||||||
num_parallel_calls=optimization.AUTOTUNE,
|
|
||||||
batch_size=batch_size))
|
|
||||||
options = dataset_ops.Options()
|
options = dataset_ops.Options()
|
||||||
options.experimental_optimization.apply_default_optimizations = False
|
options.experimental_optimization.apply_default_optimizations = False
|
||||||
|
options.experimental_optimization.map_and_batch_fusion = True
|
||||||
options.experimental_optimization.autotune = autotune
|
options.experimental_optimization.autotune = autotune
|
||||||
dataset = dataset.with_options(options)
|
dataset = dataset.with_options(options)
|
||||||
iterator = dataset_ops.make_one_shot_iterator(dataset)
|
iterator = dataset_ops.make_one_shot_iterator(dataset)
|
||||||
|
Loading…
Reference in New Issue
Block a user