Merge commit for internal changes

This commit is contained in:
Benoit Steiner 2017-05-08 19:31:28 -07:00
commit 78fe1944d9
129 changed files with 8280 additions and 1676 deletions

View File

@ -342,6 +342,7 @@ filegroup(
"//tensorflow/tensorboard/components/tf_globals:all_files",
"//tensorflow/tensorboard/components/tf_globals_d3v4:all_files",
"//tensorflow/tensorboard/components/tf_graph_common:all_files",
"//tensorflow/tensorboard/components/tf_graph_loader:all_files",
"//tensorflow/tensorboard/components/tf_histogram_dashboard:all_files",
"//tensorflow/tensorboard/components/tf_histogram_dashboard/demo:all_files",
"//tensorflow/tensorboard/components/tf_image_dashboard:all_files",

View File

@ -73,7 +73,7 @@ cc_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:compile_only_client",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service/cpu:cpu_compiler",
"//tensorflow/core:core_cpu",

View File

@ -27,7 +27,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/compile_only_client.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
#include "tensorflow/compiler/xla/shape_util.h"
@ -274,7 +274,8 @@ Status CreateXlaArgs(const Graph& graph,
// Converts the TensorFlow graph into an XLA computation, by executing the
// graph symbolically, with each op building up the XLA HLO.
Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
Status ConvertGraphToXla(xla::CompileOnlyClient* client,
std::unique_ptr<Graph> graph,
xla::Computation* computation, bool* has_context_arg) {
// Create a device and context to convert the graph into an XLA computation.
XlaOpRegistry::RegisterCompilationKernels();
@ -333,7 +334,8 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
}
// Compiles the XLA computation into executable code.
Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
Status CompileXla(xla::CompileOnlyClient* client,
const xla::Computation& computation,
const xla::cpu::CpuAotCompilationOptions& aot_opts,
CompileResult* compile_result) {
// Retrieves arg and result layouts from the computation.
@ -350,7 +352,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
for (int i = 0; i < pshape->parameters_size(); ++i) {
arg_layouts.push_back(pshape->mutable_parameters(i));
}
xla::LocalClient::AheadOfTimeComputationInstance instance;
xla::CompileOnlyClient::AotComputationInstance instance;
instance.computation = &computation;
instance.argument_layouts = std::move(arg_layouts);
instance.result_layout = &pshape->result();
@ -365,7 +367,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
std::move(aot_or.ValueOrDie().back()));
compile_result->entry_point = aot_opts.entry_point_name();
compile_result->pointer_size =
xla::LocalClient::PointerSizeForTriple(aot_opts.triple());
xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple());
return Status::OK();
}
@ -394,8 +396,9 @@ Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
namespace gpu = perftools::gputools;
gpu::Platform* cpu_platform =
gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
xla::LocalClient* client =
xla::ClientLibrary::GetOrCreateLocalClient(cpu_platform).ValueOrDie();
xla::CompileOnlyClient* client =
xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
.ValueOrDie();
xla::Computation computation;
TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), &computation,
&compile_result->has_context_arg));

View File

@ -99,6 +99,26 @@ cc_library(
],
)
cc_library(
name = "compile_only_client",
srcs = ["compile_only_client.cc"],
hdrs = ["compile_only_client.h"],
deps = [
":client",
":computation",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:compile_only_service",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"@llvm//:support",
],
)
# This target is used to instantiate the XLA service in-process and create
# a client for it.
cc_library(
@ -106,12 +126,14 @@ cc_library(
srcs = ["client_library.cc"],
hdrs = ["client_library.h"],
deps = [
":compile_only_client",
":local_client",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:backend",
"//tensorflow/compiler/xla/service:compile_only_service",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:platform_util",

View File

@ -69,8 +69,8 @@ ClientLibrary::~ClientLibrary() = default;
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}
auto it = client_library.instances_.find(platform->id());
if (it != client_library.instances_.end()) {
auto it = client_library.local_instances_.find(platform->id());
if (it != client_library.local_instances_.end()) {
return it->second->client.get();
}
@ -78,13 +78,13 @@ ClientLibrary::~ClientLibrary() = default;
service_options.set_platform(platform);
service_options.set_number_of_replicas(replica_count);
std::unique_ptr<LocalInstance> instance = MakeUnique<LocalInstance>();
auto instance = MakeUnique<LocalInstance>();
TF_ASSIGN_OR_RETURN(instance->service,
LocalService::NewService(service_options));
instance->client = MakeUnique<LocalClient>(instance->service.get());
LocalClient* cl = instance->client.get();
client_library.instances_.insert(
client_library.local_instances_.insert(
std::make_pair(platform->id(), std::move(instance)));
return cl;
}
@ -99,9 +99,35 @@ ClientLibrary::~ClientLibrary() = default;
perftools::gputools::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
auto it = client_library.instances_.find(platform->id());
CHECK(it != client_library.instances_.end());
auto it = client_library.local_instances_.find(platform->id());
CHECK(it != client_library.local_instances_.end());
return it->second->service.get();
}
/* static */ StatusOr<CompileOnlyClient*>
ClientLibrary::GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}
auto it = client_library.compile_only_instances_.find(platform->id());
if (it != client_library.compile_only_instances_.end()) {
return it->second->client.get();
}
auto instance = MakeUnique<CompileOnlyInstance>();
TF_ASSIGN_OR_RETURN(instance->service,
CompileOnlyService::NewService(platform));
instance->client = MakeUnique<CompileOnlyClient>(instance->service.get());
CompileOnlyClient* cl = instance->client.get();
client_library.compile_only_instances_.insert(
std::make_pair(platform->id(), std::move(instance)));
return cl;
}
} // namespace xla

View File

@ -26,7 +26,9 @@ limitations under the License.
#include <string>
#include <vector>
#include "tensorflow/compiler/xla/client/compile_only_client.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/service/compile_only_service.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/statusor.h"
@ -76,6 +78,13 @@ class ClientLibrary {
// access user computations from client.
static LocalService* GetXlaService(perftools::gputools::Platform* platform);
// Singleton constructor-or-accessor for compile-only clients. Arguments:
//
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform = nullptr);
private:
// Returns the singleton instance of ClientLibrary.
static ClientLibrary& Singleton();
@ -90,10 +99,21 @@ class ClientLibrary {
std::unique_ptr<LocalClient> client;
};
struct CompileOnlyInstance {
// Service that is wrapped by the singleton client object.
std::unique_ptr<CompileOnlyService> service;
// Singleton client object.
std::unique_ptr<CompileOnlyClient> client;
};
tensorflow::mutex service_mutex_; // Guards the singleton creation state.
std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<LocalInstance>>
instances_ GUARDED_BY(service_mutex_);
local_instances_ GUARDED_BY(service_mutex_);
std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<CompileOnlyInstance>>
compile_only_instances_ GUARDED_BY(service_mutex_);
TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
};

View File

@ -0,0 +1,59 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/client/compile_only_client.h"
#include "external/llvm/include/llvm/ADT/Triple.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
namespace se = ::perftools::gputools;
namespace xla {
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileOnlyClient::CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
const AotCompilationOptions& options) {
std::vector<CompileOnlyService::AotComputationInstance> service_instances;
service_instances.reserve(computations.size());
for (const AotComputationInstance& instance : computations) {
service_instances.push_back({});
CompileOnlyService::AotComputationInstance& service_instance =
service_instances.back();
TF_RET_CHECK(instance.computation != nullptr);
service_instance.computation = instance.computation->handle();
service_instance.argument_layouts = instance.argument_layouts;
service_instance.result_layout = instance.result_layout;
}
return compiler_service_->CompileAheadOfTime(service_instances, options);
}
int64 CompileOnlyClient::PointerSizeForTriple(
tensorflow::StringPiece target_triple) {
llvm::Triple triple(
llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
if (triple.isArch64Bit()) {
return 8;
} else if (triple.isArch32Bit()) {
return 4;
} else {
CHECK(triple.isArch16Bit());
return 2;
}
}
} // namespace xla

View File

@ -0,0 +1,66 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/computation.h"
#include "tensorflow/compiler/xla/service/compile_only_service.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
// An XLA Client specialization for doing ahead-of-time compilation. This does
// not require (or attempt to instantiate) an execution-capable backend for the
// relevant platform.
class CompileOnlyClient : public Client {
public:
explicit CompileOnlyClient(CompileOnlyService* service)
: Client(service), compiler_service_(service) {}
CompileOnlyClient(const CompileOnlyClient&) = delete;
void operator=(const CompileOnlyClient&) = delete;
// A description of a computation to compile using CompileAheadOfTime.
struct AotComputationInstance {
const Computation* computation;
// Inform the compiler of the expected layout for arguments.
std::vector<const Shape*> argument_layouts;
// Specifies the expected result layout.
const Shape* result_layout;
};
// Compiles a list of computations for ahead-of-time execution. This is
// intended for use in static compilation. The |options| parameter describes
// the target for which the compiler should emit code.
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
const AotCompilationOptions& options);
// Returns the size of a pointer in bytes for a given triple.
static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
private:
CompileOnlyService* compiler_service_;
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_

View File

@ -23,13 +23,15 @@ limitations under the License.
namespace xla {
// Wraps a GlobalDataHandle with a lifetime.
// A GlobalData object represents a globally-accessible allocation of
// data in the associated XLA service.
class GlobalData {
public:
// Gives ownership of the global data handle to this object.
GlobalData(ServiceInterface* parent, GlobalDataHandle handle);
// Unregisters the wrapped handle.
// Unregisters the wrapped handle, which causes the service to
// deallocate the associated data.
~GlobalData();
const GlobalDataHandle& handle() const { return handle_; }

View File

@ -176,10 +176,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
ExecutableRunOptions actual_options = options;
Backend::StreamPtr stream;
if (options.stream() == nullptr) {
TF_ASSIGN_OR_RETURN(
stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
Backend::StreamPtr stream,
BorrowStreamForDevice(options.device_ordinal(), backend_));
actual_options.set_stream(stream.get());
}
if (options.allocator() == nullptr) {
@ -261,38 +261,6 @@ tensorflow::Status LocalClient::ResolveArguments(
argument_ptrs);
}
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
LocalClient::CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
computations,
const AotCompilationOptions& options) {
std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
service_instances.reserve(computations.size());
for (const AheadOfTimeComputationInstance& instance : computations) {
service_instances.push_back({});
LocalService::AheadOfTimeComputationInstance& service_instance =
service_instances.back();
TF_RET_CHECK(instance.computation != nullptr);
service_instance.computation = instance.computation->handle();
service_instance.argument_layouts = instance.argument_layouts;
service_instance.result_layout = instance.result_layout;
}
return local_service_->CompileAheadOfTime(service_instances, options);
}
int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
llvm::Triple triple(
llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
if (triple.isArch64Bit()) {
return 8;
} else if (triple.isArch32Bit()) {
return 4;
} else {
CHECK(triple.isArch16Bit());
return 2;
}
}
se::Platform* LocalClient::platform() const {
return local_service_->backend().platform();
}

View File

@ -148,7 +148,7 @@ class LocalExecutable {
const ExecutableBuildOptions& build_options_;
};
// An XLA service client object for use when the client and service run in
// An XLA Client specialization for use when the client and service run in
// the same process.
class LocalClient : public Client {
public:
@ -182,30 +182,6 @@ class LocalClient : public Client {
const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
const ExecutableBuildOptions& options);
// A description of a computation to compile using CompileAheadOfTime.
struct AheadOfTimeComputationInstance {
const Computation* computation;
// Inform the compiler of the expected layout for arguments.
std::vector<const Shape*> argument_layouts;
// Specifies the expected result layout.
const Shape* result_layout;
};
// Compiles a list of computations for ahead-of-time execution. This is
// intended for use in static compilation. The |options| parameter describes
// the target for which the compiler should emit code.
//
// TODO(b/31222190): This doesn't really belong in LocalClient. Move it to its
// own library.
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
computations,
const AotCompilationOptions& options);
// Returns the size of a pointer in bytes for a given triple.
static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
// Returns the platform that the underlying service targets.
perftools::gputools::Platform* platform() const;

View File

@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/literal_util.h"
#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>
#include <numeric>
@ -308,37 +309,16 @@ template <typename T, typename WT>
/* static */ std::unique_ptr<Literal> LiteralUtil::Relayout(
const Literal& original, const Layout& layout) {
// Note: if this were a performance bottleneck, we avoid cloning and just make
// an uninitialized array instead, since all values are clobbered below.
std::unique_ptr<Literal> result = CloneToUnique(original);
*result->mutable_shape()->mutable_layout() = layout;
const PrimitiveType primitive_type = original.shape().element_type();
switch (primitive_type) {
case F32:
LiteralUtil::EachCell<float>(
original,
[&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
LiteralUtil::Set<float>(result.get(), indices, value);
});
return result;
case S32:
LiteralUtil::EachCell<int32>(
original,
[&](tensorflow::gtl::ArraySlice<int64> indices, int32 value) {
LiteralUtil::Set<int32>(result.get(), indices, value);
});
return result;
case U32:
LiteralUtil::EachCell<uint32>(
original,
[&](tensorflow::gtl::ArraySlice<int64> indices, uint32 value) {
LiteralUtil::Set<uint32>(result.get(), indices, value);
});
return result;
default:
LOG(FATAL) << "not yet implemented: "
<< PrimitiveType_Name(primitive_type);
}
const Shape& shape = original.shape();
std::vector<int64> base(ShapeUtil::Rank(shape), 0);
std::vector<int64> copy_size(shape.dimensions().begin(),
shape.dimensions().end());
TF_CHECK_OK(Copy(original, base, result.get(), base, copy_size));
return result;
}
/* static */ StatusOr<std::unique_ptr<Literal>> LiteralUtil::Reshape(
@ -346,25 +326,19 @@ template <typename T, typename WT>
if (ShapeUtil::IsTuple(input.shape())) {
return InvalidArgument("Reshape does not support tuples.");
}
std::unique_ptr<Literal> output;
if (!LayoutUtil::IsMonotonicWithDim0Major(input.shape().layout())) {
return Unimplemented(
"Input shape must have a monotonic layout where dimension 0 is major, "
"was: %s",
LayoutUtil::HumanString(input.shape().layout()).c_str());
std::vector<int64> minor_to_major(ShapeUtil::Rank(input.shape()));
std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
static_cast<int64>(0));
output = Relayout(input, LayoutUtil::MakeLayout(minor_to_major));
} else {
output = CloneToUnique(input);
}
std::vector<int64> layout(dimensions.size());
std::iota(layout.rbegin(), layout.rend(), 0);
// Because the layout is monotonic, we can simply reuse the same sequence of
// values without changing their order.
std::unique_ptr<Literal> output = CloneToUnique(input);
output->clear_shape();
output->mutable_shape()->set_element_type(input.shape().element_type());
for (int64 dimension : dimensions) {
output->mutable_shape()->add_dimensions(dimension);
}
*output->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout(layout);
*output->mutable_shape() =
ShapeUtil::MakeShape(input.shape().element_type(), dimensions);
int64 elements_before = ShapeUtil::ElementsIn(input.shape());
int64 elements_after = ShapeUtil::ElementsIn(output->shape());
@ -378,73 +352,42 @@ template <typename T, typename WT>
return std::move(output);
}
namespace {
template <class T>
void TransposeLiteralInternal(const Literal& original,
tensorflow::gtl::ArraySlice<int64> permutation,
Literal* result) {
std::vector<int64> new_indices(ShapeUtil::Rank(original.shape()));
LiteralUtil::EachCell<T>(
original, [&](tensorflow::gtl::ArraySlice<int64> indices, T value) {
for (int64 i = 0; i < indices.size(); ++i) {
new_indices[i] = indices[permutation[i]];
}
LiteralUtil::Set<T>(result, new_indices, value);
});
}
} // namespace
/* static */ std::unique_ptr<Literal> LiteralUtil::Transpose(
const Literal& original, tensorflow::gtl::ArraySlice<int64> permutation) {
CHECK(!ShapeUtil::IsTuple(original.shape()))
<< "tuple is not supported for transpose";
std::vector<int64> dimension_numbers(ShapeUtil::Rank(original.shape()));
std::iota(dimension_numbers.begin(), dimension_numbers.end(), 0);
CHECK(std::is_permutation(permutation.begin(), permutation.end(),
dimension_numbers.begin()))
<< "given permutation is not a permutation of dimension numbers";
std::vector<int64> new_dimension_sizes;
for (const int64 dim : permutation) {
new_dimension_sizes.push_back(original.shape().dimensions(dim));
}
const auto result_shape = ShapeUtil::MakeShape(
original.shape().element_type(), new_dimension_sizes);
std::unique_ptr<Literal> result = CloneToUnique(original);
*result->mutable_shape() = result_shape;
const PrimitiveType primitive_type = original.shape().element_type();
switch (primitive_type) {
case F32:
TransposeLiteralInternal<float>(original, permutation, result.get());
return result;
case F64:
TransposeLiteralInternal<double>(original, permutation, result.get());
return result;
case PRED:
TransposeLiteralInternal<bool>(original, permutation, result.get());
return result;
case S8:
TransposeLiteralInternal<int8>(original, permutation, result.get());
return result;
case U8:
TransposeLiteralInternal<uint8>(original, permutation, result.get());
return result;
case S32:
TransposeLiteralInternal<int32>(original, permutation, result.get());
return result;
case U32:
TransposeLiteralInternal<uint32>(original, permutation, result.get());
return result;
case S64:
TransposeLiteralInternal<int64>(original, permutation, result.get());
return result;
case U64:
TransposeLiteralInternal<uint64>(original, permutation, result.get());
return result;
default:
LOG(FATAL) << "not yet implemented: "
<< PrimitiveType_Name(primitive_type);
<< "Tuple is not supported for transpose";
CHECK(IsPermutation(permutation, ShapeUtil::Rank(original.shape())))
<< "Given permutation is not a permutation of dimension numbers";
// To transpose the array, we just permute the dimensions and layout, and
// do a straight memory copy of the raw data set.
// This is considerably faster than iterating over every array element using
// the EachCell<>() and Set<>() APIs.
std::vector<int64> inverse_permutation = InversePermutation(permutation);
Shape shape =
ShapeUtil::PermuteDimensions(inverse_permutation, original.shape());
// Replace the layout with one affine to the original shape, such that a
// transpose operation can be performed by leaving the flat values
// representation intact.
// For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation.
// The shape with affine layout resulting from that operation will be
// F32[8,11]{0,1}, since it leave the original most minor (the 8 sized), the
// most minor.
// Essentially, given MinMaj(Di) the position of the Di dimension within the
// minor to major vector, and given T(Di) the index that the original Di
// dimension has within the transposed array, a layout is affine if
// MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
// vector of the affine layout.
Layout* layout = shape.mutable_layout();
layout->clear_minor_to_major();
for (auto index : original.shape().layout().minor_to_major()) {
layout->add_minor_to_major(inverse_permutation[index]);
}
std::unique_ptr<Literal> new_literal = CreateFromShape(shape);
DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
ShapeUtil::ByteSizeOf(original.shape()));
std::memcpy(MutableInternalData(new_literal.get()), InternalData(original),
ShapeUtil::ByteSizeOf(original.shape()));
return new_literal;
}
/* static */ std::unique_ptr<Literal> LiteralUtil::Slice(
@ -793,47 +736,14 @@ void TransposeLiteralInternal(const Literal& original,
const Literal& literal,
const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
const string& value)>& per_cell) {
if (ShapeUtil::Rank(literal.shape()) == 1) {
for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
per_cell({i0}, GetAsString(literal, {i0}));
}
if (ShapeUtil::HasZeroElements(literal.shape())) {
return;
}
if (ShapeUtil::Rank(literal.shape()) == 2) {
for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
per_cell({i0, i1}, GetAsString(literal, {i0, i1}));
}
}
return;
}
if (ShapeUtil::Rank(literal.shape()) == 3) {
for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
per_cell({i0, i1, i2}, GetAsString(literal, {i0, i1, i2}));
}
}
}
return;
}
if (ShapeUtil::Rank(literal.shape()) == 4) {
for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
for (int64 i3 = 0; i3 < literal.shape().dimensions(3); ++i3) {
per_cell({i0, i1, i2, i3}, GetAsString(literal, {i0, i1, i2, i3}));
}
}
}
}
return;
}
LOG(FATAL) << "unhandled rank: " << ShapeUtil::Rank(literal.shape());
std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
literal.shape(), /*linear_index=*/0);
do {
per_cell(indices, GetAsString(literal, indices));
} while (IndexUtil::BumpIndices(literal.shape(), &indices));
}
namespace {

View File

@ -239,6 +239,11 @@ class LiteralUtil {
// Clones literal into an owned unique_ptr version.
static std::unique_ptr<Literal> CloneToUnique(const Literal& literal);
// Returns the linear index of the given index within the literal's
// element_type repeated field.
static int64 LinearIndex(const Literal& literal,
tensorflow::gtl::ArraySlice<int64> multi_index);
// Gets or sets an element in the literal at the given index. The index is
// CHECKed against the dimension sizes.
template <typename NativeT>
@ -427,11 +432,6 @@ class LiteralUtil {
"Cannot map native type to primitive type.");
}
// Returns the linear index of the given index within the literal's
// element_type repeated field.
static int64 LinearIndex(const Literal& literal,
tensorflow::gtl::ArraySlice<int64> multi_index);
// Internal template helper for the Copy() API, matching its arguments one by
// one.
//

View File

@ -469,6 +469,26 @@ TEST_F(LiteralUtilTest, ReshapeR4) {
EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
}
TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) {
// clang-format off
// F32[1x3x2x4]
auto original = LiteralUtil::CreateR4WithLayout<float>({{
{{10, 11, 12, 13}, {14, 15, 16, 17}},
{{18, 19, 20, 21}, {22, 23, 24, 25}},
{{26, 27, 28, 29}, {30, 31, 32, 33}},
}}, layout_r4_dim0minor_);
// F32[1x3x4x2]
auto expected = LiteralUtil::CreateR3WithLayout<float>({
{{10, 11}, {12, 13}, {14, 15}, {16, 17}},
{{18, 19}, {20, 21}, {22, 23}, {24, 25}},
{{26, 27}, {28, 29}, {30, 31}, {32, 33}},
}, layout_r3_dim0major_);
// clang-format on
auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
}
TEST_F(LiteralUtilTest, TransposeR0) {
auto original = LiteralUtil::CreateR0<float>(1.7f);
auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{});
@ -659,15 +679,15 @@ TEST_F(LiteralUtilTest, Copy) {
primitive_util::NativeToPrimitiveType<uint32>(), dimensions, layout);
auto blank = LiteralUtil::CreateFromShape(shape);
auto source = LiteralUtil::CreateFromShape(shape);
const int64 sbase[] = {0, 0, 0, 0};
const int64 incr[] = {1, 1, 1, 1};
const int64 zero_base[] = {0, 0, 0, 0};
const int64 step[] = {1, 1, 1, 1};
uint32 seqnr = 0;
auto init_proc = [&](const std::vector<int64>& indexes) {
LiteralUtil::Set(source.get(), indexes, ++seqnr);
return true;
};
ShapeUtil::ForEachIndex(source->shape(), sbase, dimensions, incr,
ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step,
init_proc);
const int64 src_base[] = {3, 1, 5, 7};
@ -691,7 +711,7 @@ TEST_F(LiteralUtilTest, Copy) {
bval == LiteralUtil::Get<uint32>(*source, source_indexes));
return matched;
};
ShapeUtil::ForEachIndex(source->shape(), sbase, copy_size, incr,
ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step,
check_proc);
EXPECT_TRUE(matched);
}
@ -710,5 +730,43 @@ TEST_F(LiteralUtilTest, CopyScalars) {
EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
}
TEST_F(LiteralUtilTest, Populate) {
struct PopulateData {
std::vector<int64> dimensions;
std::vector<int64> layout;
} populate_data[] = {
{{}, {}},
{{16}, {0}},
{{4, 16}, {1, 0}},
{{21, 12}, {0, 1}},
{{6, 11, 17}, {2, 0, 1}},
{{6, 11, 5, 17}, {3, 2, 0, 1}},
};
for (const auto& data : populate_data) {
Shape shape = ShapeUtil::MakeShapeWithLayout(
primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
data.layout);
auto literal = LiteralUtil::CreateFromShape(shape);
auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
// Offsets from linear index just to avoid R0 literals to be initialized
// with zero.
return LiteralUtil::LinearIndex(*literal, indexes) + 17;
};
TF_EXPECT_OK(LiteralUtil::Populate<uint32>(literal.get(), generator));
std::vector<int64> zero_base(data.dimensions.size(), 0);
std::vector<int64> step(data.dimensions.size(), 1);
bool matched = true;
auto check_function = [&](const std::vector<int64>& indexes) {
auto value = LiteralUtil::Get<uint32>(*literal, indexes);
matched = matched && (value == generator(indexes));
return matched;
};
ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
check_function);
EXPECT_TRUE(matched);
}
}
} // namespace
} // namespace xla

View File

@ -406,6 +406,27 @@ cc_library(
],
)
cc_library(
name = "compile_only_service",
srcs = ["compile_only_service.cc"],
hdrs = ["compile_only_service.h"],
deps = [
":backend",
":compiler",
":computation_layout",
":computation_tracker",
":platform_util",
":service",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
],
)
cc_library(
name = "cpu_plugin",
deps = [

View File

@ -0,0 +1,131 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/compile_only_service.h"
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/computation_tracker.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);
}
/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}
TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
CreateComputeConstantBackend());
std::unique_ptr<CompileOnlyService> service(
new CompileOnlyService(compiler, std::move(compute_constant_backend)));
return std::move(service);
}
CompileOnlyService::CompileOnlyService(
Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
: Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
compiler_(compiler) {
runs_in_client_process_ = true;
}
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileOnlyService::CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
const AotCompilationOptions& options) {
std::vector<std::unique_ptr<HloModule>> hlo_modules;
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
for (const AotComputationInstance& instance : computations) {
TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
computation_tracker_.Resolve(instance.computation));
VersionedComputationHandle versioned_handle =
user_computation->GetVersionedHandle();
// Dump computation proto state if flag is set.
legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
const string& directory_path = flags->xla_dump_computations_to;
if (!directory_path.empty()) {
TF_ASSIGN_OR_RETURN(
std::unique_ptr<SessionModule> session_module,
computation_tracker_.SnapshotComputation(versioned_handle.handle));
string filename = tensorflow::strings::StrCat(
"computation_", versioned_handle.handle.handle(), "__",
session_module->entry().name(), "__version_",
versioned_handle.version);
TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
*session_module));
}
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
computation_tracker_.BuildHloModule(
versioned_handle,
/*include_unreachable_instructions=*/true));
hlo_modules.push_back(std::move(hlo_module));
TF_ASSIGN_OR_RETURN(
std::shared_ptr<const ProgramShape> program_shape,
user_computation->ComputeProgramShape(versioned_handle.version));
module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
HloModuleConfig* module_config = module_configs.back().get();
auto* computation_layout =
module_config->mutable_entry_computation_layout();
if (flags->xla_hlo_profile) {
module_config->enable_hlo_profiling(true);
}
for (int i = 0; i < instance.argument_layouts.size(); ++i) {
const Shape& argument_layout = *instance.argument_layouts[i];
if (ShapeUtil::IsTuple(argument_layout)) {
return Unimplemented("tuple arguments not supported yet");
}
TF_RETURN_IF_ERROR(
computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
argument_layout));
}
TF_RETURN_IF_ERROR(
computation_layout->mutable_result_layout()->CopyLayoutFromShape(
*instance.result_layout));
}
return compiler_->CompileAheadOfTime(std::move(hlo_modules),
std::move(module_configs),
MakeHloDumper(), options);
}
} // namespace xla

View File

@ -0,0 +1,125 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/service.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
// An XLA Service specialization for ahead-of-time compilation. This only
// instantiates a Compiler object for the relevant platform; it does not
// instantiate or require an execution backend.
class CompileOnlyService : public Service {
public:
// Factory for creating a CompileOnlyService. The parameter platform is the
// platform that the service should target. If platform is null then the
// default platform is used.
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
perftools::gputools::Platform* platform);
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
const ServiceOptions& options);
// A description of a computation to compile using CompileAheadOfTime.
struct AotComputationInstance {
ComputationHandle computation;
std::vector<const Shape*> argument_layouts;
const Shape* result_layout = nullptr;
};
// Compiles a list of computations for ahead-of-time execution. This is
// intended for use in static compilation. See
// |CompileOnlyClient::CompileAheadOfTime| for additional details.
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
const AotCompilationOptions& Options);
// Override Service methods that require or imply the existence of an
// execute backend. Note that this does not include TransferToClient and
// TransferToClientInProcess, as computing contants produces global data
// that we may wish to transfer.
tensorflow::Status Execute(const ExecuteRequest* arg,
ExecuteResponse* result) override {
return Unimplemented("CompileOnlyService does not support execution.");
}
tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
ExecuteParallelResponse* result) override {
return Unimplemented("CompileOnlyService does not support execution.");
}
tensorflow::Status GetDeviceHandles(
const GetDeviceHandlesRequest* arg,
GetDeviceHandlesResponse* result) override {
return Unimplemented("CompileOnlyService does not support devices.");
}
tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
ExecuteAsyncResponse* result) override {
return Unimplemented("CompileOnlyService does not support execution.");
}
tensorflow::Status WaitForExecution(
const WaitForExecutionRequest* arg,
WaitForExecutionResponse* result) override {
return Unimplemented("CompileOnlyService does not support execution.");
}
tensorflow::Status TransferToServer(
const TransferToServerRequest* arg,
TransferToServerResponse* result) override {
return Unimplemented(
"CompileOnlyService does not support device data transfers.");
}
tensorflow::Status TransferToInfeed(
const TransferToInfeedRequest* arg,
TransferToInfeedResponse* result) override {
return Unimplemented(
"CompileOnlyService does not support device data transfers.");
}
tensorflow::Status TransferFromOutfeed(
const TransferFromOutfeedRequest* arg,
TransferFromOutfeedResponse* result) override {
return Unimplemented(
"CompileOnlyService does not support device data transfers.");
}
tensorflow::Status TransferToServerInProcess(
const TransferToServerInProcessRequest* arg,
TransferToServerInProcessResponse* result) override {
return Unimplemented(
"CompileOnlyService does not support device data transfers.");
}
tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
ResetDeviceResponse* result) override {
return Unimplemented("CompileOnlyService does not support devices.");
}
private:
explicit CompileOnlyService(
Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
CompileOnlyService(const CompileOnlyService&) = delete;
void operator=(const CompileOnlyService&) = delete;
// The compiler for the target platform. This is included in place of
// the Service::execute_backend_'s compiler, since execute_backend_ is a
// nullptr in CompileOnlyService.
Compiler* compiler_;
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_

View File

@ -188,41 +188,52 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
return pipeline.Run(hlo_module).status();
}
// Invokes the ptxas tool on the given PTX string, and dumps its output.
void DumpPtxasInfo(const string& ptx) {
// Invokes the ptxas tool on the given PTX string, and stores the resulting
// SASS in *cubin. If -v 2 or greater, runs ptxas with -v and dumps the
// resulting stderr (which contains register allocation info, etc.)
// to VLOG(2). If ptxas binary is not found *sass is set to "".
Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
string* cubin) {
*cubin = "";
const string ptxas_path =
tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
// Do not log PTX stats if ptxas is not found at the given path.
if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
LOG(WARNING)
<< "Failed to dump PTX stats because ptxas is not found at path \""
<< ptxas_path << "\".";
return;
}
LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));
// Write `ptx` into a temporary file.
char tempdir_template[] = "/tmp/ptxXXXXXX";
char* tempdir_name = mkdtemp(tempdir_template);
CHECK_NOTNULL(tempdir_name);
string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
TF_CHECK_OK(
tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
LOG(INFO) << "ptx file written to: " << ptx_path;
// Invoke ptxas and collect its output.
tensorflow::SubProcess ptxas_info_dumper;
ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
"/dev/null", "-v", "-arch=sm_35"});
ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
tensorflow::ACTION_PIPE);
CHECK(ptxas_info_dumper.Start());
string stderr_output;
int exit_status = ptxas_info_dumper.Communicate(
/*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
XLA_LOG_LINES(tensorflow::INFO, stderr_output);
if (exit_status != 0) {
LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
tensorflow::SubProcess ptxas_info;
string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
if (VLOG_IS_ON(2)) {
ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
"-arch", arch, ptx_path});
} else {
ptxas_info.SetProgram(
ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
}
ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
CHECK(ptxas_info.Start());
string stderr_output;
int ptxas_exit_status = ptxas_info.Communicate(
/*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
TF_RET_CHECK(ptxas_exit_status == 0);
return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
cubin);
}
} // namespace
@ -298,10 +309,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
// Reserve space for the PTX to be generated for this module.
string* ptx;
string* cubin;
{
tensorflow::mutex_lock lock(mutex_);
generated_ptxes_.emplace_back(MakeUnique<string>());
ptx = generated_ptxes_.back().get();
generated_cubins_.emplace_back(MakeUnique<string>());
cubin = generated_cubins_.back().get();
}
int cc_major, cc_minor;
if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@ -318,9 +333,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
VLOG(2) << "PTX:";
XLA_VLOG_LINES(2, *ptx);
if (VLOG_IS_ON(2)) {
DumpPtxasInfo(*ptx);
}
auto thunk_schedule = MakeUnique<ThunkSchedule>(
ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@ -328,9 +340,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
VLOG(2) << "Printing the thunk schedule...";
XLA_VLOG_LINES(2, thunk_schedule->ToString());
TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
auto* gpu_executable =
new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
std::move(thunk_schedule), std::move(hlo_module),
std::move(module_config), std::move(buffer_assignment));
if (flags->xla_gpu_embed_ir) {
DCHECK_NE("", ir_module_string_before_opt);
gpu_executable->set_ir_module_string(ir_module_string_before_opt);

View File

@ -71,6 +71,7 @@ class GpuCompiler : public Compiler {
// StreamExecutor (b/24776264).
tensorflow::mutex mutex_;
std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);
// The size in bytes of a pointer. Used for computing ShapeSizeBytes.
int64 pointer_size_;

View File

@ -107,13 +107,17 @@ class HloExecutionProfiler {
// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
tensorflow::StringPiece ptx,
std::pair<int, int> compute_capability,
std::unique_ptr<ThunkSchedule> thunk_schedule,
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModuleConfig> module_config,
std::unique_ptr<BufferAssignment> assignment)
: Executable(std::move(hlo_module), std::move(module_config)),
cubin_(cubin),
ptx_(ptx),
compute_capability_(compute_capability),
thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {}
@ -186,6 +190,13 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
// false.
TF_RET_CHECK(!module_config().has_hybrid_result());
// Ensure the compute capability of the cubin and the stream match.
std::pair<int, int> stream_compute_compatibility;
stream->parent()->GetDeviceDescription().cuda_compute_capability(
&stream_compute_compatibility.first,
&stream_compute_compatibility.second);
TF_RET_CHECK(stream_compute_compatibility == compute_capability_);
BufferAllocations::Builder buffer_allocations_builder;
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {

View File

@ -40,15 +40,17 @@ limitations under the License.
namespace xla {
namespace gpu {
// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given CUDA kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
// GPUExecutable should eventually be updated to associate a compute
// capability with the PTX and store multiple cubins, each with their own
// associated CC's, rather than including CC as a property of GpuExecutable.
class GpuExecutable : public Executable {
public:
GpuExecutable(tensorflow::StringPiece ptx,
GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
std::pair<int, int> compute_capability,
std::unique_ptr<ThunkSchedule> thunk_schedule,
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModuleConfig> module_config,
@ -62,7 +64,8 @@ class GpuExecutable : public Executable {
ir_module_string_ = ir_module_string;
}
// Returns the compiled PTX for the computation.
// Returns the compiled CUDA binary for the computation.
tensorflow::StringPiece cubin() const { return cubin_; }
tensorflow::StringPiece ptx() const { return ptx_; }
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@ -104,8 +107,10 @@ class GpuExecutable : public Executable {
// This string should be modified only before ExecuteOnStream.
string ir_module_string_;
// The reference to the compiled PTX for the computation.
const tensorflow::StringPiece ptx_;
// The reference to the compiled PTX & CUDA binary for the computation.
tensorflow::StringPiece cubin_;
tensorflow::StringPiece ptx_;
std::pair<int, int> compute_capability_;
// The thunks to be invoked by this GpuExecutable. They are generated by the
// IrEmitter.

View File

@ -41,13 +41,10 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
// Already initialized by another thread.
return tensorflow::Status::OK();
}
loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
tensorflow::StringPiece ptx = executable.ptx();
// Convert tensorflow::StringPiece to se::port::StringPiece because
// StreamExecutor uses the latter.
loader_spec_->AddCudaPtxInMemory(
se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
tensorflow::StringPiece cubin = executable.cubin();
loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
return tensorflow::Status::OK();
}

View File

@ -195,7 +195,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
HloInstruction* root = computation->root_instruction();
EXPECT_THAT(root, op::Constant());
EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));
using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
bool matched = true;

View File

@ -128,70 +128,6 @@ StatusOr<GlobalDataHandle> LocalService::AllocateBufferOnDevice(
allocation_size));
}
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
LocalService::CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
computations,
const AotCompilationOptions& options) {
std::vector<std::unique_ptr<HloModule>> hlo_modules;
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
for (const AheadOfTimeComputationInstance& instance : computations) {
TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
computation_tracker_.Resolve(instance.computation));
VersionedComputationHandle versioned_handle =
user_computation->GetVersionedHandle();
// Dump computation proto state if flag is set.
legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
const string& directory_path = flags->xla_dump_computations_to;
if (!directory_path.empty()) {
TF_ASSIGN_OR_RETURN(
std::unique_ptr<SessionModule> session_module,
computation_tracker_.SnapshotComputation(versioned_handle.handle));
string filename = tensorflow::strings::StrCat(
"computation_", versioned_handle.handle.handle(), "__",
session_module->entry().name(), "__version_",
versioned_handle.version);
TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
*session_module));
}
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
computation_tracker_.BuildHloModule(
versioned_handle,
/*include_unreachable_instructions=*/true));
hlo_modules.push_back(std::move(hlo_module));
TF_ASSIGN_OR_RETURN(
std::shared_ptr<const ProgramShape> program_shape,
user_computation->ComputeProgramShape(versioned_handle.version));
module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
HloModuleConfig* module_config = module_configs.back().get();
auto* computation_layout =
module_config->mutable_entry_computation_layout();
if (flags->xla_hlo_profile) {
module_config->enable_hlo_profiling(true);
}
for (int i = 0; i < instance.argument_layouts.size(); ++i) {
const Shape& argument_layout = *instance.argument_layouts[i];
if (ShapeUtil::IsTuple(argument_layout)) {
return Unimplemented("tuple arguments not supported yet");
}
TF_RETURN_IF_ERROR(
computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
argument_layout));
}
TF_RETURN_IF_ERROR(
computation_layout->mutable_result_layout()->CopyLayoutFromShape(
*instance.result_layout));
}
return execute_backend_->compiler()->CompileAheadOfTime(
std::move(hlo_modules), std::move(module_configs), MakeHloDumper(),
options);
}
StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
const ComputationHandle& computation,
const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,

View File

@ -59,22 +59,6 @@ class LocalService : public Service {
const Shape& shape, int device_ordinal,
bool allocate_space_for_deep_copy);
// A description of a computation to compile using CompileAheadOfTime.
struct AheadOfTimeComputationInstance {
ComputationHandle computation;
std::vector<const Shape*> argument_layouts;
const Shape* result_layout = nullptr;
};
// Compiles a list of computations for ahead-of-time execution. This is
// intended for use in static compilation. See
// |LocalClient::CompileAheadOfTime| for additional details.
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(
const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
computations,
const AotCompilationOptions& Options);
// Builds an Executable with the given argument layouts and options. If
// result_layout is non-null, then the executable is compiled to produce a
// result of the given layout.

View File

@ -180,20 +180,24 @@ Service::Service(std::unique_ptr<Backend> execute_backend,
std::unique_ptr<Backend> compute_constant_backend)
: execute_backend_(std::move(execute_backend)),
compute_constant_backend_(std::move(compute_constant_backend)) {
LOG(INFO) << Printf(
"XLA service %p executing computations on platform %s. Devices:", this,
execute_backend_->platform()->Name().c_str());
for (int i = 0; i < execute_backend_->device_count(); ++i) {
if (execute_backend_->device_ordinal_supported(i)) {
se::StreamExecutor* executor =
execute_backend_->stream_executor(i).ValueOrDie();
const auto& description = executor->GetDeviceDescription();
LOG(INFO) << Printf(" StreamExecutor device (%d): %s, %s", i,
description.name().c_str(),
description.platform_version().c_str());
} else {
LOG(INFO) << Printf(" StreamExecutor device (%d) not supported", i);
if (execute_backend_) {
LOG(INFO) << Printf(
"XLA service %p executing computations on platform %s. Devices:", this,
execute_backend_->platform()->Name().c_str());
for (int i = 0; i < execute_backend_->device_count(); ++i) {
if (execute_backend_->device_ordinal_supported(i)) {
se::StreamExecutor* executor =
execute_backend_->stream_executor(i).ValueOrDie();
const auto& description = executor->GetDeviceDescription();
LOG(INFO) << Printf(" StreamExecutor device (%d): %s, %s", i,
description.name().c_str(),
description.platform_version().c_str());
} else {
LOG(INFO) << Printf(" StreamExecutor device (%d) not supported", i);
}
}
} else {
VLOG(1) << "XLA compile-only service constructed";
}
}
@ -286,7 +290,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
const ProgramShape& program_shape,
tensorflow::gtl::ArraySlice<const Allocation*> arguments,
const ExecutionOptions& execution_options) {
const ExecutionOptions& execution_options, Backend* backend) {
auto module_config = MakeUnique<HloModuleConfig>(program_shape);
auto* computation_layout = module_config->mutable_entry_computation_layout();
@ -326,7 +330,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
module_config->enable_hlo_profiling(true);
}
module_config->set_replica_count(execute_backend_->Replicas().size());
module_config->set_replica_count(backend->Replicas().size());
module_config->set_fast_math_disabled(execution_options.disable_fast_math());
module_config->set_seed(execution_options.seed());
@ -474,7 +478,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
std::unique_ptr<Executable> executable_unique_ptr,
BuildExecutable(versioned_handle, std::move(module_config),
/*executable_for_compute_constant=*/false, arguments,
execute_backend_.get(), executor));
backend, executor));
if (profile != nullptr) {
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@ -575,15 +579,14 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
perftools::gputools::DeviceMemoryBase result;
if (backend->Replicas().size() == 1) {
TF_ASSIGN_OR_RETURN(
result,
ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
executable, &run_options[0], profile, execute_backend_.get(),
[&arguments](Executable* executable,
const ServiceExecutableRunOptions* run_options,
HloExecutionProfile* hlo_execution_profile) {
return executable->ExecuteOnStream(run_options, arguments,
hlo_execution_profile);
}));
result, ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
executable, &run_options[0], profile, backend,
[&arguments](Executable* executable,
const ServiceExecutableRunOptions* run_options,
HloExecutionProfile* hlo_execution_profile) {
return executable->ExecuteOnStream(run_options, arguments,
hlo_execution_profile);
}));
} else {
std::vector<
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
@ -666,7 +669,8 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
// the program and the argument allocations.
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
CreateModuleConfig(*program_shape, arg_allocations,
request.execution_options()));
request.execution_options(),
execute_backend_.get()));
VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
<< module_config->entry_computation_layout().ToString();
@ -751,9 +755,10 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
execute_backend_->default_device_ordinal()));
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
CreateModuleConfig(*program_shape, arg_allocations,
arg->execution_options()));
TF_ASSIGN_OR_RETURN(
std::unique_ptr<HloModuleConfig> module_config,
CreateModuleConfig(*program_shape, arg_allocations,
arg->execution_options(), execute_backend_.get()));
VLOG(3) << "Execute created HloModuleConfig computation layout: "
<< module_config->entry_computation_layout().ToString();
@ -818,9 +823,10 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
execute_backend_->default_device_ordinal()));
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
CreateModuleConfig(*program_shape, arg_allocations,
arg->execution_options()));
TF_ASSIGN_OR_RETURN(
std::unique_ptr<HloModuleConfig> module_config,
CreateModuleConfig(*program_shape, arg_allocations,
arg->execution_options(), execute_backend_.get()));
VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
<< module_config->entry_computation_layout().ToString();
@ -1141,7 +1147,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
}
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
CreateModuleConfig(program_shape, {}, execution_options));
CreateModuleConfig(program_shape, {}, execution_options,
compute_constant_backend_.get()));
TF_ASSIGN_OR_RETURN(
std::shared_ptr<Executable> executable,

View File

@ -265,11 +265,11 @@ class Service : public ServiceInterface {
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
const Backend* backend, int device_ordinal);
// Create a Hlo module config foe the given program shape and arguments.
// Create a Hlo module config for the given program shape and arguments.
StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
const ProgramShape& program_shape,
tensorflow::gtl::ArraySlice<const Allocation*> arguments,
const ExecutionOptions& execution_options);
const ExecutionOptions& execution_options, Backend* backend);
// Builds an Executable for the given parameters. If
// executable_for_compute_constant is true, then the executable is intended to

View File

@ -728,9 +728,17 @@ Status ForEachMutableSubshapeHelper(
new_shape.add_dimensions(dim);
}
if (shape.has_layout()) {
new_shape.mutable_layout()->clear_minor_to_major();
Layout* new_layout = new_shape.mutable_layout();
new_layout->clear_minor_to_major();
for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
new_shape.mutable_layout()->add_minor_to_major(index);
new_layout->add_minor_to_major(index);
}
if (shape.layout().padded_dimensions_size() > 0) {
new_layout->clear_padded_dimensions();
for (auto dim :
Permute(permutation, shape.layout().padded_dimensions())) {
new_layout->add_padded_dimensions(dim);
}
}
}
return new_shape;
@ -1057,7 +1065,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
DCHECK_EQ(count.size(), base.size());
const Layout& layout = shape.layout();
int64 rank = layout.minor_to_major_size();
int64 n = 0;
// Allows handling R0 arrays, such that the visitor function will be called
// once with the proper empty indexes.
int64 n = -1;
std::vector<int64> indexes(base.begin(), base.end());
while (n < rank && visitor_function(indexes)) {
// Increments dimensions in minor to major order.

View File

@ -42,7 +42,7 @@ xla::Computation Doubler(xla::Client* client) {
int main(int argc, char** argv) {
tensorflow::port::InitMain(argv[0], &argc, &argv);
auto client = xla::ClientLibrary::LocalClientOrDie();
auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie();
xla::ComputationBuilder builder(client, "aot_test_helper");
auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
@ -74,7 +74,7 @@ int main(int argc, char** argv) {
llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string));
xla::Computation computation = builder.Build().ConsumeValueOrDie();
xla::LocalClient::AheadOfTimeComputationInstance instance{
xla::CompileOnlyClient::AotComputationInstance instance{
&computation, /*argument_layouts=*/{&opaque_shape}, &r0f32};
xla::cpu::CpuAotCompilationOptions options(

View File

@ -153,16 +153,26 @@ string Reindent(tensorflow::StringPiece original,
});
}
bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank) {
if (rank != permutation.size()) {
return false;
}
std::vector<int64> output(permutation.size(), -1);
for (auto index : permutation) {
CHECK_GE(index, 0);
CHECK_LT(index, rank);
output[index] = 0;
}
return std::find(output.begin(), output.end(), -1) == output.end();
}
std::vector<int64> InversePermutation(
tensorflow::gtl::ArraySlice<int64> input_permutation) {
DCHECK(IsPermutation(input_permutation, input_permutation.size()));
std::vector<int64> output_permutation(input_permutation.size(), -1);
for (size_t i = 0; i < input_permutation.size(); ++i) {
output_permutation[input_permutation[i]] = i;
}
DCHECK_EQ(
0, std::count(output_permutation.begin(), output_permutation.end(), -1));
DCHECK(std::is_permutation(input_permutation.begin(), input_permutation.end(),
output_permutation.begin()));
return output_permutation;
}

View File

@ -177,6 +177,9 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
string Reindent(tensorflow::StringPiece original,
tensorflow::StringPiece indentation);
// Checks whether permutation is a permutation of the [0, rank) integer range.
bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
// Applies `permutation` on `input` and returns the permuted array.
// For each i, output[permutation[i]] = input[i].
//
@ -187,12 +190,11 @@ template <template <typename...> class C, typename T>
std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
C<T> input_) {
tensorflow::gtl::ArraySlice<T> input(input_);
CHECK_EQ(permutation.size(), input.size());
CHECK(IsPermutation(permutation, input.size()));
std::vector<T> output(input.size());
for (size_t i = 0; i < permutation.size(); ++i) {
output[permutation[i]] = input[i];
}
DCHECK(std::is_permutation(input.begin(), input.end(), output.begin()));
return output;
}

View File

@ -121,6 +121,7 @@ set(tf_proto_text_srcs
"tensorflow/core/protobuf/cluster.proto"
"tensorflow/core/protobuf/config.proto"
"tensorflow/core/protobuf/debug.proto"
"tensorflow/core/protobuf/device_properties.proto"
"tensorflow/core/protobuf/rewriter_config.proto"
"tensorflow/core/protobuf/tensor_bundle.proto"
"tensorflow/core/protobuf/saver.proto"

View File

@ -82,7 +82,7 @@ tf_custom_op_py_library(
cuda_py_test(
name = "cudnn_rnn_ops_test",
size = "small",
size = "medium",
srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
additional_deps = [
":cudnn_rnn_py",

View File

@ -40,6 +40,7 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"
#if GOOGLE_CUDA
#include "tensorflow/core/platform/stream_executor.h"
@ -67,7 +68,7 @@ limitations under the License.
* TensorFlow is responsible for making sure the memory is alive long enough
* and recycles afterwards.
*
*/
*/
namespace tensorflow {
using CPUDevice = Eigen::ThreadPoolDevice;
@ -106,6 +107,7 @@ using perftools::gputools::DeviceMemory;
using perftools::gputools::DeviceMemoryBase;
using perftools::gputools::ScratchAllocator;
using perftools::gputools::port::StatusOr;
using strings::Printf;
Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
if (str == "rnn_relu") {
@ -203,9 +205,10 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
}
inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
static_cast<int>(s.code())),
s.error_message());
return s.ok() ? Status::OK()
: Status(static_cast<tensorflow::error::Code>(
static_cast<int>(s.code())),
s.error_message());
}
template <typename T>
@ -244,8 +247,7 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
// allocator.
allocated_tensors_.push_back(temporary_memory);
total_byte_size_ += byte_size;
return perftools::gputools::port::StatusOr<
perftools::gputools::DeviceMemory<uint8>>(
return StatusOr<DeviceMemory<uint8>>(
AsDeviceMemory<uint8>(&temporary_memory));
}
int64 TotalByteSize() { return total_byte_size_; }
@ -296,6 +298,43 @@ class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
int output_index_;
};
// A helper to allocate persistent memory for Cudnn RNN models, which is
// expected to live between kernel invocations.
// This class is not thread-safe.
class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
public:
CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
: context_(context) {}
virtual ~CudnnRNNPersistentSpaceAllocator() {}
int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
return std::numeric_limits<int64>::max();
}
StatusOr<DeviceMemory<uint8>> AllocateBytes(
perftools::gputools::Stream* stream, int64 byte_size) override {
if (total_byte_size_ != 0) {
return Status(error::FAILED_PRECONDITION,
"Persistent space allocator can only be called once");
}
Status allocation_status = context_->allocate_persistent(
DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
if (!allocation_status.ok()) {
return ToExecutorStatus(allocation_status);
}
total_byte_size_ += byte_size;
return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
}
int64 TotalByteSize() { return total_byte_size_; }
private:
int64 total_byte_size_ = 0;
PersistentTensor handle_;
OpKernelContext* context_; // not owned
};
struct CudnnModelTypes {
RnnMode rnn_mode;
TFRNNInputMode rnn_input_mode;
@ -317,6 +356,16 @@ struct CudnnModelShapes {
TensorShape input_shape;
TensorShape output_shape;
TensorShape hidden_state_shape;
// At present only fields related to cached RnnDescriptor are concerned.
bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
return num_layers == rhs.num_layers && input_size == rhs.input_size &&
num_units == rhs.num_units && dir_count == rhs.dir_count;
}
string RnnDescDebugString() {
return strings::Printf(
"[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
num_layers, input_size, num_units, dir_count);
}
};
// Extract and checks the forward input tensors, parameters, and shapes from the
@ -399,11 +448,23 @@ void RestoreParams(const OpInputList params_input,
} // namespace
// Note: all following kernels depend on a RnnDescriptor instance, which
// according to Cudnn official doc should be kept around and reused across all
// Cudnn kernels in the same model.
// In Tensorflow, we don't pass the reference across different OpKernels,
// rather, recreate it separately in each OpKernel, which does no cause issue:
// CudnnDropoutDescriptor keeps a reference to a memory for
// random number generator state. During recreation, this state is lost.
// However, only forward-pass Cudnn APIs make use of the state.
// A common base class for RNN kernels. It extracts common attributes and
// shape validations.
class CudnnRNNKernelCommon : public OpKernel {
protected:
CudnnRNNKernelCommon(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
string str;
OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
@ -413,6 +474,10 @@ class CudnnRNNKernelCommon : public OpKernel {
OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
OP_REQUIRES_OK(
context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
// Reset CudnnRnnDescriptor and related random number generate states in
// every Compute() call.
OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
false, &reset_rnd_gen_state_));
}
bool HasInputC() const { return model_types_.HasInputC(); }
@ -422,6 +487,9 @@ class CudnnRNNKernelCommon : public OpKernel {
return model_types_.rnn_direction_mode;
}
CudnnModelTypes model_types() const { return model_types_; }
float dropout() const { return dropout_; }
uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
bool ResetRndGenState() { return reset_rnd_gen_state_; }
template <typename T>
Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
@ -448,11 +516,14 @@ class CudnnRNNKernelCommon : public OpKernel {
RnnInputMode input_mode;
TF_RETURN_IF_ERROR(
ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
auto* stream = context->op_device_context()->stream();
// ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
// random number generator, therefore set state_allocator to nullptr.
auto rnn_desc_s = stream->parent()->createRnnDescriptor(
num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
rnn_mode(), ToDataType<T>::value, 0.f /*dropout*/, 0 /*seed*/,
nullptr /*state_allocator*/);
rnn_mode(), ToDataType<T>::value, dropout(), seed(),
nullptr /* state_allocator */);
if (!rnn_desc_s.ok()) {
return FromExecutorStatus(rnn_desc_s);
}
@ -461,6 +532,11 @@ class CudnnRNNKernelCommon : public OpKernel {
}
private:
int seed_;
int seed2_;
float dropout_;
bool reset_rnd_gen_state_;
CudnnModelTypes model_types_;
};
@ -560,9 +636,8 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
context->set_output(i, input.Slice(start, end));
} else {
Tensor* output = nullptr;
OP_REQUIRES_OK(
context,
context->allocate_output(i, TensorShape({width, height}), &output));
OP_REQUIRES_OK(context, context->allocate_output(
i, TensorShape({width, height}), &output));
DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
input_ptr, rnn_desc->ParamsWeightRegions()[i].offset,
size_in_bytes);
@ -571,14 +646,17 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
}
}
CHECK(num_params_ == rnn_desc->ParamsBiasRegions().size())
<< "Number of params mismatch. Expected " << num_params_ << ", got "
<< rnn_desc->ParamsBiasRegions().size();
OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
errors::InvalidArgument("Number of params mismatch. Expected ",
num_params_, ", got ",
rnn_desc->ParamsBiasRegions().size()));
for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
int64 size = size_in_bytes / sizeof(T);
CHECK(size == num_units) << "Params size mismatch. Expected " << num_units
<< ", got " << size;
OP_REQUIRES(context, size == num_units,
errors::InvalidArgument("Params size mismatch. Expected ",
num_units, ", got ", size));
// If data is aligned, use slice view to avoid expensive memcpy.
bool start_aligned =
rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0;
@ -698,16 +776,32 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
OP_REQUIRES_OK(context,
ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
model_shapes.input_size, &input_mode));
// TODO(zhengxq): add dropout support.
// TODO(zhengxq): cache the descriptor so we don't have to create them all
// the time.
auto data_type = ToDataType<T>::value;
auto rnn_desc_s = executor->createRnnDescriptor(
model_shapes.num_layers, model_shapes.num_units,
model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
{
mutex_lock l(mu_);
if (model_shapes_ == nullptr) {
model_shapes_.reset(new CudnnModelShapes(model_shapes));
} else {
OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
errors::InvalidArgument(
"Incompatible rnn model shapes inferred: expecting ",
model_shapes_->RnnDescDebugString(), ", getting ",
model_shapes.RnnDescDebugString(), "."));
}
if (rnn_desc_ == nullptr || ResetRndGenState()) {
dropout_state_allocator_.reset(
new CudnnRNNPersistentSpaceAllocator(context));
auto rnn_desc_s = executor->createRnnDescriptor(
model_shapes_->num_layers, model_shapes_->num_units,
model_shapes_->input_size, input_mode, rnn_direction_mode(),
rnn_mode(), data_type, dropout(), seed(),
dropout_state_allocator_.get());
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
}
}
auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
input_shape.dim_size(0), input_shape.dim_size(1),
@ -753,21 +847,30 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
// Creates a memory callback for the workspace. The memory lives to the end
// of this kernel calls.
CudnnRNNWorkspaceAllocator workspace_allocator(context);
bool launch_status =
stream
->ThenRnnForward(
*rnn_desc, *input_desc, input_data, *hidden_state_desc,
input_h_data, *hidden_state_desc, input_c_data, params_data,
*output_desc, &output_data, *hidden_state_desc, &output_h_data,
*hidden_state_desc, &output_c_data, is_training_,
&reserve_space_allocator, &workspace_allocator)
.ok();
bool launch_status = false;
{
mutex_lock l(mu_);
launch_status =
stream
->ThenRnnForward(
*rnn_desc_, *input_desc, input_data, *hidden_state_desc,
input_h_data, *hidden_state_desc, input_c_data, params_data,
*output_desc, &output_data, *hidden_state_desc,
&output_h_data, *hidden_state_desc, &output_c_data,
is_training_, &reserve_space_allocator, &workspace_allocator)
.ok();
}
OP_REQUIRES(context, launch_status,
errors::Internal("Failed to call ThenRnnForward"));
}
private:
mutex mu_;
bool is_training_;
std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
GUARDED_BY(mu_);
};
REGISTER_KERNEL_BUILDER(
@ -808,9 +911,9 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
const Tensor* output_h = nullptr;
OP_REQUIRES_OK(context, context->input("output_h", &output_h));
OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
errors::InvalidArgument("Invalid output_h shape: ",
output_h->shape().DebugString(), " ",
hidden_state_shape.DebugString()));
errors::InvalidArgument(
"Invalid output_h shape: ", output_h->shape().DebugString(),
" ", hidden_state_shape.DebugString()));
const Tensor* output_c = nullptr;
if (HasInputC()) {
// Only LSTM uses input_c and output_c. So for all other models, we only
@ -881,15 +984,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
OP_REQUIRES_OK(context,
ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
model_shapes.input_size, &input_mode));
// TODO(zhengxq): add dropout support.
// TODO(zhengxq): cache the descriptor so we don't have to create them all
// the time.
auto rnn_desc_s = executor->createRnnDescriptor(
model_shapes.num_layers, model_shapes.num_units,
model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
{
mutex_lock l(mu_);
if (model_shapes_ == nullptr) {
model_shapes_.reset(new CudnnModelShapes(model_shapes));
} else {
OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
errors::InvalidArgument(
"Incompatible rnn model shapes inferred: expecting ",
model_shapes_->RnnDescDebugString(), ", getting ",
model_shapes.RnnDescDebugString(), "."));
}
if (rnn_desc_ == nullptr || ResetRndGenState()) {
dropout_state_allocator_.reset(
new CudnnRNNPersistentSpaceAllocator(context));
auto rnn_desc_s = executor->createRnnDescriptor(
model_shapes.num_layers, model_shapes.num_units,
model_shapes.input_size, input_mode, rnn_direction_mode(),
rnn_mode(), data_type, dropout(), seed(),
dropout_state_allocator_.get());
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
}
}
auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
input_shape.dim_size(0), input_shape.dim_size(1),
@ -939,21 +1059,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
// Creates a memory callback for the workspace. The memory lives to the end
// of this kernel calls.
CudnnRNNWorkspaceAllocator workspace_allocator(context);
bool launch_status =
stream
->ThenRnnBackward(
*rnn_desc, *input_desc, input_data, *hidden_state_desc,
input_h_data, *hidden_state_desc, input_c_data, params_data,
*output_desc, output_data, *hidden_state_desc, output_h_data,
*hidden_state_desc, output_c_data, output_backprop_data,
output_h_backprop_data, output_c_backprop_data,
&input_backprop_data, &input_h_backprop_data,
&input_c_backprop_data, &params_backprop_data,
&reserve_space_uint8, &workspace_allocator)
.ok();
bool launch_status = false;
{
mutex_lock l(mu_);
launch_status =
stream
->ThenRnnBackward(
*rnn_desc_, *input_desc, input_data, *hidden_state_desc,
input_h_data, *hidden_state_desc, input_c_data, params_data,
*output_desc, output_data, *hidden_state_desc, output_h_data,
*hidden_state_desc, output_c_data, output_backprop_data,
output_h_backprop_data, output_c_backprop_data,
&input_backprop_data, &input_h_backprop_data,
&input_c_backprop_data, &params_backprop_data,
&reserve_space_uint8, &workspace_allocator)
.ok();
}
OP_REQUIRES(context, launch_status,
errors::Internal("Failed to call ThenRnnBackward"));
}
private:
mutex mu_;
std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
GUARDED_BY(mu_);
};
REGISTER_KERNEL_BUILDER(

View File

@ -35,6 +35,9 @@ input_mode: Indicate whether there is a linear projection between the input and
input_size == num_units; otherwise, it implies 'linear_input'.
direction: Indicates whether a bidirectional model will be used.
dir = (direction == bidirectional) ? 2 : 1
dropout: dropout probability. When set to 0., dropout is disabled.
seed: the 1st part of a seed to initialize dropout.
seed2: the 2nd part of a seed to initialize dropout.
)doc";
constexpr auto kCudnnRNNParamsBuffer = R"doc(
@ -77,6 +80,9 @@ REGISTER_OP("CudnnRNNParamsSize")
.Attr(kRNNModeAttrs)
.Attr(kRNNInputModeAttrs)
.Attr(kRNNDirectionAttrs)
.Attr("dropout: float = 0.0")
.Attr("seed: int = 0")
.Attr("seed2: int = 0")
.Output("params_size: S")
.SetShapeFn([](InferenceContext* c) {
c->set_output(0, c->Vector(1));
@ -119,6 +125,7 @@ REGISTER_OP("CudnnRNN")
.Input("input_h: T")
.Input("input_c: T")
.Input("params: T")
.SetIsStateful()
.Output("output: T")
.Output("output_h: T")
.Output("output_c: T")
@ -127,7 +134,7 @@ REGISTER_OP("CudnnRNN")
.Attr(kRNNModeAttrs)
.Attr(kRNNInputModeAttrs)
.Attr(kRNNDirectionAttrs)
.Attr("dropout: float")
.Attr("dropout: float = 0.0")
.Attr("seed: int = 0")
.Attr("seed2: int = 0")
.Attr("is_training: bool = true")
@ -158,7 +165,8 @@ REGISTER_OP("CudnnRNN")
Computes the RNN from the input and initial states, with respect to the params
buffer.
)doc",
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
R"doc(
is_training: Indicates whether this operation is used for inferenece or
training.
reserve_space: an opaque tensor that can be used in backprop calculation. It
@ -185,6 +193,9 @@ REGISTER_OP("CudnnRNNBackprop")
.Attr(kRNNModeAttrs)
.Attr(kRNNInputModeAttrs)
.Attr(kRNNDirectionAttrs)
.Attr("dropout: float = 0.0")
.Attr("seed: int = 0")
.Attr("seed2: int = 0")
.SetShapeFn([](InferenceContext* c) {
auto input_shape = c->input(0);
auto input_h_shape = c->input(1);
@ -199,7 +210,8 @@ REGISTER_OP("CudnnRNNBackprop")
.Doc(strings::StrCat(R"doc(
Compute the backprop of both data and weights in a RNN.
)doc",
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
R"doc(
output_backprop: A 3-D tensor with the same shape as output in the forward pass.
output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
pass.
@ -228,6 +240,9 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
.Attr(kRNNModeAttrs)
.Attr(kRNNInputModeAttrs)
.Attr(kRNNDirectionAttrs)
.Attr("dropout: float = 0.0")
.Attr("seed: int = 0")
.Attr("seed2: int = 0")
.SetShapeFn([](InferenceContext* c) {
ShapeHandle unused;
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused));
@ -268,6 +283,9 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
.Attr(kRNNModeAttrs)
.Attr(kRNNInputModeAttrs)
.Attr(kRNNDirectionAttrs)
.Attr("dropout: float = 0.0")
.Attr("seed: int = 0")
.Attr("seed2: int = 0")
.SetShapeFn([](InferenceContext* c) {
c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
return Status::OK();
@ -281,7 +299,6 @@ upcoming training or inferences.
num_params: number of parameter sets for all layers.
Each layer may contain multiple parameter sets, with each set consisting of
a weight matrix and a bias vector.
)doc",
kCudnnRNNCommonAttrs));
)doc", kCudnnRNNCommonAttrs));
} // namespace tensorflow

View File

@ -38,15 +38,24 @@ from tensorflow.python.training import saver as saver_lib
class CudnnRNNTest(TensorFlowTestCase):
def _CreateModel(self, rnn_mode, num_layers, num_units, input_size):
def _CreateModel(self,
rnn_mode,
num_layers,
num_units,
input_size,
dropout=0.):
if rnn_mode == "lstm":
model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size)
model = cudnn_rnn_ops.CudnnLSTM(
num_layers, num_units, input_size, dropout=dropout)
elif rnn_mode == "gru":
model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size)
model = cudnn_rnn_ops.CudnnGRU(
num_layers, num_units, input_size, dropout=dropout)
elif rnn_mode == "rnn_tanh":
model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size)
model = cudnn_rnn_ops.CudnnRNNTanh(
num_layers, num_units, input_size, dropout=dropout)
elif rnn_mode == "rnn_relu":
model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size)
model = cudnn_rnn_ops.CudnnRNNRelu(
num_layers, num_units, input_size, dropout=dropout)
else:
raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
return model
@ -174,9 +183,11 @@ class CudnnRNNTest(TensorFlowTestCase):
self._testOneLSTMParamsSize(num_layers, num_units, input_size)
def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
batch_size, seq_length, dir_count, expected,
tolerance):
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
batch_size, seq_length, dir_count, dropout,
expected, tolerance):
random_seed.set_random_seed(5678)
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
dropout)
has_input_c = (rnn_mode == "lstm")
params_size_t = model.params_size()
input_data = array_ops.ones([seq_length, batch_size, input_size])
@ -206,18 +217,24 @@ class CudnnRNNTest(TensorFlowTestCase):
with self.test_session(use_gpu=True) as sess:
sess.run(variables.global_variables_initializer())
total_sum_v = sess.run([total_sum])
self.assertAllClose(
total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
@unittest.skipUnless(test.is_built_with_cuda(),
"Test only applicable when running on GPUs")
def testSimpleInference(self):
# Cudnn scales result for dropout during training, therefore dropout has no
# impact for inference results.
# (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
# demonstrative of the dropout-invariant nature of CudnnRnn.)
test_configs = [
[
"lstm",
231833.22,
1e-2,
{
{
"rnn_mode": "lstm",
"dropout": [0., 0.5, 1.],
"expected": 231833.22,
"tolerance": 1e-2,
"shape": {
"num_layers": 4,
"num_units": 200,
"input_size": 200,
@ -225,12 +242,13 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 10,
"dir_count": 1,
},
],
[
"gru",
56000,
1e-2,
{
},
{
"rnn_mode": "gru",
"dropout": [0., 0.5, 1.],
"expected": 56000,
"tolerance": 1e-2,
"shape": {
"num_layers": 4,
"num_units": 200,
"input_size": 200,
@ -238,12 +256,13 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 10,
"dir_count": 1,
},
],
[
"rnn_tanh",
56000,
1e-2,
{
},
{
"rnn_mode": "rnn_tanh",
"dropout": [0., 0.5, 1.],
"expected": 56000,
"tolerance": 1e-2,
"shape": {
"num_layers": 4,
"num_units": 200,
"input_size": 200,
@ -251,12 +270,13 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 10,
"dir_count": 1,
},
],
[
"rnn_relu",
130688,
1e-2,
{
},
{
"rnn_mode": "rnn_relu",
"dropout": [0., 0.5, 1.],
"expected": 130688,
"tolerance": 1e-2,
"shape": {
"num_layers": 2,
"num_units": 8,
"input_size": 4,
@ -264,24 +284,32 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 2,
"dir_count": 1,
},
],
},
]
with ops.Graph().as_default():
for config in test_configs:
rnn_mode = config[0]
expected = config[1]
tolerance = config[2]
shapes = config[3]
self._testOneSimpleInference(rnn_mode, shapes["num_layers"],
shapes["num_units"], shapes["input_size"],
shapes["batch_size"], shapes["seq_length"],
shapes["dir_count"], expected, tolerance)
rnn_mode = config["rnn_mode"]
dropout_list = config.get("dropout", [0.])
expected = config["expected"]
tolerance = config["tolerance"]
shape = config["shape"]
for dropout in dropout_list:
self._testOneSimpleInference(
rnn_mode, shape["num_layers"], shape["num_units"],
shape["input_size"], shape["batch_size"], shape["seq_length"],
shape["dir_count"], dropout, expected, tolerance)
def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
batch_size, seq_length, dir_count, tolerance):
batch_size, seq_length, dir_count, dropout,
tolerance):
# Gradient checking runs two forward ops with almost the same input. Need to
# make sure the drop patterns across the two runs are the same.
old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
has_input_c = (rnn_mode == "lstm")
random_seed.set_random_seed(1234)
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
dropout)
params_size_t = model.params_size()
input_data = variables.Variable(
random_ops.random_uniform([seq_length, batch_size, input_size]))
@ -294,6 +322,7 @@ class CudnnRNNTest(TensorFlowTestCase):
input_c = variables.Variable(
random_ops.random_uniform(
[num_layers * dir_count, batch_size, num_units]))
output, output_h, output_c = model(
input_data=input_data,
input_h=input_h,
@ -322,18 +351,22 @@ class CudnnRNNTest(TensorFlowTestCase):
sess.run(variables.global_variables_initializer())
all_inputs = [entry[0] for entry in inputs_and_shapes]
all_shapes = [entry[1] for entry in inputs_and_shapes]
err = gradient_checker.compute_gradient_error(all_inputs, all_shapes,
total_sum, [1])
self.assertLess(err, tolerance)
os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
@unittest.skipUnless(test.is_built_with_cuda(),
"Test only applicable when running on GPUs")
def testSimpleTraining(self):
test_configs = [
[
"lstm",
1e-2,
{
{
"rnn_mode": "lstm",
"dropout": [0., 0.5, 1.],
"tolerance": 1e-2,
"shape": {
"num_layers": 2,
"num_units": 3,
"input_size": 4,
@ -341,11 +374,12 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 4,
"dir_count": 1,
},
],
[
"gru",
4e-3,
{
},
{
"rnn_mode": "gru",
"dropout": [0., 0.5, 1.],
"tolerance": 4e-3,
"shape": {
"num_layers": 2,
"num_units": 3,
"input_size": 4,
@ -353,11 +387,12 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 4,
"dir_count": 1,
},
],
[
"rnn_tanh",
5e-3,
{
},
{
"rnn_mode": "rnn_tanh",
"dropout": [0., 0.5, 1.],
"tolerance": 5e-3,
"shape": {
"num_layers": 2,
"num_units": 3,
"input_size": 4,
@ -365,11 +400,12 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 4,
"dir_count": 1,
},
],
[
"rnn_relu",
3e-1,
{
},
{
"rnn_mode": "rnn_relu",
"dropout": [0., 0.5, 1.],
"tolerance": 4e-1,
"shape": {
"num_layers": 2,
"num_units": 3,
"input_size": 4,
@ -377,17 +413,19 @@ class CudnnRNNTest(TensorFlowTestCase):
"seq_length": 4,
"dir_count": 1,
},
],
},
]
with ops.Graph().as_default():
for config in test_configs:
rnn_mode = config[0]
tolerance = config[1]
shape = config[2]
self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
shape["num_units"], shape["input_size"],
shape["batch_size"], shape["seq_length"],
shape["dir_count"], tolerance)
rnn_mode = config["rnn_mode"]
dropout_list = config.get("dropout", [0.])
tolerance = config["tolerance"]
shape = config["shape"]
for dropout in dropout_list:
self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
shape["num_units"], shape["input_size"],
shape["batch_size"], shape["seq_length"],
shape["dir_count"], dropout, tolerance)
if __name__ == "__main__":

View File

@ -23,13 +23,13 @@ from tensorflow.contrib.util import loader
from tensorflow.python.framework import common_shapes
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.platform import resource_loader
from tensorflow.python.training import saver
_cudnn_rnn_ops_so = loader.load_op_library(
resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
@ -110,12 +110,12 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
if not isinstance(params, tuple):
params = (params,)
assign_ops = [
state_ops.assign(
variable, param, validate_shape=False)
state_ops.assign(variable, param, validate_shape=False)
for variable, param in zip(self._variables, params)
]
return control_flow_ops.group(*assign_ops)
_cudnn_rnn_common_doc_string = """
Cudnn RNN has an opaque parameter buffer that can be used for inference and
training. But it is possible that the layout of the parameter buffers
@ -163,8 +163,7 @@ class _CudnnRNN(object):
input_mode="auto_select",
direction="unidirectional",
dropout=0.,
seed=0,
seed2=0):
seed=0):
"""Creates a CudnnRNN model from model spec.
Args:
@ -183,8 +182,8 @@ class _CudnnRNN(object):
direction: the direction model that the model operates. Could be either
'unidirectional' or 'bidirectional'
dropout: whether to enable dropout. With it is 0, dropout is disabled.
seed: the first part of a seed that is used to initialize dropout.
seed2: the second part of a seed that is used to initialize dropout.
seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
for behavior.
"""
self._num_layers = num_layers
self._num_units = num_units
@ -193,8 +192,10 @@ class _CudnnRNN(object):
self._input_mode = input_mode
self._direction = direction
self._dropout = dropout
self._seed = seed
self._seed2 = seed2
# get graph and op seed.
self._seed, self._seed2 = random_seed.get_seed(seed)
if self._seed is None and self._seed2 is None:
self._seed, self._seed2 = 0, 0
def params_size(self):
"""Calculates the size of the opaque parameter buffer needed for this model.
@ -208,6 +209,9 @@ class _CudnnRNN(object):
input_size=self._input_size,
T=dtypes.float32,
S=dtypes.int32,
dropout=self._dropout,
seed=self._seed,
seed2=self._seed2,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
direction=self._direction)[0]
@ -258,6 +262,9 @@ class _CudnnRNN(object):
num_units=self._num_units,
input_size=self._input_size,
params=params,
dropout=self._dropout,
seed=self._seed,
seed2=self._seed2,
num_params=self._num_layers * self._NUM_PARAMS_PER_LAYER,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
@ -280,6 +287,9 @@ class _CudnnRNN(object):
input_size=self._input_size,
weights=weights,
biases=biases,
dropout=self._dropout,
seed=self._seed,
seed2=self._seed2,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
direction=self._direction)
@ -299,8 +309,7 @@ class CudnnLSTM(_CudnnRNN):
input_mode="auto_select",
direction="unidirectional",
dropout=0.,
seed=0,
seed2=0):
seed=0):
"""Creates a Cudnn LSTM model from model spec.
Args:
@ -317,8 +326,7 @@ class CudnnLSTM(_CudnnRNN):
direction: the direction model that the model operates. Could be either
'unidirectional' or 'bidirectional'
dropout: whether to enable dropout. With it is 0, dropout is disabled.
seed: the first part of a seed that is used to initialize dropout.
seed2: the second part of a seed that is used to initialize dropout.
seed: the seed used for initializing dropout.
"""
super(CudnnLSTM, self).__init__(
"lstm",
@ -328,8 +336,7 @@ class CudnnLSTM(_CudnnRNN):
input_mode=input_mode,
direction=direction,
dropout=dropout,
seed=seed,
seed2=seed2)
seed=seed)
def __call__(self, input_data, input_h, input_c, params, is_training=True):
"""Runs the forward step for the Cudnn LSTM model.
@ -346,11 +353,8 @@ class CudnnLSTM(_CudnnRNN):
output_h: the final state for h.
output_c: the final state for c.
"""
output, output_h, output_c = super(CudnnLSTM, self).__call__(input_data,
input_h,
input_c,
params,
is_training)
output, output_h, output_c = super(CudnnLSTM, self).__call__(
input_data, input_h, input_c, params, is_training=is_training)
return (output, output_h, output_c)
@ -365,8 +369,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
input_mode="auto_select",
direction="unidirectional",
dropout=0.,
seed=0,
seed2=0):
seed=0):
"""Creates a Cudnn RNN model from model without hidden-state C.
Args:
@ -383,8 +386,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
direction: the direction model that the model operates. Could be either
'unidirectional' or 'bidirectional'
dropout: whether to enable dropout. With it is 0, dropout is disabled.
seed: the first part of a seed that is used to initialize dropout.
seed2: the second part of a seed that is used to initialize dropout.
seed: the seed used for initializing dropout.
"""
super(_CudnnRNNNoInputC, self).__init__(
self._rnn_mode,
@ -394,8 +396,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
input_mode=input_mode,
direction=direction,
dropout=dropout,
seed=seed,
seed2=seed2)
seed=seed)
def __call__(self, input_data, input_h, params, is_training=True):
"""Runs the forward step for the Cudnn LSTM model.
@ -459,6 +460,9 @@ def _cudnn_rnn_backward(op, *grad):
output_h_backprop=grad[1],
output_c_backprop=grad[2],
reserve_space=op.outputs[3],
dropout=op.get_attr("dropout"),
seed=op.get_attr("seed"),
seed2=op.get_attr("seed2"),
rnn_mode=op.get_attr("rnn_mode"),
input_mode=op.get_attr("input_mode"),
direction=op.get_attr("direction"))

View File

@ -111,13 +111,11 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":sdca_ops_py",
":sparse_feature_column_py",
"//tensorflow/contrib/framework:framework_py",
"//tensorflow/contrib/layers:layers_py",
"//tensorflow/contrib/learn",
"//tensorflow/python:array_ops",
"//tensorflow/python:dtypes",
"//tensorflow/python:math_ops",
"//tensorflow/python:sparse_tensor",
"//tensorflow/python:tensor_util",
"//tensorflow/python:training",

View File

@ -24,13 +24,10 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
from tensorflow.contrib.learn.python.learn.estimators import prediction_key
from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import session_run_hook
@ -76,131 +73,6 @@ def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
columns_to_variables[bias_column] = [bias_variable]
def _get_sdca_train_step(optimizer, columns_to_variables, weight_column_name,
loss_type, features, targets, global_step):
"""Returns the training operation of an SdcaModel optimizer."""
def _dense_tensor_to_sparse_feature_column(dense_tensor):
"""Returns SparseFeatureColumn for the input dense_tensor."""
ignore_value = 0.0
sparse_indices = array_ops.where(
math_ops.not_equal(dense_tensor,
math_ops.cast(ignore_value, dense_tensor.dtype)))
sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
# TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
# very sparse features with weights and not weights.
return SparseFeatureColumn(
array_ops.reshape(
array_ops.split(value=sparse_indices, num_or_size_splits=2,
axis=1)[0], [-1]),
array_ops.reshape(
array_ops.split(value=sparse_indices, num_or_size_splits=2,
axis=1)[1], [-1]),
array_ops.reshape(math_ops.to_float(sparse_values), [-1]))
def _training_examples_and_variables():
"""Returns dictionaries for training examples and variables."""
batch_size = targets.get_shape()[0]
# Iterate over all feature columns and create appropriate lists for dense
# and sparse features as well as dense and sparse weights (variables) for
# SDCA.
# TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
# dict as 1-dimensional tensors.
dense_features, sparse_features, sparse_feature_with_values = [], [], []
dense_feature_weights = []
sparse_feature_weights, sparse_feature_with_values_weights = [], []
for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
transformed_tensor = features[column]
if isinstance(column, layers.feature_column._RealValuedColumn): # pylint: disable=protected-access
# A real-valued column corresponds to a dense feature in SDCA. A
# transformed tensor corresponding to a RealValuedColumn has rank 2
# (its shape is typically [batch_size, column.dimension]) and so it
# can be passed to SDCA as is.
dense_features.append(transformed_tensor)
# For real valued columns, the variables list contains exactly one
# element.
dense_feature_weights.append(columns_to_variables[column][0])
elif isinstance(column, layers.feature_column._BucketizedColumn): # pylint: disable=protected-access
# A bucketized column corresponds to a sparse feature in SDCA. The
# bucketized feature is "sparsified" for SDCA by converting it to a
# SparseFeatureColumn respresenting the one-hot encoding of the
# bucketized feature.
#
# TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
# bucketized feature column to a dense feature in SDCA. This will likely
# depend on the number of buckets.
dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor) # pylint: disable=protected-access
sparse_feature_column = _dense_tensor_to_sparse_feature_column(
dense_bucket_tensor)
sparse_feature_with_values.append(sparse_feature_column)
# For bucketized columns, the variables list contains exactly one
# element.
sparse_feature_with_values_weights.append(
columns_to_variables[column][0])
elif isinstance(
column,
(
layers.feature_column._CrossedColumn, # pylint: disable=protected-access
layers.feature_column._SparseColumn)): # pylint: disable=protected-access
sparse_features.append(
SparseFeatureColumn(
array_ops.reshape(
array_ops.split(
value=transformed_tensor.indices,
num_or_size_splits=2,
axis=1)[0], [-1]),
array_ops.reshape(transformed_tensor.values, [-1]), None))
sparse_feature_weights.append(columns_to_variables[column][0])
elif isinstance(column, layers.feature_column._WeightedSparseColumn): # pylint: disable=protected-access
id_tensor = column.id_tensor(transformed_tensor)
weight_tensor = column.weight_tensor(transformed_tensor)
sparse_feature_with_values.append(
SparseFeatureColumn(
array_ops.reshape(
array_ops.split(
value=id_tensor.indices, num_or_size_splits=2, axis=1)[
0], [-1]),
array_ops.reshape(id_tensor.values, [-1]),
array_ops.reshape(weight_tensor.values, [-1])))
sparse_feature_with_values_weights.append(
columns_to_variables[column][0])
else:
raise ValueError("SDCAOptimizer does not support column type {}".format(
type(column).__name__))
example_weights = array_ops.reshape(
features[weight_column_name],
shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
example_ids = features[optimizer.example_id_column]
sparse_feature_with_values.extend(sparse_features)
sparse_feature_with_values_weights.extend(sparse_feature_weights)
examples = dict(
sparse_features=sparse_feature_with_values,
dense_features=dense_features,
example_labels=math_ops.to_float(
array_ops.reshape(targets, shape=[-1])),
example_weights=example_weights,
example_ids=example_ids)
sdca_variables = dict(
sparse_features_weights=sparse_feature_with_values_weights,
dense_features_weights=dense_feature_weights)
return examples, sdca_variables
training_examples, training_variables = _training_examples_and_variables()
sdca_model = sdca_ops.SdcaModel(
examples=training_examples,
variables=training_variables,
options=dict(
symmetric_l1_regularization=optimizer.symmetric_l1_regularization,
symmetric_l2_regularization=optimizer.symmetric_l2_regularization,
num_loss_partitions=optimizer.num_loss_partitions,
num_table_shards=optimizer.num_table_shards,
loss_type=loss_type))
train_op = sdca_model.minimize(global_step=global_step)
return sdca_model, train_op
def sdca_model_fn(features, labels, mode, params, config=None):
"""A model_fn for linear models that use the SDCA optimizer.
@ -283,9 +155,9 @@ def sdca_model_fn(features, labels, mode, params, config=None):
def _train_op_fn(unused_loss):
global_step = contrib_variables.get_global_step()
sdca_model, train_op = _get_sdca_train_step(optimizer, columns_to_variables,
weight_column_name, loss_type,
features, labels, global_step)
sdca_model, train_op = optimizer.get_train_step(
columns_to_variables, weight_column_name, loss_type, features, labels,
global_step)
if update_weights_hook is not None:
update_weights_hook.set_parameters(sdca_model, train_op)
return train_op

View File

@ -99,16 +99,16 @@ class SDCAOptimizer(object):
def symmetric_l2_regularization(self):
return self._symmetric_l2_regularization
def get_train_step(self, columns_to_variables,
weight_column_name, loss_type, features, targets,
global_step):
def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
features, targets, global_step):
"""Returns the training operation of an SdcaModel optimizer."""
def _tensor_to_sparse_feature_column(dense_tensor):
def _dense_tensor_to_sparse_feature_column(dense_tensor):
"""Returns SparseFeatureColumn for the input dense_tensor."""
ignore_value = 0.0
sparse_indices = array_ops.where(math_ops.not_equal(
dense_tensor, math_ops.cast(ignore_value, dense_tensor.dtype)))
sparse_indices = array_ops.where(
math_ops.not_equal(dense_tensor,
math_ops.cast(ignore_value, dense_tensor.dtype)))
sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
# TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
# very sparse features with weights and not weights.
@ -133,10 +133,9 @@ class SDCAOptimizer(object):
dense_features, sparse_features, sparse_feature_with_values = [], [], []
dense_feature_weights = []
sparse_feature_weights, sparse_feature_with_values_weights = [], []
# pylint: disable=protected-access
for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
transformed_tensor = features[column]
if isinstance(column, layers.feature_column._RealValuedColumn):
if isinstance(column, layers.feature_column._RealValuedColumn): # pylint: disable=protected-access
# A real-valued column corresponds to a dense feature in SDCA. A
# transformed tensor corresponding to a RealValuedColumn has rank 2
# (its shape is typically [batch_size, column.dimension]) and so it
@ -145,22 +144,28 @@ class SDCAOptimizer(object):
# For real valued columns, the variables list contains exactly one
# element.
dense_feature_weights.append(columns_to_variables[column][0])
elif isinstance(column, layers.feature_column._BucketizedColumn):
elif isinstance(column, layers.feature_column._BucketizedColumn): # pylint: disable=protected-access
# A bucketized column corresponds to a sparse feature in SDCA. The
# bucketized feature is "sparsified" for SDCA by converting it to a
# SparseFeatureColumn respresenting the one-hot encoding of the
# bucketized feature.
dense_bucket_tensor = layers.input_from_feature_columns(
{column: transformed_tensor}, [column])
sparse_feature_column = _tensor_to_sparse_feature_column(
#
# TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
# bucketized feature column to a dense feature in SDCA. This will
# likely depend on the number of buckets.
dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor) # pylint: disable=protected-access
sparse_feature_column = _dense_tensor_to_sparse_feature_column(
dense_bucket_tensor)
sparse_feature_with_values.append(sparse_feature_column)
# For bucketized columns, the variables list contains exactly one
# element.
sparse_feature_with_values_weights.append(
columns_to_variables[column][0])
elif isinstance(column, (layers.feature_column._CrossedColumn,
layers.feature_column._SparseColumn)):
elif isinstance(
column,
(
layers.feature_column._CrossedColumn, # pylint: disable=protected-access
layers.feature_column._SparseColumn)): # pylint: disable=protected-access
sparse_features.append(
SparseFeatureColumn(
array_ops.reshape(
@ -168,10 +173,9 @@ class SDCAOptimizer(object):
value=transformed_tensor.indices,
num_or_size_splits=2,
axis=1)[0], [-1]),
array_ops.reshape(transformed_tensor.values, [-1]),
None))
array_ops.reshape(transformed_tensor.values, [-1]), None))
sparse_feature_weights.append(columns_to_variables[column][0])
elif isinstance(column, layers.feature_column._WeightedSparseColumn):
elif isinstance(column, layers.feature_column._WeightedSparseColumn): # pylint: disable=protected-access
id_tensor = column.id_tensor(transformed_tensor)
weight_tensor = column.weight_tensor(transformed_tensor)
sparse_feature_with_values.append(
@ -183,11 +187,10 @@ class SDCAOptimizer(object):
array_ops.reshape(id_tensor.values, [-1]),
array_ops.reshape(weight_tensor.values, [-1])))
sparse_feature_with_values_weights.append(
columns_to_variables[column][0])
columns_to_variables[column][0])
else:
raise ValueError('SDCAOptimizer does not support column type %s.' %
type(column).__name__)
# pylint: enable=protected-access
example_weights = array_ops.reshape(
features[weight_column_name],
@ -195,12 +198,13 @@ class SDCAOptimizer(object):
example_ids = features[self._example_id_column]
sparse_feature_with_values.extend(sparse_features)
sparse_feature_with_values_weights.extend(sparse_feature_weights)
examples = dict(sparse_features=sparse_feature_with_values,
dense_features=dense_features,
example_labels=math_ops.to_float(array_ops.reshape(
targets, shape=[-1])),
example_weights=example_weights,
example_ids=example_ids)
examples = dict(
sparse_features=sparse_feature_with_values,
dense_features=dense_features,
example_labels=math_ops.to_float(
array_ops.reshape(targets, shape=[-1])),
example_weights=example_weights,
example_ids=example_ids)
sdca_variables = dict(
sparse_features_weights=sparse_feature_with_values_weights,
dense_features_weights=dense_feature_weights)

View File

@ -11,6 +11,7 @@ tensorflow/core/protobuf/cluster.pb.cc
tensorflow/core/protobuf/config.pb.cc
tensorflow/core/protobuf/rewriter_config.pb.cc
tensorflow/core/protobuf/debug.pb.cc
tensorflow/core/protobuf/device_properties.pb.cc
tensorflow/core/lib/core/error_codes.pb.cc
tensorflow/core/framework/versions.pb.cc
tensorflow/core/framework/variable.pb.cc
@ -36,3 +37,4 @@ tensorflow/core/framework/attr_value.pb.cc
tensorflow/core/framework/allocation_description.pb.cc
tensorflow/core/example/feature.pb.cc
tensorflow/core/example/example.pb.cc
tensorflow/core/grappler/costs/op_performance_data.pb.cc

View File

@ -10,6 +10,7 @@ tensorflow/core/protobuf/meta_graph.pb.h
tensorflow/core/protobuf/cluster.pb.h
tensorflow/core/protobuf/config.pb.h
tensorflow/core/protobuf/debug.pb.h
tensorflow/core/protobuf/device_properties.pb.h
tensorflow/core/protobuf/rewriter_config.pb.h
tensorflow/core/protobuf/tensor_bundle.pb.h
tensorflow/core/lib/core/error_codes.pb.h
@ -37,3 +38,4 @@ tensorflow/core/framework/attr_value.pb.h
tensorflow/core/framework/allocation_description.pb.h
tensorflow/core/example/feature.pb.h
tensorflow/core/example/example.pb.h
tensorflow/core/grappler/costs/op_performance_data.pb.h

View File

@ -10,6 +10,7 @@ tensorflow/core/protobuf/meta_graph.proto
tensorflow/core/protobuf/cluster.proto
tensorflow/core/protobuf/config.proto
tensorflow/core/protobuf/debug.proto
tensorflow/core/protobuf/device_properties.proto
tensorflow/core/protobuf/rewriter_config.proto
tensorflow/core/protobuf/tensor_bundle.proto
tensorflow/core/lib/core/error_codes.proto

View File

@ -8,11 +8,13 @@ exports_files(["LICENSE"])
package(default_visibility = ["//tensorflow:__subpackages__"])
load("//tensorflow:tensorflow.bzl", "py_test")
load("//tensorflow:tensorflow.bzl", "tf_py_test")
py_library(
name = "opt_py",
srcs = [
"__init__.py",
"python/training/drop_stale_gradient_optimizer.py",
"python/training/external_optimizer.py",
"python/training/lazy_adam_optimizer.py",
"python/training/moving_average_optimizer.py",
@ -104,6 +106,22 @@ py_test(
],
)
tf_py_test(
name = "drop_stale_gradient_optimizer_test",
srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
additional_deps = [
":opt_py",
"//third_party/py/numpy",
"//tensorflow/python:client",
"//tensorflow/python:client_testlib",
"//tensorflow/python:dtypes",
"//tensorflow/python:framework_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:training",
"//tensorflow/python:variables",
],
)
filegroup(
name = "all_files",
srcs = glob(

View File

@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=wildcard-import
from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
from tensorflow.contrib.opt.python.training.external_optimizer import *
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
@ -27,7 +28,8 @@ from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = ['ExternalOptimizerInterface',
_allowed_symbols = ['DropStaleGradientOptimizer',
'ExternalOptimizerInterface',
'LazyAdamOptimizer',
'MovingAverageOptimizer',
'ScipyOptimizerInterface',

View File

@ -0,0 +1,112 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper optimizer for checking and dropping stale gradients."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import gen_math_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.summary import summary
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_util
class DropStaleGradientOptimizer(optimizer.Optimizer):
"""Wrapper optimizer that checks and drops stale gradient.
This optimizer records the global step for each worker before computing
gradients and compares it with the global step at the time of applying the
gradients. If the difference is larger than a threshold, it will drop all
the computed gradients.
"""
def __init__(self,
opt,
staleness,
use_locking=False,
name="DropStaleGradient"):
"""Constructs a new DropStaleGradientOptimizer.
Args:
opt: The actual optimizer that will be used to compute and apply the
gradients. Must be one of the Optimizer classes.
staleness: The maximum staleness allowed for the optimizer.
use_locking: If `True` use locks for clip update operations.
name: Optional name prefix for the operations created when applying
gradients. Defaults to "DropStaleGradient".
"""
super(DropStaleGradientOptimizer, self).__init__(use_locking, name)
self._opt = opt
self._staleness = staleness
def compute_gradients(self, loss, *args, **kwargs):
# Record current global step for worker.
with ops.colocate_with(loss):
self._local_step = training_util.get_global_step() + 0
with ops.control_dependencies([self._local_step]):
loss = gen_array_ops.identity(loss)
return self._opt.compute_gradients(loss, *args, **kwargs)
def get_slot(self, *args, **kwargs):
return self._opt.get_slot(*args, **kwargs)
def get_slot_names(self, *args, **kwargs):
return self._opt.get_slot_names(*args, **kwargs)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
gradients = []
# Number of stale gradients.
stale_counter = variable_scope.get_variable(
"stale_counter", [],
initializer=init_ops.zeros_initializer(),
trainable=False)
def _AcceptGradientOp():
with ops.control_dependencies(
[self._opt.apply_gradients(
grads_and_vars, global_step=global_step, name=name)]):
return gen_array_ops.identity(0.0)
def _DropGradientOp():
return gen_array_ops.identity(1.0)
for grad_and_var in grads_and_vars:
grad = grad_and_var[0]
if isinstance(grad, ops.Tensor):
gradients.append(grad)
else:
gradients.append(grad.op)
with ops.control_dependencies(gradients), ops.colocate_with(global_step):
staleness = gen_array_ops.reshape(
global_step - self._local_step, shape=())
conditional_update = stale_counter.assign_add(control_flow_ops.cond(
gen_math_ops.less_equal(staleness, self._staleness),
_AcceptGradientOp, _DropGradientOp))
summary.scalar(
"Gradient staleness percentage",
stale_counter / (math_ops.cast(global_step + 1, dtypes.float32)))
return conditional_update

View File

@ -0,0 +1,297 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for DropStaleGradientOptimizer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import portpicker
from tensorflow.contrib.opt.python.training import drop_stale_gradient_optimizer
from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import gradient_descent
from tensorflow.python.training import server_lib
from tensorflow.python.training import training_util
# Creates the workers and return their sessions, graphs, train_ops.
def _get_workers(num_workers, staleness):
worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
cluster_dict = {
'worker': ['localhost:%s' % port for port in worker_ports],
'ps': ['localhost:%s' % portpicker.pick_unused_port()]
}
cs = server_lib.ClusterSpec(cluster_dict)
workers = [
server_lib.Server(
cs, job_name='worker', task_index=ix, start=True)
for ix in range(num_workers)
]
server_lib.Server(cs, job_name='ps', task_index=0, start=True)
sessions = []
graphs = []
train_ops = []
# To simulate stale cases, maintaining two queues for computing and
# applying gradients respectively. In the phase of computing gradients,
# all workers except chief worker compute gradients together and chief worker
# computes after all other worers' computing finished. In the phase of
# applying gradients, chief worker will first apply gradients, then all other
# workers will apply gradients one by one. Therefore, the chief worker will
# always have 0 staleness, each of all other workers will have a unique
# staleness value from [1, num_workers).
for worker_id in range(num_workers):
graph = ops.Graph()
with graph.as_default():
global_step = training_util.create_global_step()
var_0 = variables.Variable(0.0, name='v0')
var_1 = variables.Variable(1.0, name='v1')
compute_gradients_queue = data_flow_ops.FIFOQueue(
-1, global_step.dtype.base_dtype, shapes=(),
name='compute_gradients_queue', shared_name='compute_gradients_queue')
apply_gradients_queue = data_flow_ops.FIFOQueue(
-1, global_step.dtype.base_dtype, shapes=(),
name='apply_gradients_queue', shared_name='apply_gradients_queue')
# Gradients for loss on var_0 and var_1 will be 1.0.
loss = 0 - var_0 - var_1
sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
stale_check_opt = (
drop_stale_gradient_optimizer.DropStaleGradientOptimizer(
sgd_opt, staleness))
# Compute gradients.
if worker_id == 0:
with ops.control_dependencies(
[compute_gradients_queue.dequeue_many(num_workers - 1)]):
grad_and_vars = stale_check_opt.compute_gradients(loss)
else:
grad_and_vars = stale_check_opt.compute_gradients(loss)
with ops.control_dependencies([t[0] for t in grad_and_vars]):
worker_enqueue_op = compute_gradients_queue.enqueue(global_step)
# Apply gradients.
if worker_id == 0:
with ops.control_dependencies(
[stale_check_opt.apply_gradients(grad_and_vars, global_step)]):
train_op = apply_gradients_queue.enqueue(global_step)
else:
with ops.control_dependencies([worker_enqueue_op]):
with ops.control_dependencies([apply_gradients_queue.dequeue()]):
with ops.control_dependencies(
[stale_check_opt.apply_gradients(
grad_and_vars, global_step)]):
train_op = apply_gradients_queue.enqueue(global_step)
sess = session.Session(workers[worker_id].target)
sessions.append(sess)
graphs.append(graph)
train_ops.append(train_op)
return sessions, graphs, train_ops
class DropStaleGradientOptimizerTest(test.TestCase):
def _run(self, train_op, sess):
sess.run(train_op)
def test1Worker(self):
num_workers = 1
sessions, graphs, train_ops = _get_workers(num_workers, 0)
with graphs[0].as_default():
sessions[0].run(variables.global_variables_initializer())
global_step = training_util.get_global_step(graphs[0])
var_0 = graphs[0].get_tensor_by_name('v0:0')
var_1 = graphs[0].get_tensor_by_name('v1:0')
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
# Verify the initialized value.
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0, sessions[0].run(global_step))
sessions[0].run(train_ops[0])
# Verify the updated value after 1 step.
self.assertAllEqual(1, sessions[0].run(global_step))
self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
self.assertAllEqual(1, sessions[0].run(global_step))
def test1WorkerNegativeStaleness(self):
num_workers = 1
sessions, graphs, train_ops = _get_workers(num_workers, -1)
with graphs[0].as_default():
sessions[0].run(variables.global_variables_initializer())
global_step = training_util.get_global_step(graphs[0])
var_0 = graphs[0].get_tensor_by_name('v0:0')
var_1 = graphs[0].get_tensor_by_name('v1:0')
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
# Verify the initialized value.
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0, sessions[0].run(global_step))
sessions[0].run(train_ops[0])
# Verify no updates because max staleness is negative.
self.assertAllEqual(0, sessions[0].run(global_step))
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
def test2WorkersStaleness0(self):
num_workers = 2
sessions, graphs, train_ops = _get_workers(num_workers, 0)
with graphs[0].as_default():
sessions[0].run(variables.global_variables_initializer())
global_step = training_util.get_global_step(graphs[0])
var_0 = graphs[0].get_tensor_by_name('v0:0')
var_1 = graphs[0].get_tensor_by_name('v1:0')
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
# Verify the initialized value.
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0, sessions[0].run(global_step))
thread_0 = self.checkedThread(
target=self._run, args=(train_ops[0], sessions[0]))
thread_1 = self.checkedThread(
target=self._run, args=(train_ops[1], sessions[1]))
thread_0.start()
thread_1.start()
thread_0.join()
thread_1.join()
# With 2 workers and max staleness set to 0, only cheif worker will update
# var_0 and var_1.
self.assertAllEqual(1, sessions[0].run(global_step))
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
def test2WorkersStaleness1(self):
num_workers = 2
sessions, graphs, train_ops = _get_workers(num_workers, 1)
with graphs[0].as_default():
sessions[0].run(variables.global_variables_initializer())
global_step = training_util.get_global_step(graphs[0])
var_0 = graphs[0].get_tensor_by_name('v0:0')
var_1 = graphs[0].get_tensor_by_name('v1:0')
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
# Verify the initialized value.
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0, sessions[0].run(global_step))
thread_0 = self.checkedThread(
target=self._run, args=(train_ops[0], sessions[0]))
thread_1 = self.checkedThread(
target=self._run, args=(train_ops[1], sessions[1]))
thread_0.start()
thread_1.start()
thread_0.join()
thread_1.join()
# With 2 workers and max staleness set to 1, both workers will update
# var_0 and var_1.
self.assertAllEqual(2, sessions[0].run(global_step))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
def test3WorkersStaleness0(self):
num_workers = 3
sessions, graphs, train_ops = _get_workers(num_workers, 0)
with graphs[0].as_default():
sessions[0].run(variables.global_variables_initializer())
global_step = training_util.get_global_step(graphs[0])
var_0 = graphs[0].get_tensor_by_name('v0:0')
var_1 = graphs[0].get_tensor_by_name('v1:0')
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
# Verify the initialized value.
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0, sessions[0].run(global_step))
thread_0 = self.checkedThread(
target=self._run, args=(train_ops[0], sessions[0]))
thread_1 = self.checkedThread(
target=self._run, args=(train_ops[1], sessions[1]))
thread_2 = self.checkedThread(
target=self._run, args=(train_ops[2], sessions[2]))
thread_0.start()
thread_1.start()
thread_2.start()
thread_0.join()
thread_1.join()
thread_2.join()
# With 3 workers and max staleness set to 0, only cheif worker will update
# var_0 and var_1.
self.assertAllEqual(1, sessions[0].run(global_step))
self.assertAllEqual(2.0, sessions[0].run(stale_counter))
self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
def test3WorkersStaleness1(self):
num_workers = 3
sessions, graphs, train_ops = _get_workers(num_workers, 1)
with graphs[0].as_default():
sessions[0].run(variables.global_variables_initializer())
global_step = training_util.get_global_step(graphs[0])
var_0 = graphs[0].get_tensor_by_name('v0:0')
var_1 = graphs[0].get_tensor_by_name('v1:0')
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
# Verify the initialized value.
self.assertAllEqual(0.0, sessions[0].run(var_0))
self.assertAllEqual(1.0, sessions[0].run(var_1))
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
self.assertAllEqual(0, sessions[0].run(global_step))
thread_0 = self.checkedThread(
target=self._run, args=(train_ops[0], sessions[0]))
thread_1 = self.checkedThread(
target=self._run, args=(train_ops[1], sessions[1]))
thread_2 = self.checkedThread(
target=self._run, args=(train_ops[2], sessions[2]))
thread_0.start()
thread_1.start()
thread_2.start()
thread_0.join()
thread_1.join()
thread_2.join()
# With 3 workers and max staleness set to 1, chief worker and only one of
# the two other workers will update var_0 and var_1.
self.assertAllEqual(2, sessions[0].run(global_step))
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
if __name__ == '__main__':
test.main()

View File

@ -849,14 +849,12 @@ class RNNCellTest(test.TestCase):
batch_size = 3
input_size = 4
expected_state_c = np.array(
[[2.954548e-01, 8.354891e-04],
[2.834632e-01, 8.158963e-01],
[2.291694e-01, 1.325745e-04]],
[[0.00072015, 0.00036633], [0.00083481, 0.00047266],
[0.00085111, 0.00053054]],
dtype=np.float32)
expected_state_h = np.array(
[[2.116566e-01, 5.985238e-04],
[2.137760e-01, 6.153145e-01],
[1.742966e-01, 1.008306e-04]],
[[0.0005159, 0.00026243], [0.00062958, 0.00035646],
[0.00064732, 0.00040351]],
dtype=np.float32)
with variable_scope.variable_scope(
"root", initializer=init_ops.constant_initializer(0.5)):

View File

@ -11,7 +11,12 @@ Consultants: Jon Shlens, Pete Warden
1. Measure model parameters, float operations, tensor shapes.
2. Measure op execution times, requested memory size and device placement.
3. Inspect checkpoint tensors' shapes and their values.
4. Explore model based on name scope or graph structure.
4. 3 ways to view and explore TensorFlow model profiles
* Organize by Python code call stack.
* Organize by TensorFlow operation name scope hierarchies.
* Organize by TensorFlow operation inputs/outputs graph.
5. Selectively grouping/filtering/accounting/ordering ops.
tfprof can be used as Python API, Interactive CLI and One-shot Script.
@ -28,7 +33,8 @@ param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
tfprof_options=tf.contrib.tfprof.model_analyzer.
TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
# It organize the statistics
# of each graph node in tree scructure. Let's print the root below.
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
```

View File

@ -21,16 +21,34 @@ py_test(
name = "model_analyzer_test",
srcs = ["model_analyzer_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":model_analyzer",
"//tensorflow/core:protos_all_py",
"//tensorflow/python:array_ops",
":model_analyzer_testlib",
"//tensorflow/python:client",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:platform",
"//tensorflow/python:variables",
],
)
py_library(
name = "model_analyzer_testlib",
srcs = ["model_analyzer_testlib.py"],
srcs_version = "PY2AND3",
deps = [
":model_analyzer",
"//tensorflow/contrib/rnn:rnn_py",
"//tensorflow/core:protos_all_py",
"//tensorflow/python:array_ops",
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:init_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:nn_ops",
"//tensorflow/python:platform",
"//tensorflow/python:rnn",
"//tensorflow/python:training",
"//tensorflow/python:variable_scope",
"//tensorflow/python:variables",
],

View File

@ -123,7 +123,7 @@ def print_model_analysis(graph,
"""Print model statistics.
Prints the model statistics to stdout. Also returns the results
in a TFProfNode proto. See go/tfprof or run tfprof tool:
in a TFGraphNodeProto proto. See go/tfprof or run tfprof tool:
'bazel run third_party/tensorflow/tools/tfprof help'
Examples:
@ -142,15 +142,19 @@ def print_model_analysis(graph,
'micros' and 'bytes'.
op_log: tensorflow::tfprof::OpLog proto. users can use this proto to
group together ops and use a op_type to select the group.
tfprof_cmd: string. Either 'scope' or 'graph'. 'scope' view organize
ops using their name scopes. 'graph' view organize ops using
their graph inputs.
tfprof_cmd: string. Either 'scope', 'graph', 'code'.
'scope' view organize outputs using ops' name scope.
'graph' view organize outputs using op's inputs/outputs.
'code' view organize outputs using Python call stack.
tfprof_options: See 'tfprof help' for details.
Returns:
TFProfNode proto. Side effect: a formatted output to stdout.
If tfprof_cmd is 'scope' or 'graph', returns TFGraphNodeProto proto.
If tfprof_cmd is 'code', returns TFCodeNodeProto proto.
Side effect: a formatted output to stdout.
"""
# pylint: disable=protected-access
op_log = tfprof_logger._merge_default_with_oplog(graph, op_log, run_meta)
op_log = tfprof_logger._merge_default_with_oplog(
graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
# pylint: enable=protected-access
opts = tfprof_options_pb2.OptionsProto()
opts.max_depth = tfprof_options['max_depth']
@ -178,11 +182,24 @@ def print_model_analysis(graph,
opts.dump_to_file = tfprof_options['dump_to_file']
run_meta_str = run_meta.SerializeToString() if run_meta else b''
op_log_str = op_log.SerializeToString() if op_log else b''
tfprof_node = tfprof_output_pb2.TFProfNode()
tfprof_node.ParseFromString(
print_mdl.PrintModelAnalysis(
graph.as_graph_def().SerializeToString(), run_meta_str, op_log_str,
tfprof_cmd.encode('utf-8'), opts.SerializeToString()))
if tfprof_cmd == 'code':
tfprof_node = tfprof_output_pb2.TFCodeNodeProto()
tfprof_node.ParseFromString(
print_mdl.PrintModelAnalysis(
graph.as_graph_def().SerializeToString(),
run_meta_str,
op_log.SerializeToString(),
tfprof_cmd.encode('utf-8'),
opts.SerializeToString()))
else:
tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
tfprof_node.ParseFromString(
print_mdl.PrintModelAnalysis(
graph.as_graph_def().SerializeToString(),
run_meta_str,
op_log.SerializeToString(),
tfprof_cmd.encode('utf-8'),
opts.SerializeToString()))
return tfprof_node

View File

@ -18,49 +18,27 @@ from __future__ import division
from __future__ import print_function
import os
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
# XXX: this depends on pywrap_tensorflow and must come later
from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer_testlib as lib
class PrintModelAnalysisTest(test.TestCase):
def _BuildSmallModel(self):
image = array_ops.zeros([2, 6, 6, 3])
_ = variable_scope.get_variable(
'ScalarW', [],
dtypes.float32,
initializer=init_ops.random_normal_initializer(stddev=0.001))
kernel = variable_scope.get_variable(
'DW', [3, 3, 3, 6],
dtypes.float32,
initializer=init_ops.random_normal_initializer(stddev=0.001))
x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
kernel = variable_scope.get_variable(
'DW2', [2, 2, 6, 12],
dtypes.float32,
initializer=init_ops.random_normal_initializer(stddev=0.001))
x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
return x
def testDumpToFile(self):
ops.reset_default_graph()
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
with session.Session() as sess, ops.device('/cpu:0'):
_ = self._BuildSmallModel()
_ = lib.BuildSmallModel()
model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)
with gfile.Open(opts['dump_to_file'], 'r') as f:
@ -71,6 +49,7 @@ class PrintModelAnalysisTest(test.TestCase):
f.read())
def testSelectEverything(self):
ops.reset_default_graph()
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
opts['account_type_regexes'] = ['.*']
@ -78,8 +57,10 @@ class PrintModelAnalysisTest(test.TestCase):
'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types'
]
with session.Session() as sess, ops.device('/cpu:0'):
x = self._BuildSmallModel()
config = config_pb2.ConfigProto(
graph_options=config_pb2.GraphOptions(build_cost_model=1))
with session.Session(config=config) as sess, ops.device('/cpu:0'):
x = lib.BuildSmallModel()
sess.run(variables.global_variables_initializer())
run_meta = config_pb2.RunMetadata()
@ -98,6 +79,118 @@ class PrintModelAnalysisTest(test.TestCase):
f.read())
# pylint: enable=line-too-long
def testSimpleCodeView(self):
ops.reset_default_graph()
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
opts['account_type_regexes'] = ['.*']
opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
opts['account_displayed_op_only'] = False
# TODO(xpan): Test 'micros'. Since the execution time changes each run,
# it's a bit difficult to test it now.
opts['select'] = [
'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
]
config = config_pb2.ConfigProto(
graph_options=config_pb2.GraphOptions(build_cost_model=1))
with session.Session(config=config) as sess, ops.device('/cpu:0'):
x = lib.BuildSmallModel()
sess.run(variables.global_variables_initializer())
run_meta = config_pb2.RunMetadata()
_ = sess.run(x,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE),
run_metadata=run_meta)
model_analyzer.print_model_analysis(
sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
with gfile.Open(opts['dump_to_file'], 'r') as f:
# pylint: disable=line-too-long
self.assertEqual(
'_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB)\n model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops, 0B/864B)\n model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/1 params, 0/0 flops, 0B/0B)\n model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/162 params, 0/0 flops, 0B/1.30KB)\n model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/5.83k flops, 0B/432B)\n model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/288 params, 0/0 flops, 0B/2.30KB)\n model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/4.61k flops, 0B/384B)\n',
f.read())
# pylint: enable=line-too-long
def testComplexCodeView(self):
ops.reset_default_graph()
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
opts['account_type_regexes'] = ['.*']
opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
opts['account_displayed_op_only'] = False
opts['select'] = ['params', 'float_ops']
config = config_pb2.ConfigProto(
graph_options=config_pb2.GraphOptions(build_cost_model=1))
with session.Session(config=config) as sess, ops.device('/cpu:0'):
x = lib.BuildFullModel()
sess.run(variables.global_variables_initializer())
run_meta = config_pb2.RunMetadata()
_ = sess.run(x,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE),
run_metadata=run_meta)
tfprof_node = model_analyzer.print_model_analysis(
sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
# pylint: disable=line-too-long
with gfile.Open(opts['dump_to_file'], 'r') as f:
self.assertEqual(
'_TFProfRoot (0/2.84k params, 0/54.08k flops)\n model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_... (0/1.80k params, 0/41.76k flops)\n model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/4 params, 0/0 flops)\n model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/648 params, 0/0 flops)\n model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/23.33k flops)\n model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/1.15k params, 0/0 flops)\n model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/18.43k flops)\n model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c... (0/1.04k params, 0/4.13k flops)\n model_analyzer_testlib.py:62:BuildFullModel:target = array_op... (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_... (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min... (0/0 params, 0/8.19k flops)\n',
f.read())
self.assertLess(0, tfprof_node.total_exec_micros)
self.assertEqual(2844, tfprof_node.total_parameters)
self.assertEqual(54080, tfprof_node.total_float_ops)
self.assertEqual(5, len(tfprof_node.children))
self.assertEqual('_TFProfRoot', tfprof_node.name)
self.assertEqual('model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_...',
tfprof_node.children[0].name)
self.assertEqual('model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c...',
tfprof_node.children[1].name)
self.assertEqual('model_analyzer_testlib.py:62:BuildFullModel:target = array_op...',
tfprof_node.children[2].name)
self.assertEqual('model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_...',
tfprof_node.children[3].name)
self.assertEqual('model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min...',
tfprof_node.children[4].name)
# pylint: enable=line-too-long
def testCodeViewLeafGraphNode(self):
ops.reset_default_graph()
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['account_type_regexes'] = ['.*']
opts['account_displayed_op_only'] = False
opts['select'] = [
'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device'
]
config = config_pb2.ConfigProto(
graph_options=config_pb2.GraphOptions(build_cost_model=1))
with session.Session(config=config) as sess, ops.device('/cpu:0'):
x = lib.BuildSmallModel()
sess.run(variables.global_variables_initializer())
run_meta = config_pb2.RunMetadata()
_ = sess.run(x,
options=config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE),
run_metadata=run_meta)
tfprof_node = model_analyzer.print_model_analysis(
sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
leaf = tfprof_node
while leaf.children:
self.assertEqual(0, len(leaf.graph_nodes))
leaf = leaf.children[0]
self.assertEqual(1, len(leaf.graph_nodes))
if __name__ == '__main__':
test.main()

View File

@ -0,0 +1,67 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A test lib that defines some models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib.rnn.python.ops.core_rnn_cell import BasicRNNCell
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import gradient_descent
def BuildSmallModel():
"""Build a small forward conv model."""
image = array_ops.zeros([2, 6, 6, 3])
_ = variable_scope.get_variable(
'ScalarW', [],
dtypes.float32,
initializer=init_ops.random_normal_initializer(stddev=0.001))
kernel = variable_scope.get_variable(
'DW', [3, 3, 3, 6],
dtypes.float32,
initializer=init_ops.random_normal_initializer(stddev=0.001))
x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
kernel = variable_scope.get_variable(
'DW2', [2, 2, 6, 12],
dtypes.float32,
initializer=init_ops.random_normal_initializer(stddev=0.001))
x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
return x
def BuildFullModel():
"""Build the full model with conv,rnn,opt."""
seq = []
for i in range(4):
with variable_scope.variable_scope('inp_%d' % i):
seq.append(array_ops.reshape(BuildSmallModel(), [2, 1, -1]))
cell = BasicRNNCell(16, 48)
out = rnn.dynamic_rnn(
cell, array_ops.concat(seq, axis=1), dtype=dtypes.float32)[0]
target = array_ops.ones_like(out)
loss = nn_ops.l2_loss(math_ops.reduce_mean(target - out))
sgd_op = gradient_descent.GradientDescentOptimizer(1e-2)
return sgd_op.minimize(loss)

View File

@ -96,12 +96,13 @@ class PrintModelAnalysisTest(test.TestCase):
with session.Session() as sess, ops.device('/cpu:0'):
_ = self._BuildSmallModel()
tfprof_pb = tfprof_output_pb2.TFProfNode()
tfprof_pb = tfprof_output_pb2.TFGraphNodeProto()
tfprof_pb.ParseFromString(
print_mdl.PrintModelAnalysis(sess.graph.as_graph_def(
).SerializeToString(), b'', b'', b'scope', opts.SerializeToString()))
print_mdl.PrintModelAnalysis(
sess.graph.as_graph_def().SerializeToString(),
b'', b'', b'scope', opts.SerializeToString()))
expected_pb = tfprof_output_pb2.TFProfNode()
expected_pb = tfprof_output_pb2.TFGraphNodeProto()
text_format.Merge(r"""name: "_TFProfRoot"
exec_micros: 0
requested_bytes: 0

View File

@ -62,12 +62,13 @@ def _fill_missing_graph_shape(graph, run_meta):
return graph
def _get_logged_ops(graph, run_meta=None):
def _get_logged_ops(graph, run_meta=None, add_trace=False):
"""Extract trainable model parameters and FLOPs for ops from a Graph.
Args:
graph: tf.Graph.
run_meta: RunMetadata proto used to complete shape information.
add_trace: Whether to add op trace information.
Returns:
logged_ops: dict mapping from op_name to OpLogEntry.
"""
@ -76,21 +77,32 @@ def _get_logged_ops(graph, run_meta=None):
op_missing_shape = 0
logged_ops = {}
graph_def = graph.as_graph_def()
for node in graph_def.node:
for op in graph.get_operations():
try:
stats = ops.get_stats_for_node_def(graph, node, REGISTERED_FLOP_STATS)
stats = ops.get_stats_for_node_def(
graph, op.node_def, REGISTERED_FLOP_STATS)
except ValueError:
# Catch Exception When shape is incomplete. Skip it.
op_missing_shape += 1
stats = None
if not stats or not stats.value:
continue
if node.name not in logged_ops:
entry = tfprof_log_pb2.OpLogEntry()
entry.name = node.name
entry = tfprof_log_pb2.OpLogEntry()
entry.name = op.name
add_entry = False
if stats and stats.value:
entry.float_ops = int(stats.value)
add_entry = True
if add_trace:
for tb in op.traceback:
trace = entry.code_def.traces.add()
trace.file = tb[0] if tb[0] else 'none'
trace.lineno = tb[1] if tb[1] else -1
trace.function = tb[2] if tb[2] else 'none'
trace.line = tb[3] if tb[3] else 'none'
add_entry = True
if add_entry:
logged_ops[entry.name] = entry
for v in graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES):
@ -108,18 +120,21 @@ def _get_logged_ops(graph, run_meta=None):
return logged_ops
def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
def _merge_default_with_oplog(graph, op_log=None,
run_meta=None,
add_trace=False):
"""Merge the tfprof default extra info with caller's op_log.
Args:
graph: tf.Graph.
op_log: OpLog proto.
run_meta: RunMetadata proto used to complete shape information.
add_trace: Whether to add op trace information.
Returns:
tmp_op_log: Merged OpLog proto.
"""
tmp_op_log = tfprof_log_pb2.OpLog()
logged_ops = _get_logged_ops(graph, run_meta)
logged_ops = _get_logged_ops(graph, run_meta, add_trace=add_trace)
if not op_log:
tmp_op_log.log_entries.extend(logged_ops.values())
else:
@ -131,13 +146,16 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
all_ops[op_name].types.extend(entry.types)
if entry.float_ops > 0 and all_ops[op_name].float_ops == 0:
all_ops[op_name].float_ops = entry.float_ops
if entry.code_def.traces and not all_ops[op_name].code_def.traces:
all_ops[op_name].code_def.MergeFrom(entry.code_def)
else:
all_ops[op_name] = entry
tmp_op_log.log_entries.extend(all_ops.values())
return tmp_op_log
def write_op_log(graph, log_dir, op_log=None, run_meta=None):
def write_op_log(graph, log_dir, op_log=None, run_meta=None,
add_trace=False):
"""Log provided 'op_log', and add additional model information below.
The API also assigns ops in tf.trainable_variables() an op type called
@ -154,8 +172,9 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None):
one is created.
run_meta: (Optional) RunMetadata proto that helps flops computation using
run time shape information.
add_trace: Whether to add op trace information. Used to support "code" view.
"""
op_log = _merge_default_with_oplog(graph, op_log, run_meta)
op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
log.write(op_log.SerializeToString())

View File

@ -156,6 +156,7 @@ CORE_PROTO_SRCS = [
"protobuf/config.proto",
"protobuf/cluster.proto",
"protobuf/debug.proto",
"protobuf/device_properties.proto",
"protobuf/queue_runner.proto",
"protobuf/rewriter_config.proto",
"protobuf/tensor_bundle.proto",

View File

@ -829,7 +829,8 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
// Given a "caller" in "graph", which is a function call of a function
// to "fbody". Replaces the "caller" with fbody->graph and connects
// edges properly.
static void InlineFunctionBody(Graph* g, Node* caller,
static void InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
Graph* g, Node* caller,
const FunctionBody* fbody) {
if (!ValidateInlining(caller, fbody)) {
LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. "
@ -837,6 +838,23 @@ static void InlineFunctionBody(Graph* g, Node* caller,
return;
}
// Input edges. For data edges coming into "caller", we first compute the
// <src>:<src_output> for the i-th input in "inputs".
// If "caller" has any input control dependencies, we add a NoOp
// node "input_control_node", which depends on "caller"'s control inputs.
std::vector<Endpoint> inputs(caller->num_inputs());
Node* input_control_node = nullptr;
for (const Edge* e : caller->in_edges()) {
if (e->IsControlEdge()) {
if (input_control_node == nullptr) {
input_control_node = AddNoOp(g);
}
g->AddControlEdge(e->src(), input_control_node);
} else {
inputs[e->dst_input()] = {e->src(), e->src_output()};
}
}
// Duplicate fbody->graph into 'g'. First, we copy the nodes of
// fbody->graph into 'g' except the source and sink nodes. We copy
// edges among nodes in 'fbody->graph'.
@ -850,8 +868,35 @@ static void InlineFunctionBody(Graph* g, Node* caller,
CHECK(n->IsOp());
NodeDef ndef = n->def();
ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
node_map[n->id()] = g->AddNode(ndef, &s);
Node* clone = g->AddNode(ndef, &s);
TF_CHECK_OK(s);
node_map[n->id()] = clone;
// If there is an input control node, and one of:
// a) the node has no data or control inputs, or
// b) the node is a function call or SymbolicGradient,
// then add a control edge from the input control node to the clone.
//
// We must not execute any nodes if the original function call would not
// have executed. This is especially critical when the function call is
// inside a control-flow construct like tf.cond(). Case (a) ensures that
// such nodes do not run.
//
// The purpose of case (b) is to ensure that instances of case (a) created
// by further inlining steps also receive the control dependency.
if (input_control_node) {
bool has_inputs = false;
for (const Edge* e : n->in_edges()) {
if (!e->src()->IsSource()) {
has_inputs = true;
break;
}
}
if (!has_inputs || flib_def.Find(clone->type_string()) != nullptr ||
clone->type_string() == "SymbolicGradient") {
g->AddControlEdge(input_control_node, clone);
}
}
}
for (const Edge* e : fbody->graph->edges()) {
if (e->src()->IsSource() || e->src()->IsSink() || e->dst()->IsSource() ||
@ -865,29 +910,12 @@ static void InlineFunctionBody(Graph* g, Node* caller,
// Connect input edges.
//
// For data edges coming into "caller", we first compute the
// <src>:<src_output> for the i-th input in "inputs". We create one
// Identity node for each input. Then, we connect inputs[i] to to
// the i-th identity node added. The nodes that previously connects
// to the j-th output of i-th arg node are reconnected to th i-th
// We create one Identity node for each input. Then, we connect inputs[i] to
// the i-th identity node added. The nodes that previously connected
// to the j-th output of i-th arg node are reconnected to the i-th
// identity node.
//
// If "caller" has any input control dependencies, we add a NoOp
// node "input_control_node". This "input_control_node" depends on
// what "caller" depends on, and the added identity nodes depend on
// "input_control_node".
std::vector<Endpoint> inputs(caller->num_inputs());
Node* input_control_node = nullptr;
for (const Edge* e : caller->in_edges()) {
if (e->IsControlEdge()) {
if (input_control_node == nullptr) {
input_control_node = AddNoOp(g);
}
g->AddControlEdge(e->src(), input_control_node);
} else {
inputs[e->dst_input()] = {e->src(), e->src_output()};
}
}
// The added identity nodes depend on "input_control_node".
for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) {
Node* arg = node_map[fbody->arg_nodes[i]->id()];
Node* n = AddIdentity(g, inputs[i]);
@ -982,7 +1010,7 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
candidates.push_back({node, fbody});
}
for (const auto& p : candidates) {
InlineFunctionBody(graph, p.first, p.second);
InlineFunctionBody(*fld, graph, p.first, p.second);
}
return !candidates.empty();
}

View File

@ -391,6 +391,90 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
}
}
// Verifies that control dependencies on the caller are added as control
// dependencies on any function calls created by inlining.
TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
Init({test::function::XTimesTwo(), test::function::XTimesFour()});
std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
{
Scope s = Scope::NewRootScope();
TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
auto c = ops::NoOp(s.WithOpName("c"));
auto b = Call(&s, "b", "XTimesFour", {a});
s.graph()->AddControlEdge(c.operation.node(), b.node());
auto ret = ops::_Retval(s.WithOpName("b_RetVal"), b, 0);
TF_ASSERT_OK(s.ToGraph(g.get()));
}
ExpandInlineFunctions(lib_.get(), g.get());
{
Scope s = Scope::NewRootScope();
TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
auto c = ops::NoOp(s.WithOpName("c"));
auto func0 =
ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
auto func1 = ops::Identity(
s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
auto b_x2 = Call(&s, "b/x2", "XTimesTwo", {func1});
s.graph()->AddControlEdge(func0.operation.node(), b_x2.node());
auto b_y = Call(&s, "b/y", "XTimesTwo", {b_x2});
s.graph()->AddControlEdge(func0.operation.node(), b_y.node());
auto func2 = ops::Identity(s.WithOpName("Func/_2"), b_y);
auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
GraphDef expected;
TF_ASSERT_OK(s.ToGraphDef(&expected));
GraphDef actual;
g->ToGraphDef(&actual);
TF_EXPECT_GRAPH_EQ(expected, actual);
}
ExpandInlineFunctions(lib_.get(), g.get());
{
Scope s = Scope::NewRootScope();
TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
auto c = ops::NoOp(s.WithOpName("c"));
auto func0 =
ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
auto func1 = ops::Identity(
s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
auto func3 =
ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies({func0}));
auto func4 = ops::Identity(
s.WithOpName("Func/_4").WithControlDependencies({func3}), func1);
auto b_x2_two = ops::Const(
s.WithOpName("b/x2/two").WithControlDependencies({func3}), 2LL);
auto b_x2_scale = ops::Cast(s.WithOpName("b/x2/scale"), b_x2_two, DT_FLOAT);
auto b_x2_y = ops::Mul(s.WithOpName("b/x2/y"), func4, b_x2_scale);
auto func5 = ops::Identity(s.WithOpName("Func/_5"), b_x2_y);
auto func6 =
ops::NoOp(s.WithOpName("Func/_6").WithControlDependencies({func0}));
auto func7 = ops::Identity(
s.WithOpName("Func/_7").WithControlDependencies({func6}), func5);
auto b_y_two = ops::Const(
s.WithOpName("b/y/two").WithControlDependencies({func6}), 2LL);
auto b_y_scale = ops::Cast(s.WithOpName("b/y/scale"), b_y_two, DT_FLOAT);
auto b_y_y = ops::Mul(s.WithOpName("b/y/y"), func7, b_y_scale);
auto func8 = ops::Identity(s.WithOpName("Func/_8"), b_y_y);
auto func2 = ops::Identity(s.WithOpName("Func/_2"), func8);
auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
GraphDef expected;
TF_ASSERT_OK(s.ToGraphDef(&expected));
GraphDef actual;
g->ToGraphDef(&actual);
TF_EXPECT_GRAPH_EQ(expected, actual);
}
}
TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
Init({test::function::XTimesTwo(), test::function::XTimesFour(),
test::function::XTimes16()});

View File

@ -567,15 +567,14 @@ int64 MinSystemMemory(int64 available_memory) {
// We use the following heuristic for now:
//
// If the available_memory is < 2GiB, we allocate 200MiB to system memory.
// Otherwise, allocate 300MiB to system memory.
// Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
//
// In the future we could be more sophisticated by using a table of
// devices.
// In the future we could be more sophisticated by using a table of devices.
if (available_memory < (1LL << 31)) {
// 200MiB
return 209715200LL;
} else {
// max(300 MiB, 0.95 * available_memory)
// max(300 MiB, 0.05 * available_memory)
return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
}
}

View File

@ -60,6 +60,7 @@ cc_library(
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/grappler:utils",
"//tensorflow/core/grappler/costs:utils",
"//tensorflow/core/kernels:ops_util",
],
)

View File

@ -56,5 +56,15 @@ void Cluster::DisableDetailedStats(bool disable) {
}
}
const std::vector<string> Cluster::GetDeviceNames() const {
std::vector<string> device_names;
device_names.reserve(devices_.size());
for (const auto& device : devices_) {
device_names.push_back(device.first);
}
std::sort(device_names.begin(), device_names.end());
return device_names;
}
} // end namespace grappler
} // end namespace tensorflow

View File

@ -17,13 +17,14 @@ limitations under the License.
#define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/protobuf/device_properties.pb.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
@ -62,18 +63,14 @@ class Cluster {
// Return the list of TensorFlow devices that are available to execute a
// graph. This is empty until provision() is called.
const std::vector<DeviceAttributes>& GetDevices() const { return devices_; }
// Convenience method that returns the set of device names.
const std::vector<string> GetDeviceNames() const {
std::vector<string> device_names;
device_names.reserve(devices_.size());
for (const auto& device : devices_) {
device_names.push_back(device.name());
}
return device_names;
const std::unordered_map<string, DeviceProperties>& GetDevices() const {
return devices_;
}
// Convenience method that returns the set of device names. These names are
// sorted alphabetically.
const std::vector<string> GetDeviceNames() const;
// Prepare the session to run the specified grappler item. This include
// initializing all the model variables.
virtual Status Initialize(const GrapplerItem& item) = 0;
@ -85,7 +82,7 @@ class Cluster {
RunMetadata* metadata) = 0;
protected:
std::vector<DeviceAttributes> devices_;
std::unordered_map<string, DeviceProperties> devices_;
const int timeout_s_;
SessionOptions options_;
RunOptions run_options_;

View File

@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/cc/training/queue_runner.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/grappler/costs/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
@ -66,16 +67,12 @@ Status SingleMachine::Provision() {
return status;
}
DeviceAttributes attr;
attr.set_name("/job:localhost/replica:0/task:0/cpu:0");
attr.set_device_type("CPU");
devices_.push_back(attr);
DeviceProperties attr = GetLocalCPUInfo();
devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
for (int i = 0; i < num_gpus_; ++i) {
DeviceAttributes attr;
attr.set_name(strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i));
attr.set_device_type("GPU");
devices_.push_back(attr);
devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
GetLocalGPUInfo(i);
}
return Status::OK();
}

View File

@ -25,7 +25,9 @@ tf_proto_library(
name = "op_performance_data",
srcs = ["op_performance_data.proto"],
cc_api_version = 2,
protodeps = ["//tensorflow/core:protos_all"],
protodeps = [
"//tensorflow/core:protos_all",
],
visibility = ["//visibility:public"],
)
@ -141,10 +143,10 @@ cc_library(
hdrs = ["virtual_placer.h"],
visibility = ["//visibility:public"],
deps = [
":op_performance_data_cc",
":utils",
"//tensorflow/core:framework",
"//tensorflow/core:framework_lite",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/grappler:devices",
"//tensorflow/core/grappler/clusters:cluster",

View File

@ -73,7 +73,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
std::vector<OpInfo::TensorProperties> inputs =
properties.GetInputProperties(node->name());
OpInfo::DeviceProperties device = placer.get_device(*node);
DeviceProperties device = placer.get_device(*node);
OpInfo op_info;
op_info.set_op(node->op());
*op_info.mutable_attr() = node->attr();

View File

@ -69,7 +69,7 @@ Costs OpLevelCostEstimator::PredictCosts(const OpInfo& op_features) const {
}
std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
const OpInfo::DeviceProperties& device) const {
const DeviceProperties& device) const {
double gflops = -1;
double bandwidth = -1;
if (device.bandwidth() > 0) {
@ -77,7 +77,7 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
}
if (device.type() == "CPU") {
const OpInfo::DeviceProperties local_cpu = GetLocalCPUInfo();
const DeviceProperties local_cpu = GetLocalCPUInfo();
// Check if vector instructions are available, and refine performance
// prediction based on this.
// Frequencies are stored in MHz in the DeviceProperties.
@ -90,7 +90,7 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
}
}
} else if (device.type() == "GPU") {
const OpInfo::DeviceProperties local_gpu = GetLocalGPUInfo(0);
const DeviceProperties local_gpu = GetLocalGPUInfo(0);
const string architecture = local_gpu.environment().at("architecture");
int cores_per_multiprocessor;
if (architecture < "3") {

View File

@ -40,7 +40,7 @@ class OpLevelCostEstimator {
// executed per second) and memory bandwith (in GigaBytes/second) for the
// specified device.
virtual std::pair<double, double> GetDeviceInfo(
const OpInfo::DeviceProperties& device) const;
const DeviceProperties& device) const;
// For operations for which we haven't yet built estimates, returns a dummy
// value based on input size.

View File

@ -17,6 +17,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/protobuf/device_properties.pb.h"
namespace tensorflow {
namespace grappler {

View File

@ -22,6 +22,7 @@ import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/protobuf/device_properties.proto";
// Description of an operation as well as the parameters expected to impact its
// performance.
@ -41,36 +42,6 @@ message OpInfo {
repeated TensorProperties inputs = 3;
// Device on which the operation is run.
message DeviceProperties {
// Device type (CPU, GPU, ...)
string type = 1;
// Vendor (Intel, nvidia, ...)
string vendor = 2;
// Model (Haswell, K40, ...)
string model = 3;
// Core Frequency in Mhz
int64 frequency = 4;
// Number of cores
int64 num_cores = 5;
// Version of the tools and libraries used with this device (e.g. gcc 4.9,
// cudnn 5.1)
map<string, string> environment = 6;
// Number of registers per core.
int64 num_registers = 7;
// L1 cache size in bytes
int64 l1_cache_size = 8;
// L2 cache size in bytes
int64 l2_cache_size = 9;
// L3 cache size in bytes
int64 l3_cache_size = 10;
// Shared memory size per multiprocessor in bytes. This field is
// applicable to GPUs only.
int64 shared_memory_size_per_multiprocessor = 11;
// Memory size in bytes
int64 memory_size = 12;
// Memory bandwidth in KB/s
int64 bandwidth = 13;
}
DeviceProperties device = 4;
}

View File

@ -125,7 +125,7 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
return inputs;
}
OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
DeviceNameUtils::ParsedName parsed;
if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
if (parsed.type == "GPU") {
@ -134,13 +134,13 @@ OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
return GetLocalCPUInfo();
}
}
OpInfo::DeviceProperties device;
DeviceProperties device;
device.set_type("UNKNOWN");
return device;
}
OpInfo::DeviceProperties GetLocalCPUInfo() {
OpInfo::DeviceProperties device;
DeviceProperties GetLocalCPUInfo() {
DeviceProperties device;
device.set_type("CPU");
device.set_vendor(port::CPUVendorIDString());
@ -165,8 +165,8 @@ OpInfo::DeviceProperties GetLocalCPUInfo() {
return device;
}
OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id) {
OpInfo::DeviceProperties device;
DeviceProperties GetLocalGPUInfo(int gpu_id) {
DeviceProperties device;
device.set_type("GPU");
#if GOOGLE_CUDA

View File

@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/core/graph/types.h"
#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/device_properties.pb.h"
namespace tensorflow {
namespace grappler {
@ -40,14 +41,14 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
const std::unordered_map<string, const NodeDef*>& name_to_node);
// Returns the DeviceProperties of the device on which 'node' runs.
OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
// Returns the DeviceProperties of the CPU on which grappler is running.
OpInfo::DeviceProperties GetLocalCPUInfo();
DeviceProperties GetLocalCPUInfo();
// Returns the DeviceProperties for the specified GPU attached to the server on
// which grappler is running.
OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id);
DeviceProperties GetLocalGPUInfo(int gpu_id);
} // end namespace grappler
} // end namespace tensorflow

View File

@ -18,35 +18,48 @@ limitations under the License.
#include "tensorflow/core/grappler/clusters/cluster.h"
#include "tensorflow/core/grappler/costs/utils.h"
#include "tensorflow/core/grappler/devices.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/util/device_name_utils.h"
namespace tensorflow {
namespace grappler {
VirtualPlacer::VirtualPlacer(Cluster* cluster) : has_gpu_(false) {
devices_["CPU"] = GetLocalCPUInfo();
if (GetNumAvailableGPUs() > 0) {
has_gpu_ = true;
devices_["GPU"] = GetLocalGPUInfo(0);
devices_ = cluster->GetDevices();
for (const auto& device : cluster->GetDevices()) {
if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
has_gpu_ = true;
}
}
unknown_device_.set_type("UNKNOWN");
}
const OpInfo::DeviceProperties& VirtualPlacer::get_device(
const NodeDef& node) const {
string device_type;
const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
DeviceNameUtils::ParsedName parsed;
if (!node.device().empty() &&
DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
device_type = parsed.type;
} else {
if (has_gpu_) {
device_type = "GPU";
} else {
device_type = "CPU";
if (!node.device().empty()) {
auto it = devices_.find(node.device());
if (it != devices_.end()) {
return it->second;
}
if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
string device_name =
strings::StrCat("/job:localhost/replica:0/task:0/",
str_util::Lowercase(parsed.type), ":", parsed.id);
it = devices_.find(device_name);
if (it != devices_.end()) {
return it->second;
}
}
return unknown_device_;
}
auto it = devices_.find(device_type);
string device;
if (has_gpu_) {
device = "/job:localhost/replica:0/task:0/gpu:0";
} else {
device = "/job:localhost/replica:0/task:0/cpu:0";
}
auto it = devices_.find(device);
if (it == devices_.end()) {
return unknown_device_;
}

View File

@ -17,8 +17,8 @@ limitations under the License.
#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
#include <unordered_map>
#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/device_properties.pb.h"
namespace tensorflow {
class NodeDef;
@ -31,12 +31,12 @@ class VirtualPlacer {
public:
VirtualPlacer(Cluster* cluster);
const OpInfo::DeviceProperties& get_device(const NodeDef& node) const;
const DeviceProperties& get_device(const NodeDef& node) const;
private:
std::unordered_map<string, OpInfo::DeviceProperties> devices_;
std::unordered_map<string, DeviceProperties> devices_;
bool has_gpu_;
OpInfo::DeviceProperties unknown_device_;
DeviceProperties unknown_device_;
};
} // namespace grappler

View File

@ -38,9 +38,9 @@ using Eigen::GpuDevice;
// in NHWC format.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
__global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
const T* input, const T* filter,
T* output, int num_outputs) {
__global__ void __launch_bounds__(1024, 2)
DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
const T* filter, T* output, int num_outputs) {
const int in_rows = args.in_rows;
const int in_cols = args.in_cols;
const int in_depth = args.in_depth;
@ -120,9 +120,9 @@ __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
// in NCHW format.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
__global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
const T* input, const T* filter,
T* output, int num_outputs) {
__global__ void __launch_bounds__(1024, 2)
DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
const T* filter, T* output, int num_outputs) {
const int in_rows = args.in_rows;
const int in_cols = args.in_cols;
const int in_depth = args.in_depth;
@ -250,17 +250,34 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
TensorFormat data_format) {
const int num_outputs =
args.batch * args.out_rows * args.out_cols * args.out_depth;
CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d);
// The compile-time constant version runs faster with a single block.
const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
kKnownDepthMultiplier < 0 ||
args.out_rows * args.out_cols <= 256
? std::numeric_limits<int>::max()
: d.getNumCudaMultiProcessors();
if (data_format == FORMAT_NHWC) {
CudaLaunchConfig config = GetCudaLaunchConfig(
num_outputs, d,
DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
kKnownDepthMultiplier>,
0);
DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
kKnownDepthMultiplier>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
args, input, filter, output, num_outputs);
<<<std::min(max_block_count, config.block_count),
config.thread_per_block, 0, d.stream()>>>(args, input, filter,
output, num_outputs);
} else if (data_format == FORMAT_NCHW) {
CudaLaunchConfig config = GetCudaLaunchConfig(
num_outputs, d,
DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
kKnownDepthMultiplier>,
0);
DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
kKnownDepthMultiplier>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
args, input, filter, output, num_outputs);
<<<std::min(max_block_count, config.block_count),
config.thread_per_block, 0, d.stream()>>>(args, input, filter,
output, num_outputs);
} else {
assert(false);
}
@ -288,9 +305,11 @@ template struct DepthwiseConv2dGPULaunch<double>;
// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
__global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
const DepthwiseArgs args, const T* out_backprop, const T* filter,
T* in_backprop, int num_in_backprop) {
__global__ void __launch_bounds__(640, 2)
DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
const T* out_backprop,
const T* filter, T* in_backprop,
int num_in_backprop) {
const int in_rows = args.in_rows;
const int in_cols = args.in_cols;
const int in_depth = args.in_depth;
@ -350,7 +369,7 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
__global__ void __launch_bounds__(1024)
__global__ void __launch_bounds__(640, 2)
DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
const T* out_backprop,
const T* filter, T* in_backprop,
@ -428,17 +447,22 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
TensorFormat data_format) {
const int num_in_backprop =
args.batch * args.in_rows * args.in_cols * args.in_depth;
CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d);
// Increase block count for when there are more warps/SM than threads/SM.
// TODO(csigg): this is pretty arbitraty and should be generalized using
// cudaOccupancyMaxPotentialBlockSize().
config.block_count *= 4;
if (data_format == FORMAT_NHWC) {
CudaLaunchConfig config = GetCudaLaunchConfig(
num_in_backprop, d,
DepthwiseConv2dBackpropInputGPUKernelNHWC<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
0);
DepthwiseConv2dBackpropInputGPUKernelNHWC<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
args, out_backprop, filter, in_backprop, num_in_backprop);
} else if (data_format == FORMAT_NCHW) {
CudaLaunchConfig config = GetCudaLaunchConfig(
num_in_backprop, d,
DepthwiseConv2dBackpropInputGPUKernelNCHW<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
0);
DepthwiseConv2dBackpropInputGPUKernelNCHW<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
@ -475,9 +499,12 @@ template struct DepthwiseConv2dBackpropInputGPULaunch<double>;
// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
__global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
const DepthwiseArgs args, const T* out_backprop, const T* input,
T* filter_backprop, int num_out_backprop) {
__global__ void __launch_bounds__(640, 2)
DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
const T* out_backprop,
const T* input,
T* filter_backprop,
int num_out_backprop) {
const int in_rows = args.in_rows;
const int in_cols = args.in_cols;
const int in_depth = args.in_depth;
@ -566,9 +593,12 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
int kKnownDepthMultiplier>
__global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
const DepthwiseArgs args, const T* out_backprop, const T* input,
T* filter_backprop, int num_out_backprop) {
__global__ void __launch_bounds__(640, 2)
DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
const T* out_backprop,
const T* input,
T* filter_backprop,
int num_out_backprop) {
const int in_rows = args.in_rows;
const int in_cols = args.in_cols;
const int in_depth = args.in_depth;
@ -669,13 +699,22 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
TensorFormat data_format) {
const int num_out_backprop =
args.batch * args.out_rows * args.out_cols * args.out_depth;
CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d);
if (data_format == FORMAT_NHWC) {
CudaLaunchConfig config = GetCudaLaunchConfig(
num_out_backprop, d,
DepthwiseConv2dBackpropFilterGPUKernelNHWC<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
0);
DepthwiseConv2dBackpropFilterGPUKernelNHWC<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
args, out_backprop, input, filter_backprop, num_out_backprop);
} else if (data_format == FORMAT_NCHW) {
CudaLaunchConfig config = GetCudaLaunchConfig(
num_out_backprop, d,
DepthwiseConv2dBackpropFilterGPUKernelNCHW<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
0);
DepthwiseConv2dBackpropFilterGPUKernelNCHW<
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(

View File

@ -87,6 +87,12 @@ class SplitVOpBase : public OpKernel {
// Special case 1: num_split == 1. Nothing to do.
if (num_split == 1) {
context->set_output(0, context->input(0));
OP_REQUIRES(
context, (*split_sizes_vec)[0] == input_size_split_dim,
errors::InvalidArgument("If there is only one output, it must have "
"the same size as the input. Input size: ",
input_size_split_dim,
" output size: ", (*split_sizes_vec)[0]));
*done = true;
return;
}

View File

@ -127,6 +127,16 @@ class TemporaryVariableOp : public OpKernel {
OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
var_name_, tmp_var));
context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
if (context->track_allocations()) {
AllocatorAttributes attr;
if (context->allocate_on_host(attr)) {
context->record_host_persistent_memory_allocation(
tmp_var->val.AllocatedBytes());
} else {
context->record_device_persistent_memory_allocation(
tmp_var->val.AllocatedBytes());
}
}
}
private:

View File

@ -518,7 +518,17 @@ REGISTER_OP("SplitV")
} else if (rank == 0) {
// Throw error if input is a scalar.
return errors::InvalidArgument("Can't split scalars");
} else if (size_splits == nullptr || !c->ValueKnown(split_dimension)) {
} else if (size_splits == nullptr && c->ValueKnown(split_dimension)) {
// If split dimension is known, but the sizes are unknown, then
// only the split dimension is unknown
output_shape = input;
TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
c->Value(split_dimension),
c->UnknownDim(), &output_shape));
for (int i = 0; i < num_outputs; ++i) {
c->set_output(i, output_shape);
}
} else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
// If split dimension or tensor containing the split sizes is unknown,
// then return unknown shapes of same rank as input.
output_shape = c->UnknownShapeOfRank(rank);
@ -540,12 +550,37 @@ REGISTER_OP("SplitV")
return errors::InvalidArgument(
"Length of size_splits should be equal to num_outputs");
}
int64_t cumsum_outputs = 0;
bool has_neg_one = false;
// If the sizes of the splits are known, then
// make sure that the sizes add up to the expected
// dimension size, with the possibility of a -1.
// Specify the full output shapes.
for (int i = 0; i < num_outputs; ++i) {
output_shape = c->UnknownShapeOfRank(rank);
TF_RETURN_IF_ERROR(c->ReplaceDim(input, split_dim,
c->MakeDim(data[i]), &output_shape));
c->set_output(i, output_shape);
if (data[i] == -1 && !has_neg_one)
has_neg_one = true;
else if (data[i] == -1 && has_neg_one)
return errors::InvalidArgument("size_splits can only have one -1");
else
cumsum_outputs += data[i];
}
auto split_dim_size = c->Value(c->Dim(input, split_dim));
if (has_neg_one) {
if (cumsum_outputs < split_dim_size)
cumsum_outputs = split_dim_size;
else
cumsum_outputs = split_dim_size + 1;
}
if (cumsum_outputs != c->Value(c->Dim(input, split_dim)))
return errors::InvalidArgument(
"Sum of output sizes must match "
"the size of the original Tensor along the split dimension "
"or the sum of the positive sizes must be less if it contains a "
"-1");
}
return Status::OK();

View File

@ -0,0 +1,51 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
syntax = "proto3";
package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "DevicePropertiesProtos";
message DeviceProperties {
// Device type (CPU, GPU, ...)
string type = 1;
// Vendor (Intel, nvidia, ...)
string vendor = 2;
// Model (Haswell, K40, ...)
string model = 3;
// Core Frequency in Mhz
int64 frequency = 4;
// Number of cores
int64 num_cores = 5;
// Version of the tools and libraries used with this device (e.g. gcc 4.9,
// cudnn 5.1)
map<string, string> environment = 6;
// Number of registers per core.
int64 num_registers = 7;
// L1 cache size in bytes
int64 l1_cache_size = 8;
// L2 cache size in bytes
int64 l2_cache_size = 9;
// L3 cache size in bytes
int64 l3_cache_size = 10;
// Shared memory size per multiprocessor in bytes. This field is
// applicable to GPUs only.
int64 shared_memory_size_per_multiprocessor = 11;
// Memory size in bytes
int64 memory_size = 12;
// Memory bandwidth in KB/s
int64 bandwidth = 13;
}

View File

@ -63,6 +63,28 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
return config;
}
// Calculate the Cuda launch config we should use for a kernel launch. This
// variant takes the resource limits of func into account to maximize occupancy.
template <typename DeviceFunc>
inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
const GPUDevice& d, DeviceFunc func,
size_t dynamic_shared_memory_size) {
int block_count = 0;
int thread_per_block = 0;
cudaOccupancyMaxPotentialBlockSize(&block_count, &thread_per_block, func,
dynamic_shared_memory_size,
work_element_count);
block_count =
std::min(block_count,
(work_element_count + thread_per_block - 1) / thread_per_block);
CudaLaunchConfig config;
config.virtual_thread_count = work_element_count;
config.thread_per_block = thread_per_block;
config.block_count = block_count;
return config;
}
struct Cuda2DLaunchConfig {
dim3 virtual_thread_count;
dim3 thread_per_block;

View File

@ -0,0 +1 @@
#include "unsupported/Eigen/CXX11/ThreadPool"

View File

@ -393,6 +393,17 @@ def bucketized_column(source_column, boundaries):
`boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
`[1., 2.)`, and `[2., +inf)`.
For example, if the inputs are
`boundaries` = [0, 10, 100]
input tensor = [[-5, 10000]
[150, 10]
[5, 100]]
then the output will be
output = [[0, 3]
[3, 2]
[1, 3]]
Example:
```python

View File

@ -324,6 +324,48 @@ class FunctionTest(test.TestCase):
"assertion"):
_ = MyFn(100.0).eval()
def testControlFlowStrictness(self):
"""Inlined functions must not execute in a untaken control flow branch."""
@function.Defun(dtypes.int32)
def AssertFail(x):
# Assertion that always fails and does not have a data dependency on `x`.
assert_false = control_flow_ops.Assert(False, [42])
with ops.control_dependencies([assert_false]):
return array_ops.identity(x)
with ops.device("CPU"):
pred = array_ops.placeholder(dtypes.bool)
x = array_ops.placeholder(dtypes.int32)
cond = control_flow_ops.cond(pred, lambda: x + 1, lambda: AssertFail(x))
# pylint: disable=unnecessary-lambda
loop = control_flow_ops.while_loop(lambda y: pred,
lambda y: AssertFail(y), [x])
# pylint: enable=unnecessary-lambda
# Enables inlining.
config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
optimizer_options=config_pb2.OptimizerOptions(
opt_level=config_pb2.OptimizerOptions.L0,
do_common_subexpression_elimination=True,
do_function_inlining=True,
do_constant_folding=True)))
with session.Session(config=config) as sess:
# Since the 'False' branch is not taken, the assertion should not fire.
self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))
# The assertion should still fire if the False branch is taken.
with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
"assertion"):
sess.run(cond, {pred: False, x: 3})
# Similarly for loops.
self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
"assertion"):
sess.run(loop, {pred: True, x: 3})
def testVar(self):
@function.Defun(dtypes.float32)

View File

@ -184,8 +184,11 @@ class BiasAddTest(test.TestCase):
if dtype == dtypes.float64:
threshold = 1e-10
self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
# TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
# once we figure out why this check started failing with cuda mavx.
if dtype == dtypes.float64 or data_format != "NCHW":
self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
def testGradientTensor(self):
for (data_format, use_gpu) in GetTestConfigs():

View File

@ -22,6 +22,7 @@ import numpy as np
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients_impl
@ -40,6 +41,42 @@ class SplitOpTest(test.TestCase):
data -= 1j * data
return data
def testShapeInference(self):
model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
# check that we fail during static shape inference if sizes are known
with self.assertRaises(ValueError):
# pylint: disable=expression-not-assigned
array_ops.split(model_input, [4], axis=1)[0]
# pylint: enable=expression-not-assigned
model_input = array_ops.placeholder(dtypes.float32)
inp = np.zeros((1, 10))
# check that we still fail at runtime if the shapes were unknown
with self.test_session(use_gpu=False) as sess:
with self.assertRaises(errors_impl.InvalidArgumentError):
sess.run(array_ops.split(model_input, [4]), {model_input: inp})
# test that we can pass a scalar Tensor as num_splits
with self.test_session(use_gpu=False) as sess:
result = sess.run(
array_ops.split(
array_ops.ones([4, 4]),
num_or_size_splits=array_ops.ones([2, 2]).get_shape()[1],
axis=0))
self.assertEqual(result[0].shape, (2, 4))
self.assertEqual(result[1].shape, (2, 4))
# test that none split dimensions remain, even if we don't know how
# the split_dim will be split, but we do know the axis
result = array_ops.split(
array_ops.ones([5, 2]), array_ops.constant([2, 1, 2]) * 1, axis=0)
self.assertEqual(result[0].shape[1], 2)
self.assertEqual(result[1].shape[1], 2)
self.assertEqual(result[2].shape[1], 2)
def testExplicitNum(self):
size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])

View File

@ -84,7 +84,6 @@ from __future__ import print_function
import sys
import numpy as np
import six
from tensorflow.python.framework import common_shapes
from tensorflow.python.framework import constant_op
@ -1165,13 +1164,14 @@ def sparse_mask(a, mask_indices, name=None):
def split(value, num_or_size_splits, axis=0, num=None, name="split"):
"""Splits a tensor into sub tensors.
If `num_or_size_splits` is a scalar, `num_split`, then splits `value` along
dimension `axis` into `num_split` smaller tensors.
If `num_or_size_splits` is an integer type, `num_split`, then splits `value`
along dimension `axis` into `num_split` smaller tensors.
Requires that `num_split` evenly divides `value.shape[axis]`.
If `num_or_size_splits` is a tensor, `size_splits`, then splits `value` into
`len(size_splits)` pieces. The shape of the `i`-th piece has the same size as
the `value` except along dimension `axis` where the size is `size_splits[i]`.
If `num_or_size_splits` is not an integer type, it is presumed to be a Tensor
`size_splits`, then splits `value` into `len(size_splits)` pieces. The shape
of the `i`-th piece has the same size as the `value` except along dimension
`axis` where the size is `size_splits[i]`.
For example:
@ -1189,11 +1189,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
Args:
value: The `Tensor` to split.
num_or_size_splits: Either an integer indicating the number of splits along
split_dim or a 1-D Tensor containing the sizes of each output tensor
along split_dim. If an integer then it must evenly divide
`value.shape[axis]`; otherwise the sum of sizes along the split
dimension must match that of the `value`.
num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
splits along split_dim or a 1-D integer `Tensor` integer tensor containing
the sizes of each output tensor along split_dim. If a scalar then it must
evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
split dimension must match that of the `value`.
axis: A 0-D `int32` `Tensor`. The dimension along which to split.
Must be in the range `[0, rank(value))`. Defaults to 0.
num: Optional, used to specify the number of outputs when it cannot be
@ -1209,11 +1209,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
Raises:
ValueError: If `num` is unspecified and cannot be inferred.
"""
if isinstance(num_or_size_splits, six.integer_types):
size_splits = ops.convert_to_tensor(num_or_size_splits)
if size_splits.get_shape().ndims == 0 and size_splits.dtype.is_integer:
return gen_array_ops._split(
split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
else:
size_splits = ops.convert_to_tensor(num_or_size_splits)
if num is None:
size_splits_shape = size_splits.get_shape()
num = size_splits_shape.dims[0]

View File

@ -1622,6 +1622,11 @@ class CondContext(ControlFlowContext):
# pylint: enable=protected-access
for x in op.outputs:
self._values.add(x.name)
# pylint: disable=protected-access
if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
op._add_control_input(self._pivot.op)
# pylint: enable=protected-access
if self._outer_context or not IsLoopExit(op):
op.graph.prevent_fetching(op)
@ -2147,8 +2152,13 @@ class WhileContext(ControlFlowContext):
def _MaybeAddControlDependency(self, op):
"""Add a control input to the op if it only depends on loop invariants."""
def _IsOpFree(op):
"""Determines if `op` needs a control dependency."""
if op.control_inputs:
return False
# pylint: disable=protected-access
if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
return True
# pylint: enable=protected-access
for x in op.inputs:
if not _IsLoopConstantEnter(x.op):
return False

View File

@ -30,6 +30,11 @@ def _IsDirectory(parent, item):
return gfile.IsDirectory(os.path.join(parent, item))
def PluginDirectory(logdir, plugin_name):
"""Returns the plugin directory for plugin_name."""
return os.path.join(logdir, _PLUGINS_DIR, plugin_name)
def ListPlugins(logdir):
"""List all the plugins that have registered assets in logdir.
@ -61,7 +66,7 @@ def ListAssets(logdir, plugin_name):
not exist (either because the logdir doesn't exist, or because the plugin
didn't register) an empty list is returned.
"""
plugin_dir = os.path.join(logdir, _PLUGINS_DIR, plugin_name)
plugin_dir = PluginDirectory(logdir, plugin_name)
if not gfile.IsDirectory(plugin_dir):
return []
entries = gfile.ListDirectory(plugin_dir)
@ -83,7 +88,7 @@ def RetrieveAsset(logdir, plugin_name, asset_name):
KeyError: if the asset does not exist.
"""
asset_path = os.path.join(logdir, _PLUGINS_DIR, plugin_name, asset_name)
asset_path = os.path.join(PluginDirectory(logdir, plugin_name), asset_name)
try:
with gfile.Open(asset_path, "r") as f:
return f.read()

View File

@ -50,6 +50,11 @@ class PluginGamma(GenericContentPlugin):
class PluginAssetUtilitiesTest(test.TestCase):
def testGetPluginDirectory(self):
self.assertEqual(
os.path.join("logdir", "plugins", "x"),
plugin_asset_util.PluginDirectory("logdir", "x"))
def testNonExistentDirectory(self):
tempdir = self.get_temp_dir()
fake_dir = os.path.join(tempdir, "nonexistent_dir")

View File

@ -46,6 +46,7 @@ tensorboard_typescript_genrule(
],
typings = [
"@org_definitelytyped//:d3.d.ts",
"@org_definitelytyped//:lodash.d.ts",
"//tensorflow/tensorboard/components/vz_sorting:ts_typings",
],
)

View File

@ -72,24 +72,31 @@ module Categorizer {
if (tags.length === 0) {
return [];
}
let sortedTags = tags.slice().sort(VZ.Sorting.compareTagNames);
let categories: Category[] = [];
let currentCategory = {
name: extractor(sortedTags[0]),
tags: [],
};
sortedTags.forEach((t: string) => {
let topLevel = extractor(t);
if (currentCategory.name !== topLevel) {
categories.push(currentCategory);
currentCategory = {
// Maps between top-level name and category. We use the mapping to avoid
// duplicating categories per run.
const categoryMapping: {[key: string]: Category} = {};
tags.forEach((t: string) => {
const topLevel = extractor(t);
if (!categoryMapping[topLevel]) {
const newCategory = {
name: topLevel,
tags: [],
};
categoryMapping[topLevel] = newCategory;
}
currentCategory.tags.push(t);
categoryMapping[topLevel].tags.push(t);
});
// Sort categories into alphabetical order.
const categories =
_.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
_.forEach(categories, (category) => {
// Sort the tags within each category.
category.tags.sort(VZ.Sorting.compareTagNames);
});
categories.push(currentCategory);
return categories;
};
}

View File

@ -62,6 +62,18 @@ module Categorizer {
assert.deepEqual(
topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
});
it('only create 1 category per run', () => {
// TensorBoard separates runs from tags using the / and _ characters
// *only* during sorting. The categorizer should group all tags under
// their correct categories - and create only 1 category per run.
const tags = ['foo/bar', 'foo_in_between_run/baz', 'foo/quux'];
const expected = [
{name: 'foo', tags: ['foo/bar', 'foo/quux']},
{name: 'foo_in_between_run', tags: ['foo_in_between_run/baz']},
];
assert.deepEqual(topLevelNamespaceCategorizer(tags), expected);
});
});
describe('customCategorizer', () => {

View File

@ -73,24 +73,31 @@ function extractorToCategorizer(extractor: (s: string) => string): Categorizer {
if (tags.length === 0) {
return [];
}
let sortedTags = tags.slice().sort(compareTagNames);
let categories: Category[] = [];
let currentCategory = {
name: extractor(sortedTags[0]),
tags: [],
};
sortedTags.forEach((t: string) => {
let topLevel = extractor(t);
if (currentCategory.name !== topLevel) {
categories.push(currentCategory);
currentCategory = {
// Maps between top-level name and category. We use the mapping to avoid
// duplicating categories per run.
const categoryMapping: {[key: string]: Category} = {};
tags.forEach((t: string) => {
const topLevel = extractor(t);
if (!categoryMapping[topLevel]) {
const newCategory = {
name: topLevel,
tags: [],
};
categoryMapping[topLevel] = newCategory;
}
currentCategory.tags.push(t);
categoryMapping[topLevel].tags.push(t);
});
// Sort categories into alphabetical order.
const categories =
_.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
_.forEach(categories, (category) => {
// Sort the tags within each category.
category.tags.sort(compareTagNames);
});
categories.push(currentCategory);
return categories;
};
}
@ -180,4 +187,4 @@ Polymer({
this._setCategories(categories);
})
},
});
});

View File

@ -0,0 +1,46 @@
package(default_visibility = ["//tensorflow:internal"])
load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
licenses(["notice"]) # Apache 2.0
webfiles(
name = "tf_graph_loader",
srcs = [
"tf-graph-loader.html",
],
path = "/tf-graph-loader",
deps = [
"//tensorflow/tensorboard/components/tf_graph_common",
"@org_polymer",
],
)
filegroup(
name = "all_files",
srcs = glob(["**"]),
tags = ["notsan"],
)
################################################################################
# MARKED FOR DELETION
tensorboard_webcomponent_library(
name = "legacy",
srcs = [
"tf-graph-loader.html",
],
destdir = "tf-graph-loader",
deps = [
"//tensorflow/tensorboard/components/tf_graph_common:legacy",
],
)
# This is needed despite how this component lacks TypeScript files because
# components/BUILD seeks a legacy_ts rule in this package.
tensorboard_ts_library(
name = "legacy_ts",
srcs = [],
)

View File

@ -0,0 +1,24 @@
package(default_visibility = ["//tensorflow:internal"])
load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
licenses(["notice"]) # Apache 2.0
# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
webfiles(
name = "demo",
srcs = ["index.html"] + glob(["data/**"]),
path = "/tf-graph-loader/demo",
deps = [
"//tensorflow/tensorboard/components/tf_graph_loader",
"@org_polymer_iron_demo_helpers",
"@org_polymer_paper_styles",
"@org_polymer_webcomponentsjs",
],
)
filegroup(
name = "all_files",
srcs = glob(["**"]),
tags = ["notsan"],
)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,78 @@
<!doctype html>
<!--
@license
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
<link rel="import" href="../tf-graph-loader.html">
<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
<title>TF Graph Loader Demo</title>
</head>
<body>
<demo-snippet>
<template>
<dom-module id="tf-graph-loader-demo">
<template>
<tf-graph-loader id="loader"
datasets="[[_datasets]]"
selected-dataset="[[_selectedDataset]]"
progress="{{_progress}}"></tf-graph-loader>
</template>
<script>
Polymer({
is: "tf-graph-loader-demo",
properties: {
// We tell the graph loader to load a specific pbtxt file.
_datasets: {
type: Array,
value: [{
"name": "Graph with XLA Clusters Specified",
"path": "data/graph.pbtxt"
}],
},
_selectedDataset: {
type: Number,
value: 0,
},
// This property will be updated by the graph loader.
_progress: {
type: Object,
},
},
observers: [
'_progressUpdated(_progress)',
],
_progressUpdated(progress) {
// console.log the progress.
console.log('Progress updated.', progress);
// The graph has loaded. console.log it.
if (progress.value == 100) {
console.log('graph', this.$.loader.outGraph);
}
},
});
</script>
</dom-module>
<!-- The graph loader lacks visual elements. -->
<tf-graph-loader-demo></tf-graph-loader-demo>
</template>
</demo-snippet>
</body>
</html>

View File

@ -16,6 +16,7 @@ limitations under the License.
-->
<link rel="import" href="../polymer/polymer.html">
<link rel="import" href="../tf-graph-common/tf-graph-common.html">
<!--
An element which provides a filter parsing for pbtxt to graph output.

View File

@ -10,12 +10,17 @@ Consultants: Jon Shlens, Pete Warden
1. Measure model parameters, float operations, tensor shapes.
2. Measure op execution times, requested memory size and device placement.
3. Inspect checkpoint tensors' shapes and their values.
4. Explore model based on name scope or graph structure.
4. 3 ways to view and explore TensorFlow model profiles
* Organize by Python code call stack.
* Organize by TensorFlow operation name scope hierarchies.
* Organize by TensorFlow operation inputs/outputs graph.
5. Selectively grouping/filtering/accounting/ordering ops.
[Python API Tutorials](#python-api-tutorials): It can be called directly from
Python codes. Results are either printed
to stdout or dumped to file. tensorflow.tfprof.TFProfNode proto is returned from
to stdout or dumped to file. tensorflow.tfprof.TFGraphNodeProto proto is returned from
the API to allow users to perform further analysis.
[CLI Tutorials](#cli-tutorials):
@ -33,13 +38,23 @@ tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
### Examine the shapes and sizes of all trainable Variables.
```python
# Print trainable variable parameter statistics to stdout.
# By default, statistics are associated with each graph node.
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_options=tf.contrib.tfprof.model_analyzer.
TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
# of each graph node in tree scructure. Let's print the root below.
# Set tfprof_cmd='code' to associate statistics with Python codes.
opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_cmd='code'
tfprof_options=opts)
# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
# Let's print the root below.
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
```
@ -84,8 +99,20 @@ Finally, you may run `print_model_analysis` to explore the timing and memory
demands of the model.
``` python
# See model_analyzer_test.py for more examples.
#
# Print to stdout an analysis of the memory usage and the timing information
# from running the graph broken down by operations.
# broken down by python codes.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['show_name_regexes'] = ['.*my_code.py.*']
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
run_meta=run_metadata,
tfprof_cmd='code',
tfprof_options=opts)
# Print to stdout an analysis of the memory usage and the timing information
# broken down by operations.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
run_meta=run_metadata,
@ -138,9 +165,9 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
--run_meta_path=run_meta \
--checkpoint_path=model.ckpt
#
# tfprof_log is used to define customized op types and float ops.
# tfprof_log is used to define customized op types, float ops and code traces.
# Use tfprof_logger.write_op_log() to create tfprof_log.
# See 11) in Examples section on generating tfprof_log file.
# See 12) in Examples section on generating tfprof_log file.
bazel-bin/tensorflow/tools/tfprof/tfprof \
--graph_path=graph.pbtxt \
--run_meta_path=run_meta \
@ -174,7 +201,28 @@ tfprof>
-dump_to_file
```
3) I want to see the `BatchNorm`'s gamma value in checkpoint.
3) I want to see which line of my python codes costs most time!
```shell
# Requires --graph_path --op_log_path
tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
_TFProfRoot (0us/22.44ms)
model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
...
model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
```
4) I want to see the `BatchNorm`'s gamma value in checkpoint.
```shell
# Requires --graph_path, --checkpoint_path.
@ -186,7 +234,7 @@ _TFProfRoot ()
[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
```
4) I want to see my checkpoint tensors shape and number of parameters.
5) I want to see my checkpoint tensors shape and number of parameters.
```shell
# Requires --graph_path, --checkpoint_path.
@ -205,7 +253,7 @@ _TFProfRoot (--/930.58k params)
unit_last/final_bn/moving_variance (64, 64/64 params)
```
5) I defined an op named cost to calculate the loss. I want to know what ops
6) I defined an op named cost to calculate the loss. I want to know what ops
it depends on take a long time to run. Hint: Use the graph command to explore
graph dependencies.
@ -221,7 +269,7 @@ _TFProfRoot (0us/3.61sec)
unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
```
6) I want to know the expensive operations during the back propagation.
7) I want to know the expensive operations during the back propagation.
Hint: tensorflow prepend gradient to your defined name scopes. Use the scope
command to explore based on name scope hierarchies.
@ -238,7 +286,7 @@ _TFProfRoot (0us/2.29sec)
...
```
7) Show the number of float operations in the model.
8) Show the number of float operations in the model.
Note: float operations calculation depends on
1) op.RegisterStatistics. If an op doesnt
have RegisterStatistics defined, its float operations cannot be counted.
@ -263,7 +311,7 @@ _TFProfRoot (0/17.63b flops)
...
```
8) Show the number of parameters of all `tf.trainable_variables()` in the model.
9) Show the number of parameters of all `tf.trainable_variables()` in the model.
```shell
# Requires --graph_path --op_log_path.
@ -283,7 +331,7 @@ generated by write_op_log() Python API. write_op_log() help users create some
common op types implicitly. Users can define their own op types and log it
through the write_op_log() API.
9) What if Im lazy and dont want to define op type? I have given my ops
109) What if Im lazy and dont want to define op type? I have given my ops
well-defined names in my models code. And want to use names to select a group
of ops. Lets try it!
@ -301,7 +349,7 @@ in terminal. Otherwise, tfprof accounts all ops matched by
`-account_type_regexes` recursively even if they are hidden due to some
options such as -max_depth.
10) TensorFlow has built-in op types. For example, built-in op type `Variable`
11) TensorFlow has built-in op types. For example, built-in op type `Variable`
seems to include `Variable's` created by your model. However, be careful when
depending on it because TensorFlow creates extra `Variable` ops implicitly and
the implicitly created ops can have the same prefix as the `Variable's` you
@ -327,7 +375,7 @@ _TFProfRoot (--/930.58k params)
```
11) A example of defining extra op type for ops using `OpLog`
12) A example of defining extra op type for ops using `OpLog`
First, in Python code, create an `OpLog` proto and add op type
information to it:

View File

@ -15,6 +15,7 @@ cc_library(
srcs = ["tfprof_stats.cc"],
hdrs = ["tfprof_stats.h"],
deps = [
":tfprof_code",
":tfprof_graph",
":tfprof_node",
":tfprof_options",
@ -61,6 +62,27 @@ cc_library(
],
)
cc_library(
name = "tfprof_code",
srcs = ["tfprof_code.cc"],
hdrs = ["tfprof_code.h"],
deps = [
":tfprof_constants",
":tfprof_node",
":tfprof_options",
":tfprof_show_code",
":tfprof_tensor",
":tfprof_utils",
"//tensorflow/c:c_api",
"//tensorflow/c:checkpoint_reader",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:regexp_internal",
"//tensorflow/tools/tfprof:protos_all_cc",
],
)
cc_library(
name = "tfprof_graph",
srcs = ["tfprof_graph.cc"],
@ -98,6 +120,26 @@ cc_library(
],
)
cc_library(
name = "tfprof_show_code",
srcs = ["tfprof_show_code.cc"],
hdrs = ["tfprof_show_code.h"],
deps = [
":tfprof_constants",
":tfprof_node",
":tfprof_options",
":tfprof_scope",
":tfprof_show",
":tfprof_tensor",
":tfprof_utils",
"//tensorflow/c:checkpoint_reader",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:regexp_internal",
"//tensorflow/tools/tfprof:protos_all_cc",
],
)
tf_cc_test(
name = "tfprof_show_test",
srcs = ["tfprof_show_test.cc"],

View File

@ -40,13 +40,13 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
graph_ptr->ParseFromString(*graph);
std::unique_ptr<RunMetadata> run_meta_ptr;
if (run_meta) {
if (run_meta && !run_meta->empty()) {
run_meta_ptr.reset(new RunMetadata());
run_meta_ptr->ParseFromString(*run_meta);
}
std::unique_ptr<OpLog> op_log_ptr;
if (op_log) {
if (op_log && !op_log->empty()) {
op_log_ptr.reset(new OpLog());
op_log_ptr->ParseFromString(*op_log);
}
@ -58,16 +58,27 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
Options opts = Options::FromProtoStr(*options);
// TODO(xpan): We should have dump_to_file/print_stdout/etc to control
// side-effects independently instead of one controlling the other.
if (opts.dump_to_file.empty()) {
printf("\n=========================Options=============================\n");
printf("%s", opts.ToString().c_str());
printf("\n==================Model Analysis Report======================\n");
TFProfNode root(tf_stats.PrintGraph(*command, opts));
string ret = "";
if (*command == kCmds[2]) {
ret = tf_stats.PrintCode(opts).SerializeAsString();
} else {
ret = tf_stats.PrintGraph(*command, opts).SerializeAsString();
}
printf("\n======================End of Report==========================\n");
fflush(stdout);
return root.SerializeAsString();
return ret;
}
if (*command == kCmds[2]) {
return tf_stats.PrintCode(opts).SerializeAsString();
} else {
return tf_stats.PrintGraph(*command, opts).SerializeAsString();
}
return tf_stats.PrintGraph(*command, opts).SerializeAsString();
}
} // namespace tfprof
} // namespace tensorflow

Some files were not shown because too many files have changed in this diff Show More