Merge commit for internal changes
commit 78fe1944d9
@@ -342,6 +342,7 @@ filegroup(
         "//tensorflow/tensorboard/components/tf_globals:all_files",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:all_files",
         "//tensorflow/tensorboard/components/tf_graph_common:all_files",
         "//tensorflow/tensorboard/components/tf_graph_loader:all_files",
         "//tensorflow/tensorboard/components/tf_histogram_dashboard:all_files",
         "//tensorflow/tensorboard/components/tf_histogram_dashboard/demo:all_files",
         "//tensorflow/tensorboard/components/tf_image_dashboard:all_files",

@@ -73,7 +73,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/core:core_cpu",

@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/shape_util.h"

@@ -274,7 +274,8 @@ Status CreateXlaArgs(const Graph& graph,

 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
+Status ConvertGraphToXla(xla::CompileOnlyClient* client,
+                         std::unique_ptr<Graph> graph,
                          xla::Computation* computation, bool* has_context_arg) {
   // Create a device and context to convert the graph into an XLA computation.
   XlaOpRegistry::RegisterCompilationKernels();

@@ -333,7 +334,8 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
 }

 // Compiles the XLA computation into executable code.
-Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
+Status CompileXla(xla::CompileOnlyClient* client,
+                  const xla::Computation& computation,
                   const xla::cpu::CpuAotCompilationOptions& aot_opts,
                   CompileResult* compile_result) {
   // Retrieves arg and result layouts from the computation.

@@ -350,7 +352,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
   for (int i = 0; i < pshape->parameters_size(); ++i) {
     arg_layouts.push_back(pshape->mutable_parameters(i));
   }
-  xla::LocalClient::AheadOfTimeComputationInstance instance;
+  xla::CompileOnlyClient::AotComputationInstance instance;
   instance.computation = &computation;
   instance.argument_layouts = std::move(arg_layouts);
   instance.result_layout = &pshape->result();

@@ -365,7 +367,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
       std::move(aot_or.ValueOrDie().back()));
   compile_result->entry_point = aot_opts.entry_point_name();
   compile_result->pointer_size =
-      xla::LocalClient::PointerSizeForTriple(aot_opts.triple());
+      xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple());
   return Status::OK();
 }

@@ -394,8 +396,9 @@ Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
   namespace gpu = perftools::gputools;
   gpu::Platform* cpu_platform =
       gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
-  xla::LocalClient* client =
-      xla::ClientLibrary::GetOrCreateLocalClient(cpu_platform).ValueOrDie();
+  xla::CompileOnlyClient* client =
+      xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
+          .ValueOrDie();
   xla::Computation computation;
   TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), &computation,
                                        &compile_result->has_context_arg));
@@ -99,6 +99,26 @@ cc_library(
     ],
 )

+cc_library(
+    name = "compile_only_client",
+    srcs = ["compile_only_client.cc"],
+    hdrs = ["compile_only_client.h"],
+    deps = [
+        ":client",
+        ":computation",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:compile_only_service",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "@llvm//:support",
+    ],
+)
+
 # This target is used to instantiate the XLA service in-process and create
 # a client for it.
 cc_library(
@@ -106,12 +126,14 @@ cc_library(
     srcs = ["client_library.cc"],
     hdrs = ["client_library.h"],
     deps = [
+        ":compile_only_client",
         ":local_client",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -69,8 +69,8 @@ ClientLibrary::~ClientLibrary() = default;
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }

-  auto it = client_library.instances_.find(platform->id());
-  if (it != client_library.instances_.end()) {
+  auto it = client_library.local_instances_.find(platform->id());
+  if (it != client_library.local_instances_.end()) {
     return it->second->client.get();
   }

@@ -78,13 +78,13 @@ ClientLibrary::~ClientLibrary() = default;
   service_options.set_platform(platform);
   service_options.set_number_of_replicas(replica_count);

-  std::unique_ptr<LocalInstance> instance = MakeUnique<LocalInstance>();
+  auto instance = MakeUnique<LocalInstance>();
   TF_ASSIGN_OR_RETURN(instance->service,
                       LocalService::NewService(service_options));
   instance->client = MakeUnique<LocalClient>(instance->service.get());
   LocalClient* cl = instance->client.get();

-  client_library.instances_.insert(
+  client_library.local_instances_.insert(
       std::make_pair(platform->id(), std::move(instance)));
   return cl;
 }

@@ -99,9 +99,35 @@ ClientLibrary::~ClientLibrary() = default;
     perftools::gputools::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
-  auto it = client_library.instances_.find(platform->id());
-  CHECK(it != client_library.instances_.end());
+  auto it = client_library.local_instances_.find(platform->id());
+  CHECK(it != client_library.local_instances_.end());
   return it->second->service.get();
 }

+/* static */ StatusOr<CompileOnlyClient*>
+ClientLibrary::GetOrCreateCompileOnlyClient(
+    perftools::gputools::Platform* platform) {
+  ClientLibrary& client_library = Singleton();
+  tensorflow::mutex_lock lock(client_library.service_mutex_);
+
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  auto it = client_library.compile_only_instances_.find(platform->id());
+  if (it != client_library.compile_only_instances_.end()) {
+    return it->second->client.get();
+  }
+
+  auto instance = MakeUnique<CompileOnlyInstance>();
+  TF_ASSIGN_OR_RETURN(instance->service,
+                      CompileOnlyService::NewService(platform));
+  instance->client = MakeUnique<CompileOnlyClient>(instance->service.get());
+  CompileOnlyClient* cl = instance->client.get();
+
+  client_library.compile_only_instances_.insert(
+      std::make_pair(platform->id(), std::move(instance)));
+  return cl;
+}
+
 }  // namespace xla
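For orientation, here is a minimal sketch of how a caller is expected to use the compile-only path introduced above. It is not part of the commit; `cpu_platform`, `computation`, `arg_shape`, `result_shape`, and `aot_opts` are assumed to already exist, and error handling is elided.

    // Sketch only (not part of this commit): AOT-compiling one computation.
    xla::CompileOnlyClient* client =
        xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform).ValueOrDie();

    xla::CompileOnlyClient::AotComputationInstance instance;
    instance.computation = &computation;       // a previously built xla::Computation
    instance.argument_layouts = {&arg_shape};  // expected argument layouts
    instance.result_layout = &result_shape;    // expected result layout

    // A single call can compile several computations; here we pass just one.
    std::vector<std::unique_ptr<xla::AotCompilationResult>> results =
        client->CompileAheadOfTime({instance}, aot_opts).ConsumeValueOrDie();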
@@ -26,7 +26,9 @@ limitations under the License.
 #include <string>
 #include <vector>

+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"

@@ -76,6 +78,13 @@ class ClientLibrary {
   // access user computations from client.
   static LocalService* GetXlaService(perftools::gputools::Platform* platform);

+  // Singleton constructor-or-accessor for compile-only clients. Arguments:
+  //
+  //   platform : The platform the underlying XLA service should target. If
+  //     null then default platform is used.
+  static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
+      perftools::gputools::Platform* platform = nullptr);
+
  private:
   // Returns the singleton instance of ClientLibrary.
   static ClientLibrary& Singleton();

@@ -90,10 +99,21 @@ class ClientLibrary {
     std::unique_ptr<LocalClient> client;
   };

+  struct CompileOnlyInstance {
+    // Service that is wrapped by the singleton client object.
+    std::unique_ptr<CompileOnlyService> service;
+    // Singleton client object.
+    std::unique_ptr<CompileOnlyClient> client;
+  };
+
   tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
   std::unordered_map<perftools::gputools::Platform::Id,
                      std::unique_ptr<LocalInstance>>
-      instances_ GUARDED_BY(service_mutex_);
+      local_instances_ GUARDED_BY(service_mutex_);
+
+  std::unordered_map<perftools::gputools::Platform::Id,
+                     std::unique_ptr<CompileOnlyInstance>>
+      compile_only_instances_ GUARDED_BY(service_mutex_);

   TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
 };
tensorflow/compiler/xla/client/compile_only_client.cc  (new file, 59 lines)
@@ -0,0 +1,59 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/client/compile_only_client.h"

#include "external/llvm/include/llvm/ADT/Triple.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace se = ::perftools::gputools;

namespace xla {

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileOnlyClient::CompileAheadOfTime(
    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
    const AotCompilationOptions& options) {
  std::vector<CompileOnlyService::AotComputationInstance> service_instances;
  service_instances.reserve(computations.size());
  for (const AotComputationInstance& instance : computations) {
    service_instances.push_back({});
    CompileOnlyService::AotComputationInstance& service_instance =
        service_instances.back();
    TF_RET_CHECK(instance.computation != nullptr);
    service_instance.computation = instance.computation->handle();
    service_instance.argument_layouts = instance.argument_layouts;
    service_instance.result_layout = instance.result_layout;
  }
  return compiler_service_->CompileAheadOfTime(service_instances, options);
}

int64 CompileOnlyClient::PointerSizeForTriple(
    tensorflow::StringPiece target_triple) {
  llvm::Triple triple(
      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
  if (triple.isArch64Bit()) {
    return 8;
  } else if (triple.isArch32Bit()) {
    return 4;
  } else {
    CHECK(triple.isArch16Bit());
    return 2;
  }
}

}  // namespace xla
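PointerSizeForTriple only inspects the LLVM target triple, so the AOT driver can record the target's pointer width (CompileResult::pointer_size in the tfcompile change above) without instantiating a backend. A rough illustration of the expected mapping; the triples below are hypothetical examples, not taken from this commit:

    // Illustration only; hypothetical target triples.
    int64 p64 = xla::CompileOnlyClient::PointerSizeForTriple("x86_64-unknown-linux-gnu");    // 8
    int64 p32 = xla::CompileOnlyClient::PointerSizeForTriple("armv7-none-linux-androideabi");  // 4
    int64 p16 = xla::CompileOnlyClient::PointerSizeForTriple("msp430-none-elf");             // 2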
tensorflow/compiler/xla/client/compile_only_client.h  (new file, 66 lines)
@@ -0,0 +1,66 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_

#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/computation.h"
#include "tensorflow/compiler/xla/service/compile_only_service.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {

// An XLA Client specialization for doing ahead-of-time compilation. This does
// not require (or attempt to instantiate) an execution-capable backend for the
// relevant platform.
class CompileOnlyClient : public Client {
 public:
  explicit CompileOnlyClient(CompileOnlyService* service)
      : Client(service), compiler_service_(service) {}

  CompileOnlyClient(const CompileOnlyClient&) = delete;
  void operator=(const CompileOnlyClient&) = delete;

  // A description of a computation to compile using CompileAheadOfTime.
  struct AotComputationInstance {
    const Computation* computation;
    // Inform the compiler of the expected layout for arguments.
    std::vector<const Shape*> argument_layouts;
    // Specifies the expected result layout.
    const Shape* result_layout;
  };

  // Compiles a list of computations for ahead-of-time execution. This is
  // intended for use in static compilation. The |options| parameter describes
  // the target for which the compiler should emit code.
  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
  CompileAheadOfTime(
      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
      const AotCompilationOptions& options);

  // Returns the size of a pointer in bytes for a given triple.
  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);

 private:
  CompileOnlyService* compiler_service_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
@@ -23,13 +23,15 @@ limitations under the License.

 namespace xla {

-// Wraps a GlobalDataHandle with a lifetime.
+// A GlobalData object represents a globally-accessible allocation of
+// data in the associated XLA service.
 class GlobalData {
  public:
   // Gives ownership of the global data handle to this object.
   GlobalData(ServiceInterface* parent, GlobalDataHandle handle);

-  // Unregisters the wrapped handle.
+  // Unregisters the wrapped handle, which causes the service to
+  // deallocate the associated data.
   ~GlobalData();

   const GlobalDataHandle& handle() const { return handle_; }
@@ -176,10 +176,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
   TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));

   ExecutableRunOptions actual_options = options;
-  Backend::StreamPtr stream;
   if (options.stream() == nullptr) {
     TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
+        Backend::StreamPtr stream,
+        BorrowStreamForDevice(options.device_ordinal(), backend_));
     actual_options.set_stream(stream.get());
   }
   if (options.allocator() == nullptr) {

@@ -261,38 +261,6 @@ tensorflow::Status LocalClient::ResolveArguments(
       argument_ptrs);
 }

-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalClient::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
-  service_instances.reserve(computations.size());
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    service_instances.push_back({});
-    LocalService::AheadOfTimeComputationInstance& service_instance =
-        service_instances.back();
-    TF_RET_CHECK(instance.computation != nullptr);
-    service_instance.computation = instance.computation->handle();
-    service_instance.argument_layouts = instance.argument_layouts;
-    service_instance.result_layout = instance.result_layout;
-  }
-  return local_service_->CompileAheadOfTime(service_instances, options);
-}
-
-int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
-  llvm::Triple triple(
-      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
-  if (triple.isArch64Bit()) {
-    return 8;
-  } else if (triple.isArch32Bit()) {
-    return 4;
-  } else {
-    CHECK(triple.isArch16Bit());
-    return 2;
-  }
-}
-
 se::Platform* LocalClient::platform() const {
   return local_service_->backend().platform();
 }
@@ -148,7 +148,7 @@ class LocalExecutable {
   const ExecutableBuildOptions& build_options_;
 };

-// An XLA service client object for use when the client and service run in
+// An XLA Client specialization for use when the client and service run in
 // the same process.
 class LocalClient : public Client {
  public:

@@ -182,30 +182,6 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);

-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    const Computation* computation;
-    // Inform the compiler of the expected layout for arguments.
-    std::vector<const Shape*> argument_layouts;
-    // Specifies the expected result layout.
-    const Shape* result_layout;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution. This is
-  // intended for use in static compilation. The |options| parameter describes
-  // the target for which the compiler should emit code.
-  //
-  // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to its
-  // own library.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& options);
-
-  // Returns the size of a pointer in bytes for a given triple.
-  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
-
   // Returns the platform that the underlying service targets.
   perftools::gputools::Platform* platform() const;

@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"

 #include <algorithm>
+#include <cstring>
 #include <functional>
 #include <limits>
 #include <numeric>

@@ -308,37 +309,16 @@ template <typename T, typename WT>

 /* static */ std::unique_ptr<Literal> LiteralUtil::Relayout(
     const Literal& original, const Layout& layout) {
   // Note: if this were a performance bottleneck, we avoid cloning and just make
   // an uninitialized array instead, since all values are clobbered below.
   std::unique_ptr<Literal> result = CloneToUnique(original);
   *result->mutable_shape()->mutable_layout() = layout;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      LiteralUtil::EachCell<float>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
-            LiteralUtil::Set<float>(result.get(), indices, value);
-          });
-      return result;
-    case S32:
-      LiteralUtil::EachCell<int32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, int32 value) {
-            LiteralUtil::Set<int32>(result.get(), indices, value);
-          });
-      return result;
-    case U32:
-      LiteralUtil::EachCell<uint32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, uint32 value) {
-            LiteralUtil::Set<uint32>(result.get(), indices, value);
-          });
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
-  }
+
+  const Shape& shape = original.shape();
+  std::vector<int64> base(ShapeUtil::Rank(shape), 0);
+  std::vector<int64> copy_size(shape.dimensions().begin(),
+                               shape.dimensions().end());
+
+  TF_CHECK_OK(Copy(original, base, result.get(), base, copy_size));
+  return result;
 }

 /* static */ StatusOr<std::unique_ptr<Literal>> LiteralUtil::Reshape(
@@ -346,25 +326,19 @@ template <typename T, typename WT>
   if (ShapeUtil::IsTuple(input.shape())) {
     return InvalidArgument("Reshape does not support tuples.");
   }

+  std::unique_ptr<Literal> output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(input.shape().layout())) {
-    return Unimplemented(
-        "Input shape must have a monotonic layout where dimension 0 is major, "
-        "was: %s",
-        LayoutUtil::HumanString(input.shape().layout()).c_str());
+    std::vector<int64> minor_to_major(ShapeUtil::Rank(input.shape()));
+    std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
+              static_cast<int64>(0));
+    output = Relayout(input, LayoutUtil::MakeLayout(minor_to_major));
+  } else {
+    output = CloneToUnique(input);
   }
-  std::vector<int64> layout(dimensions.size());
-  std::iota(layout.rbegin(), layout.rend(), 0);

   // Because the layout is monotonic, we can simply reuse the same sequence of
   // values without changing their order.
-  std::unique_ptr<Literal> output = CloneToUnique(input);
-  output->clear_shape();
-  output->mutable_shape()->set_element_type(input.shape().element_type());
-  for (int64 dimension : dimensions) {
-    output->mutable_shape()->add_dimensions(dimension);
-  }
-  *output->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout(layout);
+  *output->mutable_shape() =
+      ShapeUtil::MakeShape(input.shape().element_type(), dimensions);

   int64 elements_before = ShapeUtil::ElementsIn(input.shape());
   int64 elements_after = ShapeUtil::ElementsIn(output->shape());
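Why the relayout step is needed: Reshape reuses the flat element sequence, and that sequence only matches the logical row-major (dim0-major) order when the layout is monotonic with dimension 0 major. A standalone illustration (plain C++, not the XLA API) of how the flat order differs between the two layouts of the same 2x3 array:

    #include <cstdio>

    int main() {
      // A 2x3 array of values 1..6 (logical row-major view).
      int a[2][3] = {{1, 2, 3}, {4, 5, 6}};

      // Flat order under layout {1,0} (dimension 0 major, i.e. row-major):
      // 1 2 3 4 5 6
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 3; ++j) std::printf("%d ", a[i][j]);
      std::printf("\n");

      // Flat order under layout {0,1} (dimension 0 minor, i.e. column-major):
      // 1 4 2 5 3 6 -- a different sequence, so reshaping by reusing the flat
      // buffer directly would be wrong unless the literal is relaid out first.
      for (int j = 0; j < 3; ++j)
        for (int i = 0; i < 2; ++i) std::printf("%d ", a[i][j]);
      std::printf("\n");
      return 0;
    }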
@@ -378,73 +352,42 @@ template <typename T, typename WT>
   return std::move(output);
 }

-namespace {
-
-template <class T>
-void TransposeLiteralInternal(const Literal& original,
-                              tensorflow::gtl::ArraySlice<int64> permutation,
-                              Literal* result) {
-  std::vector<int64> new_indices(ShapeUtil::Rank(original.shape()));
-  LiteralUtil::EachCell<T>(
-      original, [&](tensorflow::gtl::ArraySlice<int64> indices, T value) {
-        for (int64 i = 0; i < indices.size(); ++i) {
-          new_indices[i] = indices[permutation[i]];
-        }
-        LiteralUtil::Set<T>(result, new_indices, value);
-      });
-}
-}  // namespace
-
 /* static */ std::unique_ptr<Literal> LiteralUtil::Transpose(
     const Literal& original, tensorflow::gtl::ArraySlice<int64> permutation) {
   CHECK(!ShapeUtil::IsTuple(original.shape()))
-      << "tuple is not supported for transpose";
-  std::vector<int64> dimension_numbers(ShapeUtil::Rank(original.shape()));
-  std::iota(dimension_numbers.begin(), dimension_numbers.end(), 0);
-  CHECK(std::is_permutation(permutation.begin(), permutation.end(),
-                            dimension_numbers.begin()))
-      << "given permutation is not a permutation of dimension numbers";
-  std::vector<int64> new_dimension_sizes;
-  for (const int64 dim : permutation) {
-    new_dimension_sizes.push_back(original.shape().dimensions(dim));
-  }
-  const auto result_shape = ShapeUtil::MakeShape(
-      original.shape().element_type(), new_dimension_sizes);
-  std::unique_ptr<Literal> result = CloneToUnique(original);
-  *result->mutable_shape() = result_shape;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      TransposeLiteralInternal<float>(original, permutation, result.get());
-      return result;
-    case F64:
-      TransposeLiteralInternal<double>(original, permutation, result.get());
-      return result;
-    case PRED:
-      TransposeLiteralInternal<bool>(original, permutation, result.get());
-      return result;
-    case S8:
-      TransposeLiteralInternal<int8>(original, permutation, result.get());
-      return result;
-    case U8:
-      TransposeLiteralInternal<uint8>(original, permutation, result.get());
-      return result;
-    case S32:
-      TransposeLiteralInternal<int32>(original, permutation, result.get());
-      return result;
-    case U32:
-      TransposeLiteralInternal<uint32>(original, permutation, result.get());
-      return result;
-    case S64:
-      TransposeLiteralInternal<int64>(original, permutation, result.get());
-      return result;
-    case U64:
-      TransposeLiteralInternal<uint64>(original, permutation, result.get());
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
+      << "Tuple is not supported for transpose";
+  CHECK(IsPermutation(permutation, ShapeUtil::Rank(original.shape())))
+      << "Given permutation is not a permutation of dimension numbers";
+  // To transpose the array, we just permute the dimensions and layout, and
+  // do a straight memory copy of the raw data set.
+  // This is considerably faster than iterating over every array element using
+  // the EachCell<>() and Set<>() APIs.
+  std::vector<int64> inverse_permutation = InversePermutation(permutation);
+  Shape shape =
+      ShapeUtil::PermuteDimensions(inverse_permutation, original.shape());
+  // Replace the layout with one affine to the original shape, such that a
+  // transpose operation can be performed by leaving the flat values
+  // representation intact.
+  // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation.
+  // The shape with affine layout resulting from that operation will be
+  // F32[8,11]{0,1}, since it leaves the original most minor dimension (the one
+  // of size 8) as the most minor.
+  // Essentially, given MinMaj(Di) the position of the Di dimension within the
+  // minor to major vector, and given T(Di) the index that the original Di
+  // dimension has within the transposed array, a layout is affine if
+  // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
+  // vector of the affine layout.
+  Layout* layout = shape.mutable_layout();
+  layout->clear_minor_to_major();
+  for (auto index : original.shape().layout().minor_to_major()) {
+    layout->add_minor_to_major(inverse_permutation[index]);
+  }
+  std::unique_ptr<Literal> new_literal = CreateFromShape(shape);
+  DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
+            ShapeUtil::ByteSizeOf(original.shape()));
+  std::memcpy(MutableInternalData(new_literal.get()), InternalData(original),
+              ShapeUtil::ByteSizeOf(original.shape()));
+  return new_literal;
 }

 /* static */ std::unique_ptr<Literal> LiteralUtil::Slice(
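To make the affine-layout trick concrete, here is a small standalone sketch (plain C++ with simplified types, not the XLA Shape/Layout API) that reproduces the F32[11,8]{1,0} example from the comment: after a {1,0} transpose the dimensions become [8,11] and the minor-to-major order becomes {0,1}, so the flat data can be copied unchanged.

    #include <cstdio>
    #include <vector>

    int main() {
      // Original shape F32[11,8] with layout {1,0}; transpose permutation {1,0}.
      std::vector<int> dims = {11, 8};
      std::vector<int> minor_to_major = {1, 0};
      std::vector<int> permutation = {1, 0};

      // InversePermutation: inverse[permutation[i]] = i.
      std::vector<int> inverse(permutation.size());
      for (int i = 0; i < static_cast<int>(permutation.size()); ++i) {
        inverse[permutation[i]] = i;
      }

      // PermuteDimensions(inverse, dims) is equivalent to
      // new_dims[i] = dims[permutation[i]], which yields {8, 11}.
      std::vector<int> new_dims(dims.size());
      for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
        new_dims[i] = dims[permutation[i]];
      }

      // Affine layout: remap each minor_to_major entry through the inverse
      // permutation so the flat element order is unchanged; yields {0, 1}.
      std::vector<int> new_minor_to_major;
      for (int index : minor_to_major) {
        new_minor_to_major.push_back(inverse[index]);
      }

      std::printf("transposed dims: [%d,%d], minor_to_major: {%d,%d}\n",
                  new_dims[0], new_dims[1], new_minor_to_major[0],
                  new_minor_to_major[1]);  // prints [8,11], {0,1}
      return 0;
    }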
@@ -793,47 +736,14 @@ void TransposeLiteralInternal(const Literal& original,
     const Literal& literal,
     const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
                              const string& value)>& per_cell) {
-  if (ShapeUtil::Rank(literal.shape()) == 1) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      per_cell({i0}, GetAsString(literal, {i0}));
-    }
+  if (ShapeUtil::HasZeroElements(literal.shape())) {
     return;
   }
-
-  if (ShapeUtil::Rank(literal.shape()) == 2) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        per_cell({i0, i1}, GetAsString(literal, {i0, i1}));
-      }
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 3) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          per_cell({i0, i1, i2}, GetAsString(literal, {i0, i1, i2}));
-        }
-      }
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 4) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          for (int64 i3 = 0; i3 < literal.shape().dimensions(3); ++i3) {
-            per_cell({i0, i1, i2, i3}, GetAsString(literal, {i0, i1, i2, i3}));
-          }
-        }
-      }
-    }
-    return;
-  }
-
-  LOG(FATAL) << "unhandled rank: " << ShapeUtil::Rank(literal.shape());
+  std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
+      literal.shape(), /*linear_index=*/0);
+  do {
+    per_cell(indices, GetAsString(literal, indices));
+  } while (IndexUtil::BumpIndices(literal.shape(), &indices));
 }

 namespace {
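The rewrite above collapses the rank-1 through rank-4 special cases into a single rank-generic walk: start at the first multi-index and repeatedly "bump" it until the whole index space has been visited. A standalone sketch of that odometer pattern (plain C++; XLA's IndexUtil::BumpIndices additionally respects the literal's layout):

    #include <cstdio>
    #include <vector>

    // Advances a multidimensional index in row-major order (bumping the last
    // dimension first); returns false once every index tuple has been visited.
    bool BumpIndices(const std::vector<int>& dims, std::vector<int>* indices) {
      for (int d = static_cast<int>(dims.size()) - 1; d >= 0; --d) {
        if (++(*indices)[d] < dims[d]) return true;
        (*indices)[d] = 0;  // carry into the next more-major dimension
      }
      return false;
    }

    int main() {
      std::vector<int> dims = {2, 3};
      std::vector<int> indices(dims.size(), 0);  // start at {0, 0}
      do {
        std::printf("{%d,%d}\n", indices[0], indices[1]);
      } while (BumpIndices(dims, &indices));  // visits all 6 index tuples
      return 0;
    }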
@@ -239,6 +239,11 @@ class LiteralUtil {
   // Clones literal into an owned unique_ptr version.
   static std::unique_ptr<Literal> CloneToUnique(const Literal& literal);

+  // Returns the linear index of the given index within the literal's
+  // element_type repeated field.
+  static int64 LinearIndex(const Literal& literal,
+                           tensorflow::gtl::ArraySlice<int64> multi_index);
+
   // Gets or sets an element in the literal at the given index. The index is
   // CHECKed against the dimension sizes.
   template <typename NativeT>

@@ -427,11 +432,6 @@ class LiteralUtil {
                   "Cannot map native type to primitive type.");
   }

-  // Returns the linear index of the given index within the literal's
-  // element_type repeated field.
-  static int64 LinearIndex(const Literal& literal,
-                           tensorflow::gtl::ArraySlice<int64> multi_index);
-
   // Internal template helper for the Copy() API, matching its arguments one by
   // one.
   //
@@ -469,6 +469,26 @@ TEST_F(LiteralUtilTest, ReshapeR4) {
   EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
 }

+TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) {
+  // clang-format off
+  // F32[1x3x2x4]
+  auto original = LiteralUtil::CreateR4WithLayout<float>({{
+    {{10, 11, 12, 13}, {14, 15, 16, 17}},
+    {{18, 19, 20, 21}, {22, 23, 24, 25}},
+    {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0minor_);
+  // F32[1x3x4x2]
+  auto expected = LiteralUtil::CreateR3WithLayout<float>({
+    {{10, 11}, {12, 13}, {14, 15}, {16, 17}},
+    {{18, 19}, {20, 21}, {22, 23}, {24, 25}},
+    {{26, 27}, {28, 29}, {30, 31}, {32, 33}},
+  }, layout_r3_dim0major_);
+  // clang-format on
+  auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
+
+  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
+}
+
 TEST_F(LiteralUtilTest, TransposeR0) {
   auto original = LiteralUtil::CreateR0<float>(1.7f);
   auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{});

@@ -659,15 +679,15 @@ TEST_F(LiteralUtilTest, Copy) {
       primitive_util::NativeToPrimitiveType<uint32>(), dimensions, layout);
   auto blank = LiteralUtil::CreateFromShape(shape);
   auto source = LiteralUtil::CreateFromShape(shape);
-  const int64 sbase[] = {0, 0, 0, 0};
-  const int64 incr[] = {1, 1, 1, 1};
+  const int64 zero_base[] = {0, 0, 0, 0};
+  const int64 step[] = {1, 1, 1, 1};
   uint32 seqnr = 0;
   auto init_proc = [&](const std::vector<int64>& indexes) {
     LiteralUtil::Set(source.get(), indexes, ++seqnr);
     return true;
   };

-  ShapeUtil::ForEachIndex(source->shape(), sbase, dimensions, incr,
+  ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step,
                           init_proc);

   const int64 src_base[] = {3, 1, 5, 7};

@@ -691,7 +711,7 @@ TEST_F(LiteralUtilTest, Copy) {
                    bval == LiteralUtil::Get<uint32>(*source, source_indexes));
     return matched;
   };
-  ShapeUtil::ForEachIndex(source->shape(), sbase, copy_size, incr,
+  ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step,
                           check_proc);
   EXPECT_TRUE(matched);
 }

@@ -710,5 +730,43 @@ TEST_F(LiteralUtilTest, CopyScalars) {
   EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
 }

+TEST_F(LiteralUtilTest, Populate) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{16}, {0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = LiteralUtil::CreateFromShape(shape);
+    auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return LiteralUtil::LinearIndex(*literal, indexes) + 17;
+    };
+    TF_EXPECT_OK(LiteralUtil::Populate<uint32>(literal.get(), generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](const std::vector<int64>& indexes) {
+      auto value = LiteralUtil::Get<uint32>(*literal, indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
 }  // namespace
 }  // namespace xla
@@ -406,6 +406,27 @@ cc_library(
     ],
 )

+cc_library(
+    name = "compile_only_service",
+    srcs = ["compile_only_service.cc"],
+    hdrs = ["compile_only_service.h"],
+    deps = [
+        ":backend",
+        ":compiler",
+        ":computation_layout",
+        ":computation_tracker",
+        ":platform_util",
+        ":service",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
 cc_library(
     name = "cpu_plugin",
     deps = [
tensorflow/compiler/xla/service/compile_only_service.cc  (new file, 131 lines)
@@ -0,0 +1,131 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/compile_only_service.h"

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/computation_tracker.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
  ServiceOptions default_options;
  default_options.set_platform(platform);
  return NewService(default_options);
}

/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(const ServiceOptions& options) {
  perftools::gputools::Platform* platform = options.platform();
  if (platform == nullptr) {
    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
  }

  TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));

  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                      CreateComputeConstantBackend());
  std::unique_ptr<CompileOnlyService> service(
      new CompileOnlyService(compiler, std::move(compute_constant_backend)));
  return std::move(service);
}

CompileOnlyService::CompileOnlyService(
    Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
    : Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
      compiler_(compiler) {
  runs_in_client_process_ = true;
}

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileOnlyService::CompileAheadOfTime(
    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
    const AotCompilationOptions& options) {
  std::vector<std::unique_ptr<HloModule>> hlo_modules;
  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
  for (const AotComputationInstance& instance : computations) {
    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                        computation_tracker_.Resolve(instance.computation));
    VersionedComputationHandle versioned_handle =
        user_computation->GetVersionedHandle();

    // Dump computation proto state if flag is set.
    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
    const string& directory_path = flags->xla_dump_computations_to;
    if (!directory_path.empty()) {
      TF_ASSIGN_OR_RETURN(
          std::unique_ptr<SessionModule> session_module,
          computation_tracker_.SnapshotComputation(versioned_handle.handle));
      string filename = tensorflow::strings::StrCat(
          "computation_", versioned_handle.handle.handle(), "__",
          session_module->entry().name(), "__version_",
          versioned_handle.version);
      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
                                                     *session_module));
    }

    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
                        computation_tracker_.BuildHloModule(
                            versioned_handle,
                            /*include_unreachable_instructions=*/true));
    hlo_modules.push_back(std::move(hlo_module));

    TF_ASSIGN_OR_RETURN(
        std::shared_ptr<const ProgramShape> program_shape,
        user_computation->ComputeProgramShape(versioned_handle.version));

    module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
    HloModuleConfig* module_config = module_configs.back().get();
    auto* computation_layout =
        module_config->mutable_entry_computation_layout();
    if (flags->xla_hlo_profile) {
      module_config->enable_hlo_profiling(true);
    }
    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
      const Shape& argument_layout = *instance.argument_layouts[i];
      if (ShapeUtil::IsTuple(argument_layout)) {
        return Unimplemented("tuple arguments not supported yet");
      }
      TF_RETURN_IF_ERROR(
          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
              argument_layout));
    }
    TF_RETURN_IF_ERROR(
        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
            *instance.result_layout));
  }

  return compiler_->CompileAheadOfTime(std::move(hlo_modules),
                                       std::move(module_configs),
                                       MakeHloDumper(), options);
}

}  // namespace xla
tensorflow/compiler/xla/service/compile_only_service.h  (new file, 125 lines)
@@ -0,0 +1,125 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_

#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/service.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {

// An XLA Service specialization for ahead-of-time compilation. This only
// instantiates a Compiler object for the relevant platform; it does not
// instantiate or require an execution backend.
class CompileOnlyService : public Service {
 public:
  // Factory for creating a CompileOnlyService. The parameter platform is the
  // platform that the service should target. If platform is null then the
  // default platform is used.
  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
      perftools::gputools::Platform* platform);
  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
      const ServiceOptions& options);

  // A description of a computation to compile using CompileAheadOfTime.
  struct AotComputationInstance {
    ComputationHandle computation;
    std::vector<const Shape*> argument_layouts;
    const Shape* result_layout = nullptr;
  };

  // Compiles a list of computations for ahead-of-time execution. This is
  // intended for use in static compilation. See
  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
  CompileAheadOfTime(
      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
      const AotCompilationOptions& Options);

  // Override Service methods that require or imply the existence of an
  // execute backend. Note that this does not include TransferToClient and
  // TransferToClientInProcess, as computing constants produces global data
  // that we may wish to transfer.
  tensorflow::Status Execute(const ExecuteRequest* arg,
                             ExecuteResponse* result) override {
    return Unimplemented("CompileOnlyService does not support execution.");
  }
  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
                                     ExecuteParallelResponse* result) override {
    return Unimplemented("CompileOnlyService does not support execution.");
  }
  tensorflow::Status GetDeviceHandles(
      const GetDeviceHandlesRequest* arg,
      GetDeviceHandlesResponse* result) override {
    return Unimplemented("CompileOnlyService does not support devices.");
  }
  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
                                  ExecuteAsyncResponse* result) override {
    return Unimplemented("CompileOnlyService does not support execution.");
  }
  tensorflow::Status WaitForExecution(
      const WaitForExecutionRequest* arg,
      WaitForExecutionResponse* result) override {
    return Unimplemented("CompileOnlyService does not support execution.");
  }
  tensorflow::Status TransferToServer(
      const TransferToServerRequest* arg,
      TransferToServerResponse* result) override {
    return Unimplemented(
        "CompileOnlyService does not support device data transfers.");
  }
  tensorflow::Status TransferToInfeed(
      const TransferToInfeedRequest* arg,
      TransferToInfeedResponse* result) override {
    return Unimplemented(
        "CompileOnlyService does not support device data transfers.");
  }
  tensorflow::Status TransferFromOutfeed(
      const TransferFromOutfeedRequest* arg,
      TransferFromOutfeedResponse* result) override {
    return Unimplemented(
        "CompileOnlyService does not support device data transfers.");
  }
  tensorflow::Status TransferToServerInProcess(
      const TransferToServerInProcessRequest* arg,
      TransferToServerInProcessResponse* result) override {
    return Unimplemented(
        "CompileOnlyService does not support device data transfers.");
  }
  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
                                 ResetDeviceResponse* result) override {
    return Unimplemented("CompileOnlyService does not support devices.");
  }

 private:
  explicit CompileOnlyService(
      Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
  CompileOnlyService(const CompileOnlyService&) = delete;
  void operator=(const CompileOnlyService&) = delete;

  // The compiler for the target platform. This is included in place of
  // the Service::execute_backend_'s compiler, since execute_backend_ is a
  // nullptr in CompileOnlyService.
  Compiler* compiler_;
};

}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
@@ -188,41 +188,52 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   return pipeline.Run(hlo_module).status();
 }

-// Invokes the ptxas tool on the given PTX string, and dumps its output.
-void DumpPtxasInfo(const string& ptx) {
+// Invokes the ptxas tool on the given PTX string, and stores the resulting
+// SASS in *cubin. If -v 2 or greater, runs ptxas with -v and dumps the
+// resulting stderr (which contains register allocation info, etc.)
+// to VLOG(2). If ptxas binary is not found *sass is set to "".
+Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
+                  string* cubin) {
+  *cubin = "";
+
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");

-  // Do not log PTX stats if ptxas is not found at the given path.
-  if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
-    LOG(WARNING)
-        << "Failed to dump PTX stats because ptxas is not found at path \""
-        << ptxas_path << "\".";
-    return;
-  }
-  LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
+  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));

   // Write `ptx` into a temporary file.
   char tempdir_template[] = "/tmp/ptxXXXXXX";
   char* tempdir_name = mkdtemp(tempdir_template);
   CHECK_NOTNULL(tempdir_name);
   string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");

   TF_CHECK_OK(
       tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
   LOG(INFO) << "ptx file written to: " << ptx_path;

   // Invoke ptxas and collect its output.
-  tensorflow::SubProcess ptxas_info_dumper;
-  ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
-                                            "/dev/null", "-v", "-arch=sm_35"});
-  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
-                                     tensorflow::ACTION_PIPE);
-  CHECK(ptxas_info_dumper.Start());
-  string stderr_output;
-  int exit_status = ptxas_info_dumper.Communicate(
-      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
-  if (exit_status != 0) {
-    LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
+  tensorflow::SubProcess ptxas_info;
+  string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
+  string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
+
+  if (VLOG_IS_ON(2)) {
+    ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
+                                       "-arch", arch, ptx_path});
+  } else {
+    ptxas_info.SetProgram(
+        ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
   }
+  ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
+  CHECK(ptxas_info.Start());
+  string stderr_output;
+  int ptxas_exit_status = ptxas_info.Communicate(
+      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
+
+  TF_RET_CHECK(ptxas_exit_status == 0);
+  return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
+                                      cubin);
 }

 }  // namespace
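For reference, a small standalone sketch (hypothetical paths and compute capability; not part of the commit) of the ptxas command line that CompilePTX assembles when VLOG(2) is enabled:

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      // Hypothetical values: compute capability 5.2, paths as created by mkdtemp.
      int cc_major = 5, cc_minor = 2;
      std::string ptxas_path = "/usr/local/cuda/bin/ptxas";
      std::string tempdir = "/tmp/ptxXXXXXX";
      std::string arch = "sm_" + std::to_string(cc_major) + std::to_string(cc_minor);

      // Mirrors the argv built by CompilePTX when VLOG(2) is on.
      std::vector<std::string> argv = {ptxas_path, "-v", "-o", tempdir + "/cubin",
                                       "-arch", arch, tempdir + "/ptx"};
      for (const std::string& arg : argv) std::printf("%s ", arg.c_str());
      std::printf("\n");
      return 0;
    }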
@@ -298,10 +309,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(

   // Reserve space for the PTX to be generated for this module.
   string* ptx;
+  string* cubin;
   {
     tensorflow::mutex_lock lock(mutex_);
     generated_ptxes_.emplace_back(MakeUnique<string>());
     ptx = generated_ptxes_.back().get();
+
+    generated_cubins_.emplace_back(MakeUnique<string>());
+    cubin = generated_cubins_.back().get();
   }
   int cc_major, cc_minor;
   if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,

@@ -318,9 +333,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
   VLOG(2) << "PTX:";
   XLA_VLOG_LINES(2, *ptx);
-  if (VLOG_IS_ON(2)) {
-    DumpPtxasInfo(*ptx);
-  }

   auto thunk_schedule = MakeUnique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),

@@ -328,9 +340,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   VLOG(2) << "Printing the thunk schedule...";
   XLA_VLOG_LINES(2, thunk_schedule->ToString());

+  TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
+
   auto* gpu_executable =
-      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
+      new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
+                        std::move(thunk_schedule), std::move(hlo_module),
                         std::move(module_config), std::move(buffer_assignment));

   if (flags->xla_gpu_embed_ir) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@@ -71,6 +71,7 @@ class GpuCompiler : public Compiler {
   // StreamExecutor (b/24776264).
   tensorflow::mutex mutex_;
   std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
+  std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);

   // The size in bytes of a pointer. Used for computing ShapeSizeBytes.
   int64 pointer_size_;

@ -107,13 +107,17 @@ class HloExecutionProfiler {

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
tensorflow::StringPiece ptx,
std::pair<int, int> compute_capability,
std::unique_ptr<ThunkSchedule> thunk_schedule,
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModuleConfig> module_config,
std::unique_ptr<BufferAssignment> assignment)
: Executable(std::move(hlo_module), std::move(module_config)),
cubin_(cubin),
ptx_(ptx),
compute_capability_(compute_capability),
thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {}

@ -186,6 +190,13 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
// false.
TF_RET_CHECK(!module_config().has_hybrid_result());

// Ensure the compute capability of the cubin and the stream match.
std::pair<int, int> stream_compute_compatibility;
stream->parent()->GetDeviceDescription().cuda_compute_capability(
&stream_compute_compatibility.first,
&stream_compute_compatibility.second);
TF_RET_CHECK(stream_compute_compatibility == compute_capability_);

BufferAllocations::Builder buffer_allocations_builder;
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {
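
ExecuteOnStream now refuses to run a cubin on a device whose compute capability differs from the one it was built for. The same guard, sketched against the CUDA runtime API rather than StreamExecutor (assumes the CUDA toolkit is installed; error handling trimmed):

#include <cuda_runtime.h>
#include <cstdio>
#include <utility>

// Returns true iff device `device_ordinal` matches the compute capability
// the binary was compiled for, e.g. {6, 0} for an sm_60 cubin.
bool MatchesComputeCapability(int device_ordinal,
                              std::pair<int, int> built_for) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, device_ordinal) != cudaSuccess) {
    return false;
  }
  return std::make_pair(prop.major, prop.minor) == built_for;
}

int main() {
  if (!MatchesComputeCapability(0, {6, 0})) {
    std::fprintf(stderr, "cubin/device compute capability mismatch\n");
    return 1;
  }
  return 0;
}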

@ -40,15 +40,17 @@ limitations under the License.

namespace xla {
namespace gpu {

// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given CUDA kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.

// GPUExecutable should eventually be updated to associate a compute
// capability with the PTX and store multiple cubins, each with their own
// associated CC's, rather than including CC as a property of GpuExecutable.
class GpuExecutable : public Executable {
public:
GpuExecutable(tensorflow::StringPiece ptx,
GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
std::pair<int, int> compute_capability,
std::unique_ptr<ThunkSchedule> thunk_schedule,
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModuleConfig> module_config,
@ -62,7 +64,8 @@ class GpuExecutable : public Executable {
ir_module_string_ = ir_module_string;
}

// Returns the compiled PTX for the computation.
// Returns the compiled CUDA binary for the computation.
tensorflow::StringPiece cubin() const { return cubin_; }
tensorflow::StringPiece ptx() const { return ptx_; }

StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@ -104,8 +107,10 @@ class GpuExecutable : public Executable {
// This string should be modified only before ExecuteOnStream.
string ir_module_string_;

// The reference to the compiled PTX for the computation.
const tensorflow::StringPiece ptx_;
// The reference to the compiled PTX & CUDA binary for the computation.
tensorflow::StringPiece cubin_;
tensorflow::StringPiece ptx_;
std::pair<int, int> compute_capability_;

// The thunks to be invoked by this GpuExecutable. They are generated by the
// IrEmitter.

@ -41,13 +41,10 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
// Already initialized by another thread.
return tensorflow::Status::OK();
}

loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
tensorflow::StringPiece ptx = executable.ptx();
// Convert tensorflow::StringPiece to se::port::StringPiece because
// StreamExecutor uses the latter.
loader_spec_->AddCudaPtxInMemory(
se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);

tensorflow::StringPiece cubin = executable.cubin();
loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
return tensorflow::Status::OK();
}
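
KernelThunk now registers the precompiled cubin with the loader spec, so the kernel is loaded from the finished binary rather than JIT-compiled from PTX at initialization time. Outside StreamExecutor, the equivalent raw CUDA driver calls look roughly like this sketch; the kernel name and cubin image are assumptions, and the context is intentionally leaked for brevity:

#include <cuda.h>

// Loads an in-memory cubin image and resolves a kernel from it.
// `image` must point at a complete cubin; returns nullptr on failure.
CUfunction LoadKernelFromCubin(const void* image, const char* kernel_name) {
  if (cuInit(0) != CUDA_SUCCESS) return nullptr;
  CUdevice device;
  CUcontext context;
  CUmodule module;
  CUfunction function;
  if (cuDeviceGet(&device, 0) != CUDA_SUCCESS) return nullptr;
  if (cuCtxCreate(&context, 0, device) != CUDA_SUCCESS) return nullptr;
  // No JIT step here: the image is already device code for a specific sm_XY.
  if (cuModuleLoadData(&module, image) != CUDA_SUCCESS) return nullptr;
  if (cuModuleGetFunction(&function, module, kernel_name) != CUDA_SUCCESS) {
    return nullptr;
  }
  return function;
}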

@ -195,7 +195,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {

HloInstruction* root = computation->root_instruction();
EXPECT_THAT(root, op::Constant());
EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));

using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
bool matched = true;
@ -128,70 +128,6 @@ StatusOr<GlobalDataHandle> LocalService::AllocateBufferOnDevice(
|
||||
allocation_size));
|
||||
}
|
||||
|
||||
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
|
||||
LocalService::CompileAheadOfTime(
|
||||
const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
|
||||
computations,
|
||||
const AotCompilationOptions& options) {
|
||||
std::vector<std::unique_ptr<HloModule>> hlo_modules;
|
||||
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
|
||||
for (const AheadOfTimeComputationInstance& instance : computations) {
|
||||
TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
|
||||
computation_tracker_.Resolve(instance.computation));
|
||||
VersionedComputationHandle versioned_handle =
|
||||
user_computation->GetVersionedHandle();
|
||||
|
||||
// Dump computation proto state if flag is set.
|
||||
legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
|
||||
const string& directory_path = flags->xla_dump_computations_to;
|
||||
if (!directory_path.empty()) {
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
std::unique_ptr<SessionModule> session_module,
|
||||
computation_tracker_.SnapshotComputation(versioned_handle.handle));
|
||||
string filename = tensorflow::strings::StrCat(
|
||||
"computation_", versioned_handle.handle.handle(), "__",
|
||||
session_module->entry().name(), "__version_",
|
||||
versioned_handle.version);
|
||||
TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
|
||||
*session_module));
|
||||
}
|
||||
|
||||
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
|
||||
computation_tracker_.BuildHloModule(
|
||||
versioned_handle,
|
||||
/*include_unreachable_instructions=*/true));
|
||||
hlo_modules.push_back(std::move(hlo_module));
|
||||
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
std::shared_ptr<const ProgramShape> program_shape,
|
||||
user_computation->ComputeProgramShape(versioned_handle.version));
|
||||
|
||||
module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
|
||||
HloModuleConfig* module_config = module_configs.back().get();
|
||||
auto* computation_layout =
|
||||
module_config->mutable_entry_computation_layout();
|
||||
if (flags->xla_hlo_profile) {
|
||||
module_config->enable_hlo_profiling(true);
|
||||
}
|
||||
for (int i = 0; i < instance.argument_layouts.size(); ++i) {
|
||||
const Shape& argument_layout = *instance.argument_layouts[i];
|
||||
if (ShapeUtil::IsTuple(argument_layout)) {
|
||||
return Unimplemented("tuple arguments not supported yet");
|
||||
}
|
||||
TF_RETURN_IF_ERROR(
|
||||
computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
|
||||
argument_layout));
|
||||
}
|
||||
TF_RETURN_IF_ERROR(
|
||||
computation_layout->mutable_result_layout()->CopyLayoutFromShape(
|
||||
*instance.result_layout));
|
||||
}
|
||||
|
||||
return execute_backend_->compiler()->CompileAheadOfTime(
|
||||
std::move(hlo_modules), std::move(module_configs), MakeHloDumper(),
|
||||
options);
|
||||
}
|
||||
|
||||
StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
|
||||
const ComputationHandle& computation,
|
||||
const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
|
||||
|
@ -59,22 +59,6 @@ class LocalService : public Service {
|
||||
const Shape& shape, int device_ordinal,
|
||||
bool allocate_space_for_deep_copy);
|
||||
|
||||
// A description of a computation to compile using CompileAheadOfTime.
|
||||
struct AheadOfTimeComputationInstance {
|
||||
ComputationHandle computation;
|
||||
std::vector<const Shape*> argument_layouts;
|
||||
const Shape* result_layout = nullptr;
|
||||
};
|
||||
|
||||
// Compiles a list of computations for ahead-of-time execution. This is
|
||||
// intended for use in static compilation. See
|
||||
// |LocalClient::CompileAheadOfTime| for additional details.
|
||||
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
|
||||
CompileAheadOfTime(
|
||||
const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
|
||||
computations,
|
||||
const AotCompilationOptions& Options);
|
||||
|
||||
// Builds an Executable with the given argument layouts and options. If
|
||||
// result_layout is non-null, then the executable is compiled to produce a
|
||||
// result of the given layout.
|
||||
|
@ -180,20 +180,24 @@ Service::Service(std::unique_ptr<Backend> execute_backend,
|
||||
std::unique_ptr<Backend> compute_constant_backend)
|
||||
: execute_backend_(std::move(execute_backend)),
|
||||
compute_constant_backend_(std::move(compute_constant_backend)) {
|
||||
LOG(INFO) << Printf(
|
||||
"XLA service %p executing computations on platform %s. Devices:", this,
|
||||
execute_backend_->platform()->Name().c_str());
|
||||
for (int i = 0; i < execute_backend_->device_count(); ++i) {
|
||||
if (execute_backend_->device_ordinal_supported(i)) {
|
||||
se::StreamExecutor* executor =
|
||||
execute_backend_->stream_executor(i).ValueOrDie();
|
||||
const auto& description = executor->GetDeviceDescription();
|
||||
LOG(INFO) << Printf(" StreamExecutor device (%d): %s, %s", i,
|
||||
description.name().c_str(),
|
||||
description.platform_version().c_str());
|
||||
} else {
|
||||
LOG(INFO) << Printf(" StreamExecutor device (%d) not supported", i);
|
||||
if (execute_backend_) {
|
||||
LOG(INFO) << Printf(
|
||||
"XLA service %p executing computations on platform %s. Devices:", this,
|
||||
execute_backend_->platform()->Name().c_str());
|
||||
for (int i = 0; i < execute_backend_->device_count(); ++i) {
|
||||
if (execute_backend_->device_ordinal_supported(i)) {
|
||||
se::StreamExecutor* executor =
|
||||
execute_backend_->stream_executor(i).ValueOrDie();
|
||||
const auto& description = executor->GetDeviceDescription();
|
||||
LOG(INFO) << Printf(" StreamExecutor device (%d): %s, %s", i,
|
||||
description.name().c_str(),
|
||||
description.platform_version().c_str());
|
||||
} else {
|
||||
LOG(INFO) << Printf(" StreamExecutor device (%d) not supported", i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
VLOG(1) << "XLA compile-only service constructed";
|
||||
}
|
||||
}
|
||||
|
||||
@ -286,7 +290,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
|
||||
StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
|
||||
const ProgramShape& program_shape,
|
||||
tensorflow::gtl::ArraySlice<const Allocation*> arguments,
|
||||
const ExecutionOptions& execution_options) {
|
||||
const ExecutionOptions& execution_options, Backend* backend) {
|
||||
auto module_config = MakeUnique<HloModuleConfig>(program_shape);
|
||||
auto* computation_layout = module_config->mutable_entry_computation_layout();
|
||||
|
||||
@ -326,7 +330,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
|
||||
module_config->enable_hlo_profiling(true);
|
||||
}
|
||||
|
||||
module_config->set_replica_count(execute_backend_->Replicas().size());
|
||||
module_config->set_replica_count(backend->Replicas().size());
|
||||
module_config->set_fast_math_disabled(execution_options.disable_fast_math());
|
||||
module_config->set_seed(execution_options.seed());
|
||||
|
||||
@ -474,7 +478,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
|
||||
std::unique_ptr<Executable> executable_unique_ptr,
|
||||
BuildExecutable(versioned_handle, std::move(module_config),
|
||||
/*executable_for_compute_constant=*/false, arguments,
|
||||
execute_backend_.get(), executor));
|
||||
backend, executor));
|
||||
|
||||
if (profile != nullptr) {
|
||||
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
|
||||
@ -575,15 +579,14 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
|
||||
perftools::gputools::DeviceMemoryBase result;
|
||||
if (backend->Replicas().size() == 1) {
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
result,
|
||||
ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
|
||||
executable, &run_options[0], profile, execute_backend_.get(),
|
||||
[&arguments](Executable* executable,
|
||||
const ServiceExecutableRunOptions* run_options,
|
||||
HloExecutionProfile* hlo_execution_profile) {
|
||||
return executable->ExecuteOnStream(run_options, arguments,
|
||||
hlo_execution_profile);
|
||||
}));
|
||||
result, ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
|
||||
executable, &run_options[0], profile, backend,
|
||||
[&arguments](Executable* executable,
|
||||
const ServiceExecutableRunOptions* run_options,
|
||||
HloExecutionProfile* hlo_execution_profile) {
|
||||
return executable->ExecuteOnStream(run_options, arguments,
|
||||
hlo_execution_profile);
|
||||
}));
|
||||
} else {
|
||||
std::vector<
|
||||
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
|
||||
@ -666,7 +669,8 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
|
||||
// the program and the argument allocations.
|
||||
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
|
||||
CreateModuleConfig(*program_shape, arg_allocations,
|
||||
request.execution_options()));
|
||||
request.execution_options(),
|
||||
execute_backend_.get()));
|
||||
VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
|
||||
<< module_config->entry_computation_layout().ToString();
|
||||
|
||||
@ -751,9 +755,10 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
|
||||
ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
|
||||
execute_backend_->default_device_ordinal()));
|
||||
|
||||
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
|
||||
CreateModuleConfig(*program_shape, arg_allocations,
|
||||
arg->execution_options()));
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
std::unique_ptr<HloModuleConfig> module_config,
|
||||
CreateModuleConfig(*program_shape, arg_allocations,
|
||||
arg->execution_options(), execute_backend_.get()));
|
||||
|
||||
VLOG(3) << "Execute created HloModuleConfig computation layout: "
|
||||
<< module_config->entry_computation_layout().ToString();
|
||||
@ -818,9 +823,10 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
|
||||
ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
|
||||
execute_backend_->default_device_ordinal()));
|
||||
|
||||
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
|
||||
CreateModuleConfig(*program_shape, arg_allocations,
|
||||
arg->execution_options()));
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
std::unique_ptr<HloModuleConfig> module_config,
|
||||
CreateModuleConfig(*program_shape, arg_allocations,
|
||||
arg->execution_options(), execute_backend_.get()));
|
||||
|
||||
VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
|
||||
<< module_config->entry_computation_layout().ToString();
|
||||
@ -1141,7 +1147,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
|
||||
}
|
||||
|
||||
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
|
||||
CreateModuleConfig(program_shape, {}, execution_options));
|
||||
CreateModuleConfig(program_shape, {}, execution_options,
|
||||
compute_constant_backend_.get()));
|
||||
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
std::shared_ptr<Executable> executable,
|
||||
|
@ -265,11 +265,11 @@ class Service : public ServiceInterface {
|
||||
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
|
||||
const Backend* backend, int device_ordinal);
|
||||
|
||||
// Create a Hlo module config foe the given program shape and arguments.
|
||||
// Create a Hlo module config for the given program shape and arguments.
|
||||
StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
|
||||
const ProgramShape& program_shape,
|
||||
tensorflow::gtl::ArraySlice<const Allocation*> arguments,
|
||||
const ExecutionOptions& execution_options);
|
||||
const ExecutionOptions& execution_options, Backend* backend);
|
||||
|
||||
// Builds an Executable for the given parameters. If
|
||||
// executable_for_compute_constant is true, then the executable is intended to
|
||||
|

@ -728,9 +728,17 @@ Status ForEachMutableSubshapeHelper(
new_shape.add_dimensions(dim);
}
if (shape.has_layout()) {
new_shape.mutable_layout()->clear_minor_to_major();
Layout* new_layout = new_shape.mutable_layout();
new_layout->clear_minor_to_major();
for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
new_shape.mutable_layout()->add_minor_to_major(index);
new_layout->add_minor_to_major(index);
}
if (shape.layout().padded_dimensions_size() > 0) {
new_layout->clear_padded_dimensions();
for (auto dim :
Permute(permutation, shape.layout().padded_dimensions())) {
new_layout->add_padded_dimensions(dim);
}
}
}
return new_shape;
@ -1057,7 +1065,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
DCHECK_EQ(count.size(), base.size());
const Layout& layout = shape.layout();
int64 rank = layout.minor_to_major_size();
int64 n = 0;
// Allows handling R0 arrays, such that the visitor function will be called
// once with the proper empty indexes.
int64 n = -1;
std::vector<int64> indexes(base.begin(), base.end());
while (n < rank && visitor_function(indexes)) {
// Increments dimensions in minor to major order.
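
Starting n at -1 makes the index walk call the visitor once even for a rank-0 array, since the loop condition is then true exactly once before the increment step runs. A standalone sketch of the same loop shape, using plain extents and a minor-to-major order instead of the real Layout proto:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// Visits every index of an array with the given per-dimension extents,
// advancing in minor-to-major order. A rank-0 array (empty extents) still
// gets exactly one visit with an empty index vector.
void ForEachIndex(const std::vector<int64_t>& extents,
                  const std::vector<int64_t>& minor_to_major,
                  const std::function<bool(const std::vector<int64_t>&)>& visit) {
  const int64_t rank = minor_to_major.size();
  std::vector<int64_t> index(rank, 0);
  int64_t n = -1;  // -1 so the body runs once even when rank == 0
  while (n < rank && visit(index)) {
    // Increment the most-minor dimension first, carrying into major ones.
    for (n = 0; n < rank; ++n) {
      int64_t dim = minor_to_major[n];
      if (++index[dim] < extents[dim]) break;
      index[dim] = 0;
    }
  }
}

int main() {
  // Rank-0 case: visited exactly once with an empty index.
  ForEachIndex({}, {}, [](const std::vector<int64_t>&) {
    std::puts("visited scalar");
    return true;
  });
  return 0;
}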

@ -42,7 +42,7 @@ xla::Computation Doubler(xla::Client* client) {
int main(int argc, char** argv) {
tensorflow::port::InitMain(argv[0], &argc, &argv);

auto client = xla::ClientLibrary::LocalClientOrDie();
auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie();

xla::ComputationBuilder builder(client, "aot_test_helper");
auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
@ -74,7 +74,7 @@ int main(int argc, char** argv) {
llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string));

xla::Computation computation = builder.Build().ConsumeValueOrDie();
xla::LocalClient::AheadOfTimeComputationInstance instance{
xla::CompileOnlyClient::AotComputationInstance instance{
&computation, /*argument_layouts=*/{&opaque_shape}, &r0f32};

xla::cpu::CpuAotCompilationOptions options(

@ -153,16 +153,26 @@ string Reindent(tensorflow::StringPiece original,
});
}

bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank) {
if (rank != permutation.size()) {
return false;
}
std::vector<int64> output(permutation.size(), -1);
for (auto index : permutation) {
CHECK_GE(index, 0);
CHECK_LT(index, rank);
output[index] = 0;
}
return std::find(output.begin(), output.end(), -1) == output.end();
}

std::vector<int64> InversePermutation(
tensorflow::gtl::ArraySlice<int64> input_permutation) {
DCHECK(IsPermutation(input_permutation, input_permutation.size()));
std::vector<int64> output_permutation(input_permutation.size(), -1);
for (size_t i = 0; i < input_permutation.size(); ++i) {
output_permutation[input_permutation[i]] = i;
}
DCHECK_EQ(
0, std::count(output_permutation.begin(), output_permutation.end(), -1));
DCHECK(std::is_permutation(input_permutation.begin(), input_permutation.end(),
output_permutation.begin()));
return output_permutation;
}

@ -177,6 +177,9 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
string Reindent(tensorflow::StringPiece original,
tensorflow::StringPiece indentation);

// Checks whether permutation is a permutation of the [0, rank) integer range.
bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);

// Applies `permutation` on `input` and returns the permuted array.
// For each i, output[permutation[i]] = input[i].
//
@ -187,12 +190,11 @@ template <template <typename...> class C, typename T>
std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
C<T> input_) {
tensorflow::gtl::ArraySlice<T> input(input_);
CHECK_EQ(permutation.size(), input.size());
CHECK(IsPermutation(permutation, input.size()));
std::vector<T> output(input.size());
for (size_t i = 0; i < permutation.size(); ++i) {
output[permutation[i]] = input[i];
}
DCHECK(std::is_permutation(input.begin(), input.end(), output.begin()));
return output;
}
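
A short usage sketch of the contract documented above, that output[permutation[i]] = input[i] and that the inverse maps each target position back to its source, written against plain std::vector rather than ArraySlice:

#include <cassert>
#include <cstdint>
#include <vector>

// output[permutation[i]] = input[i], matching the Permute contract above.
std::vector<int> Apply(const std::vector<int64_t>& permutation,
                       const std::vector<int>& input) {
  std::vector<int> output(input.size());
  for (size_t i = 0; i < permutation.size(); ++i) {
    output[permutation[i]] = input[i];
  }
  return output;
}

// inverse[permutation[i]] = i, so Apply(inverse, Apply(permutation, v)) == v.
std::vector<int64_t> Inverse(const std::vector<int64_t>& permutation) {
  std::vector<int64_t> inverse(permutation.size());
  for (size_t i = 0; i < permutation.size(); ++i) {
    inverse[permutation[i]] = i;
  }
  return inverse;
}

int main() {
  std::vector<int64_t> perm = {2, 0, 1};
  std::vector<int> v = {10, 20, 30};
  assert((Apply(perm, v) == std::vector<int>{20, 30, 10}));
  assert((Apply(Inverse(perm), Apply(perm, v)) == v));
  return 0;
}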
@ -121,6 +121,7 @@ set(tf_proto_text_srcs
|
||||
"tensorflow/core/protobuf/cluster.proto"
|
||||
"tensorflow/core/protobuf/config.proto"
|
||||
"tensorflow/core/protobuf/debug.proto"
|
||||
"tensorflow/core/protobuf/device_properties.proto"
|
||||
"tensorflow/core/protobuf/rewriter_config.proto"
|
||||
"tensorflow/core/protobuf/tensor_bundle.proto"
|
||||
"tensorflow/core/protobuf/saver.proto"
|
||||
|
@ -82,7 +82,7 @@ tf_custom_op_py_library(
|
||||
|
||||
cuda_py_test(
|
||||
name = "cudnn_rnn_ops_test",
|
||||
size = "small",
|
||||
size = "medium",
|
||||
srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
|
||||
additional_deps = [
|
||||
":cudnn_rnn_py",
|
||||
|
@ -40,6 +40,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/lib/strings/stringprintf.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/util/env_var.h"
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#include "tensorflow/core/platform/stream_executor.h"
|
||||
@ -67,7 +68,7 @@ limitations under the License.
|
||||
* TensorFlow is responsible for making sure the memory is alive long enough
|
||||
* and recycles afterwards.
|
||||
*
|
||||
*/
|
||||
*/
|
||||
namespace tensorflow {
|
||||
|
||||
using CPUDevice = Eigen::ThreadPoolDevice;
|
||||
@ -106,6 +107,7 @@ using perftools::gputools::DeviceMemory;
|
||||
using perftools::gputools::DeviceMemoryBase;
|
||||
using perftools::gputools::ScratchAllocator;
|
||||
using perftools::gputools::port::StatusOr;
|
||||
using strings::Printf;
|
||||
|
||||
Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
|
||||
if (str == "rnn_relu") {
|
||||
@ -203,9 +205,10 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
|
||||
}
|
||||
|
||||
inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
|
||||
return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
|
||||
static_cast<int>(s.code())),
|
||||
s.error_message());
|
||||
return s.ok() ? Status::OK()
|
||||
: Status(static_cast<tensorflow::error::Code>(
|
||||
static_cast<int>(s.code())),
|
||||
s.error_message());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -244,8 +247,7 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
|
||||
// allocator.
|
||||
allocated_tensors_.push_back(temporary_memory);
|
||||
total_byte_size_ += byte_size;
|
||||
return perftools::gputools::port::StatusOr<
|
||||
perftools::gputools::DeviceMemory<uint8>>(
|
||||
return StatusOr<DeviceMemory<uint8>>(
|
||||
AsDeviceMemory<uint8>(&temporary_memory));
|
||||
}
|
||||
int64 TotalByteSize() { return total_byte_size_; }
|
||||
@ -296,6 +298,43 @@ class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
|
||||
int output_index_;
|
||||
};
|
||||
|
||||
// A helper to allocate persistent memory for Cudnn RNN models, which is
|
||||
// expected to live between kernel invocations.
|
||||
// This class is not thread-safe.
|
||||
class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
|
||||
public:
|
||||
CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
|
||||
: context_(context) {}
|
||||
|
||||
virtual ~CudnnRNNPersistentSpaceAllocator() {}
|
||||
|
||||
int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
|
||||
return std::numeric_limits<int64>::max();
|
||||
}
|
||||
|
||||
StatusOr<DeviceMemory<uint8>> AllocateBytes(
|
||||
perftools::gputools::Stream* stream, int64 byte_size) override {
|
||||
if (total_byte_size_ != 0) {
|
||||
return Status(error::FAILED_PRECONDITION,
|
||||
"Persistent space allocator can only be called once");
|
||||
}
|
||||
|
||||
Status allocation_status = context_->allocate_persistent(
|
||||
DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
|
||||
if (!allocation_status.ok()) {
|
||||
return ToExecutorStatus(allocation_status);
|
||||
}
|
||||
total_byte_size_ += byte_size;
|
||||
return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
|
||||
}
|
||||
int64 TotalByteSize() { return total_byte_size_; }
|
||||
|
||||
private:
|
||||
int64 total_byte_size_ = 0;
|
||||
PersistentTensor handle_;
|
||||
OpKernelContext* context_; // not owned
|
||||
};
|
||||
|
||||
struct CudnnModelTypes {
|
||||
RnnMode rnn_mode;
|
||||
TFRNNInputMode rnn_input_mode;
|
||||
@ -317,6 +356,16 @@ struct CudnnModelShapes {
|
||||
TensorShape input_shape;
|
||||
TensorShape output_shape;
|
||||
TensorShape hidden_state_shape;
|
||||
// At present only fields related to cached RnnDescriptor are concerned.
|
||||
bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
|
||||
return num_layers == rhs.num_layers && input_size == rhs.input_size &&
|
||||
num_units == rhs.num_units && dir_count == rhs.dir_count;
|
||||
}
|
||||
string RnnDescDebugString() {
|
||||
return strings::Printf(
|
||||
"[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
|
||||
num_layers, input_size, num_units, dir_count);
|
||||
}
|
||||
};
|
||||
|
||||
// Extract and checks the forward input tensors, parameters, and shapes from the
|
||||
@ -399,11 +448,23 @@ void RestoreParams(const OpInputList params_input,
|
||||
|
||||
} // namespace
|
||||
|
||||
// Note: all following kernels depend on a RnnDescriptor instance, which
|
||||
// according to Cudnn official doc should be kept around and reused across all
|
||||
// Cudnn kernels in the same model.
|
||||
// In Tensorflow, we don't pass the reference across different OpKernels,
|
||||
// rather, recreate it separately in each OpKernel, which does no cause issue:
|
||||
// CudnnDropoutDescriptor keeps a reference to a memory for
|
||||
// random number generator state. During recreation, this state is lost.
|
||||
// However, only forward-pass Cudnn APIs make use of the state.
|
||||
|
||||
// A common base class for RNN kernels. It extracts common attributes and
|
||||
// shape validations.
|
||||
class CudnnRNNKernelCommon : public OpKernel {
|
||||
protected:
|
||||
CudnnRNNKernelCommon(OpKernelConstruction* context) : OpKernel(context) {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
|
||||
OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
|
||||
OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
|
||||
string str;
|
||||
OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
|
||||
OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
|
||||
@ -413,6 +474,10 @@ class CudnnRNNKernelCommon : public OpKernel {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
|
||||
OP_REQUIRES_OK(
|
||||
context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
|
||||
// Reset CudnnRnnDescriptor and related random number generate states in
|
||||
// every Compute() call.
|
||||
OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
|
||||
false, &reset_rnd_gen_state_));
|
||||
}
|
||||
|
||||
bool HasInputC() const { return model_types_.HasInputC(); }
|
||||
@ -422,6 +487,9 @@ class CudnnRNNKernelCommon : public OpKernel {
|
||||
return model_types_.rnn_direction_mode;
|
||||
}
|
||||
CudnnModelTypes model_types() const { return model_types_; }
|
||||
float dropout() const { return dropout_; }
|
||||
uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
|
||||
bool ResetRndGenState() { return reset_rnd_gen_state_; }
|
||||
|
||||
template <typename T>
|
||||
Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
|
||||
@ -448,11 +516,14 @@ class CudnnRNNKernelCommon : public OpKernel {
|
||||
RnnInputMode input_mode;
|
||||
TF_RETURN_IF_ERROR(
|
||||
ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
|
||||
|
||||
auto* stream = context->op_device_context()->stream();
|
||||
// ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
|
||||
// random number generator, therefore set state_allocator to nullptr.
|
||||
auto rnn_desc_s = stream->parent()->createRnnDescriptor(
|
||||
num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
|
||||
rnn_mode(), ToDataType<T>::value, 0.f /*dropout*/, 0 /*seed*/,
|
||||
nullptr /*state_allocator*/);
|
||||
rnn_mode(), ToDataType<T>::value, dropout(), seed(),
|
||||
nullptr /* state_allocator */);
|
||||
if (!rnn_desc_s.ok()) {
|
||||
return FromExecutorStatus(rnn_desc_s);
|
||||
}
|
||||
@ -461,6 +532,11 @@ class CudnnRNNKernelCommon : public OpKernel {
|
||||
}
|
||||
|
||||
private:
|
||||
int seed_;
|
||||
int seed2_;
|
||||
float dropout_;
|
||||
bool reset_rnd_gen_state_;
|
||||
|
||||
CudnnModelTypes model_types_;
|
||||
};
|
||||
|
||||
@ -560,9 +636,8 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
context->set_output(i, input.Slice(start, end));
|
||||
} else {
|
||||
Tensor* output = nullptr;
|
||||
OP_REQUIRES_OK(
|
||||
context,
|
||||
context->allocate_output(i, TensorShape({width, height}), &output));
|
||||
OP_REQUIRES_OK(context, context->allocate_output(
|
||||
i, TensorShape({width, height}), &output));
|
||||
DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
|
||||
input_ptr, rnn_desc->ParamsWeightRegions()[i].offset,
|
||||
size_in_bytes);
|
||||
@ -571,14 +646,17 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
}
|
||||
}
|
||||
|
||||
CHECK(num_params_ == rnn_desc->ParamsBiasRegions().size())
|
||||
<< "Number of params mismatch. Expected " << num_params_ << ", got "
|
||||
<< rnn_desc->ParamsBiasRegions().size();
|
||||
OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
|
||||
errors::InvalidArgument("Number of params mismatch. Expected ",
|
||||
num_params_, ", got ",
|
||||
rnn_desc->ParamsBiasRegions().size()));
|
||||
for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
|
||||
int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
|
||||
int64 size = size_in_bytes / sizeof(T);
|
||||
CHECK(size == num_units) << "Params size mismatch. Expected " << num_units
|
||||
<< ", got " << size;
|
||||
OP_REQUIRES(context, size == num_units,
|
||||
errors::InvalidArgument("Params size mismatch. Expected ",
|
||||
num_units, ", got ", size));
|
||||
|
||||
// If data is aligned, use slice view to avoid expensive memcpy.
|
||||
bool start_aligned =
|
||||
rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0;
|
||||
@ -698,16 +776,32 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
OP_REQUIRES_OK(context,
|
||||
ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
|
||||
model_shapes.input_size, &input_mode));
|
||||
// TODO(zhengxq): add dropout support.
|
||||
// TODO(zhengxq): cache the descriptor so we don't have to create them all
|
||||
// the time.
|
||||
auto data_type = ToDataType<T>::value;
|
||||
auto rnn_desc_s = executor->createRnnDescriptor(
|
||||
model_shapes.num_layers, model_shapes.num_units,
|
||||
model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
|
||||
data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
|
||||
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
|
||||
auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
|
||||
{
|
||||
mutex_lock l(mu_);
|
||||
if (model_shapes_ == nullptr) {
|
||||
model_shapes_.reset(new CudnnModelShapes(model_shapes));
|
||||
} else {
|
||||
OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
|
||||
errors::InvalidArgument(
|
||||
"Incompatible rnn model shapes inferred: expecting ",
|
||||
model_shapes_->RnnDescDebugString(), ", getting ",
|
||||
model_shapes.RnnDescDebugString(), "."));
|
||||
}
|
||||
if (rnn_desc_ == nullptr || ResetRndGenState()) {
|
||||
dropout_state_allocator_.reset(
|
||||
new CudnnRNNPersistentSpaceAllocator(context));
|
||||
auto rnn_desc_s = executor->createRnnDescriptor(
|
||||
model_shapes_->num_layers, model_shapes_->num_units,
|
||||
model_shapes_->input_size, input_mode, rnn_direction_mode(),
|
||||
rnn_mode(), data_type, dropout(), seed(),
|
||||
dropout_state_allocator_.get());
|
||||
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
|
||||
rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
|
||||
}
|
||||
}
|
||||
|
||||
auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
|
||||
input_shape.dim_size(0), input_shape.dim_size(1),
|
||||
@ -753,21 +847,30 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
// Creates a memory callback for the workspace. The memory lives to the end
|
||||
// of this kernel calls.
|
||||
CudnnRNNWorkspaceAllocator workspace_allocator(context);
|
||||
bool launch_status =
|
||||
stream
|
||||
->ThenRnnForward(
|
||||
*rnn_desc, *input_desc, input_data, *hidden_state_desc,
|
||||
input_h_data, *hidden_state_desc, input_c_data, params_data,
|
||||
*output_desc, &output_data, *hidden_state_desc, &output_h_data,
|
||||
*hidden_state_desc, &output_c_data, is_training_,
|
||||
&reserve_space_allocator, &workspace_allocator)
|
||||
.ok();
|
||||
bool launch_status = false;
|
||||
{
|
||||
mutex_lock l(mu_);
|
||||
launch_status =
|
||||
stream
|
||||
->ThenRnnForward(
|
||||
*rnn_desc_, *input_desc, input_data, *hidden_state_desc,
|
||||
input_h_data, *hidden_state_desc, input_c_data, params_data,
|
||||
*output_desc, &output_data, *hidden_state_desc,
|
||||
&output_h_data, *hidden_state_desc, &output_c_data,
|
||||
is_training_, &reserve_space_allocator, &workspace_allocator)
|
||||
.ok();
|
||||
}
|
||||
OP_REQUIRES(context, launch_status,
|
||||
errors::Internal("Failed to call ThenRnnForward"));
|
||||
}
|
||||
|
||||
private:
|
||||
mutex mu_;
|
||||
bool is_training_;
|
||||
std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
|
||||
std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
|
||||
std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
|
||||
GUARDED_BY(mu_);
|
||||
};
|
||||
|
||||
REGISTER_KERNEL_BUILDER(
|
||||
@ -808,9 +911,9 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
const Tensor* output_h = nullptr;
|
||||
OP_REQUIRES_OK(context, context->input("output_h", &output_h));
|
||||
OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
|
||||
errors::InvalidArgument("Invalid output_h shape: ",
|
||||
output_h->shape().DebugString(), " ",
|
||||
hidden_state_shape.DebugString()));
|
||||
errors::InvalidArgument(
|
||||
"Invalid output_h shape: ", output_h->shape().DebugString(),
|
||||
" ", hidden_state_shape.DebugString()));
|
||||
const Tensor* output_c = nullptr;
|
||||
if (HasInputC()) {
|
||||
// Only LSTM uses input_c and output_c. So for all other models, we only
|
||||
@ -881,15 +984,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
OP_REQUIRES_OK(context,
|
||||
ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
|
||||
model_shapes.input_size, &input_mode));
|
||||
// TODO(zhengxq): add dropout support.
|
||||
// TODO(zhengxq): cache the descriptor so we don't have to create them all
|
||||
// the time.
|
||||
auto rnn_desc_s = executor->createRnnDescriptor(
|
||||
model_shapes.num_layers, model_shapes.num_units,
|
||||
model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
|
||||
data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
|
||||
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
|
||||
auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
|
||||
{
|
||||
mutex_lock l(mu_);
|
||||
if (model_shapes_ == nullptr) {
|
||||
model_shapes_.reset(new CudnnModelShapes(model_shapes));
|
||||
} else {
|
||||
OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
|
||||
errors::InvalidArgument(
|
||||
"Incompatible rnn model shapes inferred: expecting ",
|
||||
model_shapes_->RnnDescDebugString(), ", getting ",
|
||||
model_shapes.RnnDescDebugString(), "."));
|
||||
}
|
||||
|
||||
if (rnn_desc_ == nullptr || ResetRndGenState()) {
|
||||
dropout_state_allocator_.reset(
|
||||
new CudnnRNNPersistentSpaceAllocator(context));
|
||||
auto rnn_desc_s = executor->createRnnDescriptor(
|
||||
model_shapes.num_layers, model_shapes.num_units,
|
||||
model_shapes.input_size, input_mode, rnn_direction_mode(),
|
||||
rnn_mode(), data_type, dropout(), seed(),
|
||||
dropout_state_allocator_.get());
|
||||
OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
|
||||
rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
|
||||
}
|
||||
}
|
||||
|
||||
auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
|
||||
input_shape.dim_size(0), input_shape.dim_size(1),
|
||||
@ -939,21 +1059,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
|
||||
// Creates a memory callback for the workspace. The memory lives to the end
|
||||
// of this kernel calls.
|
||||
CudnnRNNWorkspaceAllocator workspace_allocator(context);
|
||||
bool launch_status =
|
||||
stream
|
||||
->ThenRnnBackward(
|
||||
*rnn_desc, *input_desc, input_data, *hidden_state_desc,
|
||||
input_h_data, *hidden_state_desc, input_c_data, params_data,
|
||||
*output_desc, output_data, *hidden_state_desc, output_h_data,
|
||||
*hidden_state_desc, output_c_data, output_backprop_data,
|
||||
output_h_backprop_data, output_c_backprop_data,
|
||||
&input_backprop_data, &input_h_backprop_data,
|
||||
&input_c_backprop_data, ¶ms_backprop_data,
|
||||
&reserve_space_uint8, &workspace_allocator)
|
||||
.ok();
|
||||
bool launch_status = false;
|
||||
{
|
||||
mutex_lock l(mu_);
|
||||
launch_status =
|
||||
stream
|
||||
->ThenRnnBackward(
|
||||
*rnn_desc_, *input_desc, input_data, *hidden_state_desc,
|
||||
input_h_data, *hidden_state_desc, input_c_data, params_data,
|
||||
*output_desc, output_data, *hidden_state_desc, output_h_data,
|
||||
*hidden_state_desc, output_c_data, output_backprop_data,
|
||||
output_h_backprop_data, output_c_backprop_data,
|
||||
&input_backprop_data, &input_h_backprop_data,
|
||||
&input_c_backprop_data, ¶ms_backprop_data,
|
||||
&reserve_space_uint8, &workspace_allocator)
|
||||
.ok();
|
||||
}
|
||||
OP_REQUIRES(context, launch_status,
|
||||
errors::Internal("Failed to call ThenRnnBackward"));
|
||||
}
|
||||
|
||||
private:
|
||||
mutex mu_;
|
||||
std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
|
||||
std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
|
||||
std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
|
||||
GUARDED_BY(mu_);
|
||||
};
|
||||
|
||||
REGISTER_KERNEL_BUILDER(
|
||||
|
@ -35,6 +35,9 @@ input_mode: Indicate whether there is a linear projection between the input and
|
||||
input_size == num_units; otherwise, it implies 'linear_input'.
|
||||
direction: Indicates whether a bidirectional model will be used.
|
||||
dir = (direction == bidirectional) ? 2 : 1
|
||||
dropout: dropout probability. When set to 0., dropout is disabled.
|
||||
seed: the 1st part of a seed to initialize dropout.
|
||||
seed2: the 2nd part of a seed to initialize dropout.
|
||||
)doc";
|
||||
|
||||
constexpr auto kCudnnRNNParamsBuffer = R"doc(
|
||||
@ -77,6 +80,9 @@ REGISTER_OP("CudnnRNNParamsSize")
|
||||
.Attr(kRNNModeAttrs)
|
||||
.Attr(kRNNInputModeAttrs)
|
||||
.Attr(kRNNDirectionAttrs)
|
||||
.Attr("dropout: float = 0.0")
|
||||
.Attr("seed: int = 0")
|
||||
.Attr("seed2: int = 0")
|
||||
.Output("params_size: S")
|
||||
.SetShapeFn([](InferenceContext* c) {
|
||||
c->set_output(0, c->Vector(1));
|
||||
@ -119,6 +125,7 @@ REGISTER_OP("CudnnRNN")
|
||||
.Input("input_h: T")
|
||||
.Input("input_c: T")
|
||||
.Input("params: T")
|
||||
.SetIsStateful()
|
||||
.Output("output: T")
|
||||
.Output("output_h: T")
|
||||
.Output("output_c: T")
|
||||
@ -127,7 +134,7 @@ REGISTER_OP("CudnnRNN")
|
||||
.Attr(kRNNModeAttrs)
|
||||
.Attr(kRNNInputModeAttrs)
|
||||
.Attr(kRNNDirectionAttrs)
|
||||
.Attr("dropout: float")
|
||||
.Attr("dropout: float = 0.0")
|
||||
.Attr("seed: int = 0")
|
||||
.Attr("seed2: int = 0")
|
||||
.Attr("is_training: bool = true")
|
||||
@ -158,7 +165,8 @@ REGISTER_OP("CudnnRNN")
|
||||
Computes the RNN from the input and initial states, with respect to the params
|
||||
buffer.
|
||||
)doc",
|
||||
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
|
||||
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
|
||||
R"doc(
|
||||
is_training: Indicates whether this operation is used for inferenece or
|
||||
training.
|
||||
reserve_space: an opaque tensor that can be used in backprop calculation. It
|
||||
@ -185,6 +193,9 @@ REGISTER_OP("CudnnRNNBackprop")
|
||||
.Attr(kRNNModeAttrs)
|
||||
.Attr(kRNNInputModeAttrs)
|
||||
.Attr(kRNNDirectionAttrs)
|
||||
.Attr("dropout: float = 0.0")
|
||||
.Attr("seed: int = 0")
|
||||
.Attr("seed2: int = 0")
|
||||
.SetShapeFn([](InferenceContext* c) {
|
||||
auto input_shape = c->input(0);
|
||||
auto input_h_shape = c->input(1);
|
||||
@ -199,7 +210,8 @@ REGISTER_OP("CudnnRNNBackprop")
|
||||
.Doc(strings::StrCat(R"doc(
|
||||
Compute the backprop of both data and weights in a RNN.
|
||||
)doc",
|
||||
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
|
||||
kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
|
||||
R"doc(
|
||||
output_backprop: A 3-D tensor with the same shape as output in the forward pass.
|
||||
output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
|
||||
pass.
|
||||
@ -228,6 +240,9 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
|
||||
.Attr(kRNNModeAttrs)
|
||||
.Attr(kRNNInputModeAttrs)
|
||||
.Attr(kRNNDirectionAttrs)
|
||||
.Attr("dropout: float = 0.0")
|
||||
.Attr("seed: int = 0")
|
||||
.Attr("seed2: int = 0")
|
||||
.SetShapeFn([](InferenceContext* c) {
|
||||
ShapeHandle unused;
|
||||
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused));
|
||||
@ -268,6 +283,9 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
|
||||
.Attr(kRNNModeAttrs)
|
||||
.Attr(kRNNInputModeAttrs)
|
||||
.Attr(kRNNDirectionAttrs)
|
||||
.Attr("dropout: float = 0.0")
|
||||
.Attr("seed: int = 0")
|
||||
.Attr("seed2: int = 0")
|
||||
.SetShapeFn([](InferenceContext* c) {
|
||||
c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
|
||||
return Status::OK();
|
||||
@ -281,7 +299,6 @@ upcoming training or inferences.
|
||||
num_params: number of parameter sets for all layers.
|
||||
Each layer may contain multiple parameter sets, with each set consisting of
|
||||
a weight matrix and a bias vector.
|
||||
)doc",
|
||||
kCudnnRNNCommonAttrs));
|
||||
)doc", kCudnnRNNCommonAttrs));
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -38,15 +38,24 @@ from tensorflow.python.training import saver as saver_lib
|
||||
|
||||
class CudnnRNNTest(TensorFlowTestCase):
|
||||
|
||||
def _CreateModel(self, rnn_mode, num_layers, num_units, input_size):
|
||||
def _CreateModel(self,
|
||||
rnn_mode,
|
||||
num_layers,
|
||||
num_units,
|
||||
input_size,
|
||||
dropout=0.):
|
||||
if rnn_mode == "lstm":
|
||||
model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size)
|
||||
model = cudnn_rnn_ops.CudnnLSTM(
|
||||
num_layers, num_units, input_size, dropout=dropout)
|
||||
elif rnn_mode == "gru":
|
||||
model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size)
|
||||
model = cudnn_rnn_ops.CudnnGRU(
|
||||
num_layers, num_units, input_size, dropout=dropout)
|
||||
elif rnn_mode == "rnn_tanh":
|
||||
model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size)
|
||||
model = cudnn_rnn_ops.CudnnRNNTanh(
|
||||
num_layers, num_units, input_size, dropout=dropout)
|
||||
elif rnn_mode == "rnn_relu":
|
||||
model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size)
|
||||
model = cudnn_rnn_ops.CudnnRNNRelu(
|
||||
num_layers, num_units, input_size, dropout=dropout)
|
||||
else:
|
||||
raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
|
||||
return model
|
||||
@ -174,9 +183,11 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
self._testOneLSTMParamsSize(num_layers, num_units, input_size)
|
||||
|
||||
def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
|
||||
batch_size, seq_length, dir_count, expected,
|
||||
tolerance):
|
||||
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
|
||||
batch_size, seq_length, dir_count, dropout,
|
||||
expected, tolerance):
|
||||
random_seed.set_random_seed(5678)
|
||||
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
|
||||
dropout)
|
||||
has_input_c = (rnn_mode == "lstm")
|
||||
params_size_t = model.params_size()
|
||||
input_data = array_ops.ones([seq_length, batch_size, input_size])
|
||||
@ -206,18 +217,24 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
with self.test_session(use_gpu=True) as sess:
|
||||
sess.run(variables.global_variables_initializer())
|
||||
total_sum_v = sess.run([total_sum])
|
||||
|
||||
self.assertAllClose(
|
||||
total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
|
||||
|
||||
@unittest.skipUnless(test.is_built_with_cuda(),
|
||||
"Test only applicable when running on GPUs")
|
||||
def testSimpleInference(self):
|
||||
# Cudnn scales result for dropout during training, therefore dropout has no
|
||||
# impact for inference results.
|
||||
# (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
|
||||
# demonstrative of the dropout-invariant nature of CudnnRnn.)
|
||||
test_configs = [
|
||||
[
|
||||
"lstm",
|
||||
231833.22,
|
||||
1e-2,
|
||||
{
|
||||
{
|
||||
"rnn_mode": "lstm",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"expected": 231833.22,
|
||||
"tolerance": 1e-2,
|
||||
"shape": {
|
||||
"num_layers": 4,
|
||||
"num_units": 200,
|
||||
"input_size": 200,
|
||||
@ -225,12 +242,13 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 10,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
[
|
||||
"gru",
|
||||
56000,
|
||||
1e-2,
|
||||
{
|
||||
},
|
||||
{
|
||||
"rnn_mode": "gru",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"expected": 56000,
|
||||
"tolerance": 1e-2,
|
||||
"shape": {
|
||||
"num_layers": 4,
|
||||
"num_units": 200,
|
||||
"input_size": 200,
|
||||
@ -238,12 +256,13 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 10,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
[
|
||||
"rnn_tanh",
|
||||
56000,
|
||||
1e-2,
|
||||
{
|
||||
},
|
||||
{
|
||||
"rnn_mode": "rnn_tanh",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"expected": 56000,
|
||||
"tolerance": 1e-2,
|
||||
"shape": {
|
||||
"num_layers": 4,
|
||||
"num_units": 200,
|
||||
"input_size": 200,
|
||||
@ -251,12 +270,13 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 10,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
[
|
||||
"rnn_relu",
|
||||
130688,
|
||||
1e-2,
|
||||
{
|
||||
},
|
||||
{
|
||||
"rnn_mode": "rnn_relu",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"expected": 130688,
|
||||
"tolerance": 1e-2,
|
||||
"shape": {
|
||||
"num_layers": 2,
|
||||
"num_units": 8,
|
||||
"input_size": 4,
|
||||
@ -264,24 +284,32 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 2,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
with ops.Graph().as_default():
|
||||
for config in test_configs:
|
||||
rnn_mode = config[0]
|
||||
expected = config[1]
|
||||
tolerance = config[2]
|
||||
shapes = config[3]
|
||||
self._testOneSimpleInference(rnn_mode, shapes["num_layers"],
|
||||
shapes["num_units"], shapes["input_size"],
|
||||
shapes["batch_size"], shapes["seq_length"],
|
||||
shapes["dir_count"], expected, tolerance)
|
||||
rnn_mode = config["rnn_mode"]
|
||||
dropout_list = config.get("dropout", [0.])
|
||||
expected = config["expected"]
|
||||
tolerance = config["tolerance"]
|
||||
shape = config["shape"]
|
||||
for dropout in dropout_list:
|
||||
self._testOneSimpleInference(
|
||||
rnn_mode, shape["num_layers"], shape["num_units"],
|
||||
shape["input_size"], shape["batch_size"], shape["seq_length"],
|
||||
shape["dir_count"], dropout, expected, tolerance)
|
||||
|
||||
def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
|
||||
batch_size, seq_length, dir_count, tolerance):
|
||||
batch_size, seq_length, dir_count, dropout,
|
||||
tolerance):
|
||||
# Gradient checking runs two forward ops with almost the same input. Need to
|
||||
# make sure the drop patterns across the two runs are the same.
|
||||
old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
|
||||
os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
|
||||
has_input_c = (rnn_mode == "lstm")
|
||||
random_seed.set_random_seed(1234)
|
||||
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
|
||||
model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
|
||||
dropout)
|
||||
params_size_t = model.params_size()
|
||||
input_data = variables.Variable(
|
||||
random_ops.random_uniform([seq_length, batch_size, input_size]))
|
||||
@ -294,6 +322,7 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
input_c = variables.Variable(
|
||||
random_ops.random_uniform(
|
||||
[num_layers * dir_count, batch_size, num_units]))
|
||||
|
||||
output, output_h, output_c = model(
|
||||
input_data=input_data,
|
||||
input_h=input_h,
|
||||
@ -322,18 +351,22 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
sess.run(variables.global_variables_initializer())
|
||||
all_inputs = [entry[0] for entry in inputs_and_shapes]
|
||||
all_shapes = [entry[1] for entry in inputs_and_shapes]
|
||||
|
||||
err = gradient_checker.compute_gradient_error(all_inputs, all_shapes,
|
||||
total_sum, [1])
|
||||
|
||||
self.assertLess(err, tolerance)
|
||||
os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
|
||||
|
||||
@unittest.skipUnless(test.is_built_with_cuda(),
|
||||
"Test only applicable when running on GPUs")
|
||||
def testSimpleTraining(self):
|
||||
test_configs = [
|
||||
[
|
||||
"lstm",
|
||||
1e-2,
|
||||
{
|
||||
{
|
||||
"rnn_mode": "lstm",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"tolerance": 1e-2,
|
||||
"shape": {
|
||||
"num_layers": 2,
|
||||
"num_units": 3,
|
||||
"input_size": 4,
|
||||
@ -341,11 +374,12 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 4,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
[
|
||||
"gru",
|
||||
4e-3,
|
||||
{
|
||||
},
|
||||
{
|
||||
"rnn_mode": "gru",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"tolerance": 4e-3,
|
||||
"shape": {
|
||||
"num_layers": 2,
|
||||
"num_units": 3,
|
||||
"input_size": 4,
|
||||
@ -353,11 +387,12 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 4,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
[
|
||||
"rnn_tanh",
|
||||
5e-3,
|
||||
{
|
||||
},
|
||||
{
|
||||
"rnn_mode": "rnn_tanh",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"tolerance": 5e-3,
|
||||
"shape": {
|
||||
"num_layers": 2,
|
||||
"num_units": 3,
|
||||
"input_size": 4,
|
||||
@ -365,11 +400,12 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 4,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
[
|
||||
"rnn_relu",
|
||||
3e-1,
|
||||
{
|
||||
},
|
||||
{
|
||||
"rnn_mode": "rnn_relu",
|
||||
"dropout": [0., 0.5, 1.],
|
||||
"tolerance": 4e-1,
|
||||
"shape": {
|
||||
"num_layers": 2,
|
||||
"num_units": 3,
|
||||
"input_size": 4,
|
||||
@ -377,17 +413,19 @@ class CudnnRNNTest(TensorFlowTestCase):
|
||||
"seq_length": 4,
|
||||
"dir_count": 1,
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
with ops.Graph().as_default():
|
||||
for config in test_configs:
|
||||
rnn_mode = config[0]
|
||||
tolerance = config[1]
|
||||
shape = config[2]
|
||||
self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
|
||||
shape["num_units"], shape["input_size"],
|
||||
shape["batch_size"], shape["seq_length"],
|
||||
shape["dir_count"], tolerance)
|
||||
rnn_mode = config["rnn_mode"]
|
||||
dropout_list = config.get("dropout", [0.])
|
||||
tolerance = config["tolerance"]
|
||||
shape = config["shape"]
|
||||
for dropout in dropout_list:
|
||||
self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
|
||||
shape["num_units"], shape["input_size"],
|
||||
shape["batch_size"], shape["seq_length"],
|
||||
shape["dir_count"], dropout, tolerance)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -23,13 +23,13 @@ from tensorflow.contrib.util import loader
|
||||
from tensorflow.python.framework import common_shapes
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.framework import random_seed
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import control_flow_ops
|
||||
from tensorflow.python.ops import state_ops
|
||||
from tensorflow.python.platform import resource_loader
|
||||
from tensorflow.python.training import saver
|
||||
|
||||
|
||||
_cudnn_rnn_ops_so = loader.load_op_library(
|
||||
resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
|
||||
|
||||
@ -110,12 +110,12 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
|
||||
if not isinstance(params, tuple):
|
||||
params = (params,)
|
||||
assign_ops = [
|
||||
state_ops.assign(
|
||||
variable, param, validate_shape=False)
|
||||
state_ops.assign(variable, param, validate_shape=False)
|
||||
for variable, param in zip(self._variables, params)
|
||||
]
|
||||
return control_flow_ops.group(*assign_ops)
|
||||
|
||||
|
||||
_cudnn_rnn_common_doc_string = """
|
||||
Cudnn RNN has an opaque parameter buffer that can be used for inference and
|
||||
training. But it is possible that the layout of the parameter buffers
|
||||
@ -163,8 +163,7 @@ class _CudnnRNN(object):
|
||||
input_mode="auto_select",
|
||||
direction="unidirectional",
|
||||
dropout=0.,
|
||||
seed=0,
|
||||
seed2=0):
|
||||
seed=0):
|
||||
"""Creates a CudnnRNN model from model spec.
|
||||
|
||||
Args:
|
||||
@ -183,8 +182,8 @@ class _CudnnRNN(object):
|
||||
direction: the direction model that the model operates. Could be either
|
||||
'unidirectional' or 'bidirectional'
|
||||
dropout: whether to enable dropout. With it is 0, dropout is disabled.
|
||||
seed: the first part of a seed that is used to initialize dropout.
|
||||
seed2: the second part of a seed that is used to initialize dropout.
|
||||
seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
|
||||
for behavior.
|
||||
"""
|
||||
self._num_layers = num_layers
|
||||
self._num_units = num_units
|
||||
@ -193,8 +192,10 @@ class _CudnnRNN(object):
|
||||
self._input_mode = input_mode
|
||||
self._direction = direction
|
||||
self._dropout = dropout
|
||||
self._seed = seed
|
||||
self._seed2 = seed2
|
||||
# get graph and op seed.
|
||||
self._seed, self._seed2 = random_seed.get_seed(seed)
|
||||
if self._seed is None and self._seed2 is None:
|
||||
self._seed, self._seed2 = 0, 0
|
||||
|
||||
def params_size(self):
|
||||
"""Calculates the size of the opaque parameter buffer needed for this model.
|
||||
@ -208,6 +209,9 @@ class _CudnnRNN(object):
|
||||
input_size=self._input_size,
|
||||
T=dtypes.float32,
|
||||
S=dtypes.int32,
|
||||
dropout=self._dropout,
|
||||
seed=self._seed,
|
||||
seed2=self._seed2,
|
||||
rnn_mode=self._rnn_mode,
|
||||
input_mode=self._input_mode,
|
||||
direction=self._direction)[0]
|
||||
@ -258,6 +262,9 @@ class _CudnnRNN(object):
|
||||
num_units=self._num_units,
|
||||
input_size=self._input_size,
|
||||
params=params,
|
||||
dropout=self._dropout,
|
||||
seed=self._seed,
|
||||
seed2=self._seed2,
|
||||
num_params=self._num_layers * self._NUM_PARAMS_PER_LAYER,
|
||||
rnn_mode=self._rnn_mode,
|
||||
input_mode=self._input_mode,
|
||||
@ -280,6 +287,9 @@ class _CudnnRNN(object):
|
||||
input_size=self._input_size,
|
||||
weights=weights,
|
||||
biases=biases,
|
||||
dropout=self._dropout,
|
||||
seed=self._seed,
|
||||
seed2=self._seed2,
|
||||
rnn_mode=self._rnn_mode,
|
||||
input_mode=self._input_mode,
|
||||
direction=self._direction)
|
||||
@ -299,8 +309,7 @@ class CudnnLSTM(_CudnnRNN):
|
||||
input_mode="auto_select",
|
||||
direction="unidirectional",
|
||||
dropout=0.,
|
||||
seed=0,
|
||||
seed2=0):
|
||||
seed=0):
|
||||
"""Creates a Cudnn LSTM model from model spec.
|
||||
|
||||
Args:
|
||||
@ -317,8 +326,7 @@ class CudnnLSTM(_CudnnRNN):
|
||||
direction: the direction model that the model operates. Could be either
|
||||
'unidirectional' or 'bidirectional'
|
||||
dropout: whether to enable dropout. With it is 0, dropout is disabled.
|
||||
seed: the first part of a seed that is used to initialize dropout.
|
||||
seed2: the second part of a seed that is used to initialize dropout.
|
||||
seed: the seed used for initializing dropout.
|
||||
"""
|
||||
super(CudnnLSTM, self).__init__(
|
||||
"lstm",
|
||||
@ -328,8 +336,7 @@ class CudnnLSTM(_CudnnRNN):
|
||||
input_mode=input_mode,
|
||||
direction=direction,
|
||||
dropout=dropout,
|
||||
seed=seed,
|
||||
seed2=seed2)
|
||||
seed=seed)
|
||||
|
||||
def __call__(self, input_data, input_h, input_c, params, is_training=True):
|
||||
"""Runs the forward step for the Cudnn LSTM model.
|
||||
@ -346,11 +353,8 @@ class CudnnLSTM(_CudnnRNN):
|
||||
output_h: the final state for h.
|
||||
output_c: the final state for c.
|
||||
"""
|
||||
output, output_h, output_c = super(CudnnLSTM, self).__call__(input_data,
|
||||
input_h,
|
||||
input_c,
|
||||
params,
|
||||
is_training)
|
||||
output, output_h, output_c = super(CudnnLSTM, self).__call__(
|
||||
input_data, input_h, input_c, params, is_training=is_training)
|
||||
return (output, output_h, output_c)
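For context, a hedged end-to-end sketch of driving the class above. It assumes a CUDA-enabled build, the contrib import path `tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops`, and time-major input shapes; the sizes are illustrative only:

```python
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

# Hypothetical model sizes, chosen only for illustration.
model = cudnn_rnn_ops.CudnnLSTM(
    num_layers=1, num_units=16, input_size=8, dropout=0.5, seed=1234)

# The opaque parameter buffer is sized by the op itself, so the variable
# shape cannot be validated statically.
params = tf.Variable(
    tf.random_uniform([model.params_size()]), validate_shape=False)

input_data = tf.zeros([10, 4, 8])    # [seq_len, batch_size, input_size]
input_h = tf.zeros([1, 4, 16])       # [num_layers, batch_size, num_units]
input_c = tf.zeros([1, 4, 16])
output, output_h, output_c = model(
    input_data, input_h, input_c, params, is_training=True)
```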
|
||||
|
||||
|
||||
@ -365,8 +369,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
|
||||
input_mode="auto_select",
|
||||
direction="unidirectional",
|
||||
dropout=0.,
|
||||
seed=0,
|
||||
seed2=0):
|
||||
seed=0):
|
||||
"""Creates a Cudnn RNN model from model without hidden-state C.
|
||||
|
||||
Args:
|
||||
@ -383,8 +386,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
|
||||
direction: the direction in which the model operates. Can be either
'unidirectional' or 'bidirectional'.
dropout: whether to enable dropout. When it is 0, dropout is disabled.
|
||||
seed: the first part of a seed that is used to initialize dropout.
|
||||
seed2: the second part of a seed that is used to initialize dropout.
|
||||
seed: the seed used for initializing dropout.
|
||||
"""
|
||||
super(_CudnnRNNNoInputC, self).__init__(
|
||||
self._rnn_mode,
|
||||
@ -394,8 +396,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
|
||||
input_mode=input_mode,
|
||||
direction=direction,
|
||||
dropout=dropout,
|
||||
seed=seed,
|
||||
seed2=seed2)
|
||||
seed=seed)
|
||||
|
||||
def __call__(self, input_data, input_h, params, is_training=True):
|
||||
"""Runs the forward step for the Cudnn LSTM model.
|
||||
@ -459,6 +460,9 @@ def _cudnn_rnn_backward(op, *grad):
|
||||
output_h_backprop=grad[1],
|
||||
output_c_backprop=grad[2],
|
||||
reserve_space=op.outputs[3],
|
||||
dropout=op.get_attr("dropout"),
|
||||
seed=op.get_attr("seed"),
|
||||
seed2=op.get_attr("seed2"),
|
||||
rnn_mode=op.get_attr("rnn_mode"),
|
||||
input_mode=op.get_attr("input_mode"),
|
||||
direction=op.get_attr("direction"))
|
||||
|
@ -111,13 +111,11 @@ py_library(
|
||||
srcs_version = "PY2AND3",
|
||||
deps = [
|
||||
":sdca_ops_py",
|
||||
":sparse_feature_column_py",
|
||||
"//tensorflow/contrib/framework:framework_py",
|
||||
"//tensorflow/contrib/layers:layers_py",
|
||||
"//tensorflow/contrib/learn",
|
||||
"//tensorflow/python:array_ops",
|
||||
"//tensorflow/python:dtypes",
|
||||
"//tensorflow/python:math_ops",
|
||||
"//tensorflow/python:sparse_tensor",
|
||||
"//tensorflow/python:tensor_util",
|
||||
"//tensorflow/python:training",
|
||||
|
@ -24,13 +24,10 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
|
||||
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
|
||||
from tensorflow.contrib.learn.python.learn.estimators import prediction_key
|
||||
from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
|
||||
from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
|
||||
from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import sparse_tensor
|
||||
from tensorflow.python.framework import tensor_util
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import variable_scope
|
||||
from tensorflow.python.training import session_run_hook
|
||||
|
||||
@ -76,131 +73,6 @@ def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
|
||||
columns_to_variables[bias_column] = [bias_variable]
|
||||
|
||||
|
||||
def _get_sdca_train_step(optimizer, columns_to_variables, weight_column_name,
|
||||
loss_type, features, targets, global_step):
|
||||
"""Returns the training operation of an SdcaModel optimizer."""
|
||||
|
||||
def _dense_tensor_to_sparse_feature_column(dense_tensor):
|
||||
"""Returns SparseFeatureColumn for the input dense_tensor."""
|
||||
ignore_value = 0.0
|
||||
sparse_indices = array_ops.where(
|
||||
math_ops.not_equal(dense_tensor,
|
||||
math_ops.cast(ignore_value, dense_tensor.dtype)))
|
||||
sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
|
||||
# TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Make this efficient; SDCA now supports
# very sparse features, with and without weights.
|
||||
return SparseFeatureColumn(
|
||||
array_ops.reshape(
|
||||
array_ops.split(value=sparse_indices, num_or_size_splits=2,
|
||||
axis=1)[0], [-1]),
|
||||
array_ops.reshape(
|
||||
array_ops.split(value=sparse_indices, num_or_size_splits=2,
|
||||
axis=1)[1], [-1]),
|
||||
array_ops.reshape(math_ops.to_float(sparse_values), [-1]))
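The helper above boils down to three core-op steps. A standalone sketch on a tiny dense tensor (illustrative values only) makes the index/value bookkeeping concrete; `SparseFeatureColumn` itself just wraps the three resulting vectors:

```python
import tensorflow as tf

# Tiny dense input, chosen only for illustration.
dense = tf.constant([[0.0, 2.0, 0.0],
                     [1.0, 0.0, 3.0]])
# Nonzero coordinates and their values.
indices = tf.where(tf.not_equal(dense, 0.0))   # [[0, 1], [1, 0], [1, 2]]
values = tf.gather_nd(dense, indices)          # [2.0, 1.0, 3.0]
# Split the coordinates into the example ids and feature ids that
# SparseFeatureColumn expects.
example_ids = tf.reshape(tf.split(indices, 2, axis=1)[0], [-1])   # [0, 1, 1]
feature_ids = tf.reshape(tf.split(indices, 2, axis=1)[1], [-1])   # [1, 0, 2]

with tf.Session() as sess:
    print(sess.run([example_ids, feature_ids, values]))
```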
|
||||
|
||||
def _training_examples_and_variables():
|
||||
"""Returns dictionaries for training examples and variables."""
|
||||
batch_size = targets.get_shape()[0]
|
||||
|
||||
# Iterate over all feature columns and create appropriate lists for dense
|
||||
# and sparse features as well as dense and sparse weights (variables) for
|
||||
# SDCA.
|
||||
# TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
|
||||
# dict as 1-dimensional tensors.
|
||||
dense_features, sparse_features, sparse_feature_with_values = [], [], []
|
||||
dense_feature_weights = []
|
||||
sparse_feature_weights, sparse_feature_with_values_weights = [], []
|
||||
for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
|
||||
transformed_tensor = features[column]
|
||||
if isinstance(column, layers.feature_column._RealValuedColumn): # pylint: disable=protected-access
|
||||
# A real-valued column corresponds to a dense feature in SDCA. A
|
||||
# transformed tensor corresponding to a RealValuedColumn has rank 2
|
||||
# (its shape is typically [batch_size, column.dimension]) and so it
|
||||
# can be passed to SDCA as is.
|
||||
dense_features.append(transformed_tensor)
|
||||
# For real valued columns, the variables list contains exactly one
|
||||
# element.
|
||||
dense_feature_weights.append(columns_to_variables[column][0])
|
||||
elif isinstance(column, layers.feature_column._BucketizedColumn): # pylint: disable=protected-access
|
||||
# A bucketized column corresponds to a sparse feature in SDCA. The
|
||||
# bucketized feature is "sparsified" for SDCA by converting it to a
|
||||
# SparseFeatureColumn representing the one-hot encoding of the
|
||||
# bucketized feature.
|
||||
#
|
||||
# TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
|
||||
# bucketized feature column to a dense feature in SDCA. This will likely
|
||||
# depend on the number of buckets.
|
||||
dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor) # pylint: disable=protected-access
|
||||
sparse_feature_column = _dense_tensor_to_sparse_feature_column(
|
||||
dense_bucket_tensor)
|
||||
sparse_feature_with_values.append(sparse_feature_column)
|
||||
# For bucketized columns, the variables list contains exactly one
|
||||
# element.
|
||||
sparse_feature_with_values_weights.append(
|
||||
columns_to_variables[column][0])
|
||||
elif isinstance(
|
||||
column,
|
||||
(
|
||||
layers.feature_column._CrossedColumn, # pylint: disable=protected-access
|
||||
layers.feature_column._SparseColumn)): # pylint: disable=protected-access
|
||||
sparse_features.append(
|
||||
SparseFeatureColumn(
|
||||
array_ops.reshape(
|
||||
array_ops.split(
|
||||
value=transformed_tensor.indices,
|
||||
num_or_size_splits=2,
|
||||
axis=1)[0], [-1]),
|
||||
array_ops.reshape(transformed_tensor.values, [-1]), None))
|
||||
sparse_feature_weights.append(columns_to_variables[column][0])
|
||||
elif isinstance(column, layers.feature_column._WeightedSparseColumn): # pylint: disable=protected-access
|
||||
id_tensor = column.id_tensor(transformed_tensor)
|
||||
weight_tensor = column.weight_tensor(transformed_tensor)
|
||||
sparse_feature_with_values.append(
|
||||
SparseFeatureColumn(
|
||||
array_ops.reshape(
|
||||
array_ops.split(
|
||||
value=id_tensor.indices, num_or_size_splits=2, axis=1)[
|
||||
0], [-1]),
|
||||
array_ops.reshape(id_tensor.values, [-1]),
|
||||
array_ops.reshape(weight_tensor.values, [-1])))
|
||||
sparse_feature_with_values_weights.append(
|
||||
columns_to_variables[column][0])
|
||||
else:
|
||||
raise ValueError("SDCAOptimizer does not support column type {}".format(
|
||||
type(column).__name__))
|
||||
|
||||
example_weights = array_ops.reshape(
|
||||
features[weight_column_name],
|
||||
shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
|
||||
example_ids = features[optimizer.example_id_column]
|
||||
sparse_feature_with_values.extend(sparse_features)
|
||||
sparse_feature_with_values_weights.extend(sparse_feature_weights)
|
||||
examples = dict(
|
||||
sparse_features=sparse_feature_with_values,
|
||||
dense_features=dense_features,
|
||||
example_labels=math_ops.to_float(
|
||||
array_ops.reshape(targets, shape=[-1])),
|
||||
example_weights=example_weights,
|
||||
example_ids=example_ids)
|
||||
sdca_variables = dict(
|
||||
sparse_features_weights=sparse_feature_with_values_weights,
|
||||
dense_features_weights=dense_feature_weights)
|
||||
return examples, sdca_variables
|
||||
|
||||
training_examples, training_variables = _training_examples_and_variables()
|
||||
sdca_model = sdca_ops.SdcaModel(
|
||||
examples=training_examples,
|
||||
variables=training_variables,
|
||||
options=dict(
|
||||
symmetric_l1_regularization=optimizer.symmetric_l1_regularization,
|
||||
symmetric_l2_regularization=optimizer.symmetric_l2_regularization,
|
||||
num_loss_partitions=optimizer.num_loss_partitions,
|
||||
num_table_shards=optimizer.num_table_shards,
|
||||
loss_type=loss_type))
|
||||
train_op = sdca_model.minimize(global_step=global_step)
|
||||
return sdca_model, train_op
|
||||
|
||||
|
||||
def sdca_model_fn(features, labels, mode, params, config=None):
|
||||
"""A model_fn for linear models that use the SDCA optimizer.
|
||||
|
||||
@ -283,9 +155,9 @@ def sdca_model_fn(features, labels, mode, params, config=None):
|
||||
|
||||
def _train_op_fn(unused_loss):
|
||||
global_step = contrib_variables.get_global_step()
|
||||
sdca_model, train_op = _get_sdca_train_step(optimizer, columns_to_variables,
|
||||
weight_column_name, loss_type,
|
||||
features, labels, global_step)
|
||||
sdca_model, train_op = optimizer.get_train_step(
|
||||
columns_to_variables, weight_column_name, loss_type, features, labels,
|
||||
global_step)
|
||||
if update_weights_hook is not None:
|
||||
update_weights_hook.set_parameters(sdca_model, train_op)
|
||||
return train_op
|
||||
|
@ -99,16 +99,16 @@ class SDCAOptimizer(object):
|
||||
def symmetric_l2_regularization(self):
|
||||
return self._symmetric_l2_regularization
|
||||
|
||||
def get_train_step(self, columns_to_variables,
|
||||
weight_column_name, loss_type, features, targets,
|
||||
global_step):
|
||||
def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
|
||||
features, targets, global_step):
|
||||
"""Returns the training operation of an SdcaModel optimizer."""
|
||||
|
||||
def _tensor_to_sparse_feature_column(dense_tensor):
|
||||
def _dense_tensor_to_sparse_feature_column(dense_tensor):
|
||||
"""Returns SparseFeatureColumn for the input dense_tensor."""
|
||||
ignore_value = 0.0
|
||||
sparse_indices = array_ops.where(math_ops.not_equal(
|
||||
dense_tensor, math_ops.cast(ignore_value, dense_tensor.dtype)))
|
||||
sparse_indices = array_ops.where(
|
||||
math_ops.not_equal(dense_tensor,
|
||||
math_ops.cast(ignore_value, dense_tensor.dtype)))
|
||||
sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
|
||||
# TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Make this efficient; SDCA now supports
# very sparse features, with and without weights.
|
||||
@ -133,10 +133,9 @@ class SDCAOptimizer(object):
|
||||
dense_features, sparse_features, sparse_feature_with_values = [], [], []
|
||||
dense_feature_weights = []
|
||||
sparse_feature_weights, sparse_feature_with_values_weights = [], []
|
||||
# pylint: disable=protected-access
|
||||
for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
|
||||
transformed_tensor = features[column]
|
||||
if isinstance(column, layers.feature_column._RealValuedColumn):
|
||||
if isinstance(column, layers.feature_column._RealValuedColumn): # pylint: disable=protected-access
|
||||
# A real-valued column corresponds to a dense feature in SDCA. A
|
||||
# transformed tensor corresponding to a RealValuedColumn has rank 2
|
||||
# (its shape is typically [batch_size, column.dimension]) and so it
|
||||
@ -145,22 +144,28 @@ class SDCAOptimizer(object):
|
||||
# For real valued columns, the variables list contains exactly one
|
||||
# element.
|
||||
dense_feature_weights.append(columns_to_variables[column][0])
|
||||
elif isinstance(column, layers.feature_column._BucketizedColumn):
|
||||
elif isinstance(column, layers.feature_column._BucketizedColumn): # pylint: disable=protected-access
|
||||
# A bucketized column corresponds to a sparse feature in SDCA. The
|
||||
# bucketized feature is "sparsified" for SDCA by converting it to a
|
||||
# SparseFeatureColumn representing the one-hot encoding of the
|
||||
# bucketized feature.
|
||||
dense_bucket_tensor = layers.input_from_feature_columns(
|
||||
{column: transformed_tensor}, [column])
|
||||
sparse_feature_column = _tensor_to_sparse_feature_column(
|
||||
#
|
||||
# TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
|
||||
# bucketized feature column to a dense feature in SDCA. This will
|
||||
# likely depend on the number of buckets.
|
||||
dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor) # pylint: disable=protected-access
|
||||
sparse_feature_column = _dense_tensor_to_sparse_feature_column(
|
||||
dense_bucket_tensor)
|
||||
sparse_feature_with_values.append(sparse_feature_column)
|
||||
# For bucketized columns, the variables list contains exactly one
|
||||
# element.
|
||||
sparse_feature_with_values_weights.append(
|
||||
columns_to_variables[column][0])
|
||||
elif isinstance(column, (layers.feature_column._CrossedColumn,
|
||||
layers.feature_column._SparseColumn)):
|
||||
elif isinstance(
|
||||
column,
|
||||
(
|
||||
layers.feature_column._CrossedColumn, # pylint: disable=protected-access
|
||||
layers.feature_column._SparseColumn)): # pylint: disable=protected-access
|
||||
sparse_features.append(
|
||||
SparseFeatureColumn(
|
||||
array_ops.reshape(
|
||||
@ -168,10 +173,9 @@ class SDCAOptimizer(object):
|
||||
value=transformed_tensor.indices,
|
||||
num_or_size_splits=2,
|
||||
axis=1)[0], [-1]),
|
||||
array_ops.reshape(transformed_tensor.values, [-1]),
|
||||
None))
|
||||
array_ops.reshape(transformed_tensor.values, [-1]), None))
|
||||
sparse_feature_weights.append(columns_to_variables[column][0])
|
||||
elif isinstance(column, layers.feature_column._WeightedSparseColumn):
|
||||
elif isinstance(column, layers.feature_column._WeightedSparseColumn): # pylint: disable=protected-access
|
||||
id_tensor = column.id_tensor(transformed_tensor)
|
||||
weight_tensor = column.weight_tensor(transformed_tensor)
|
||||
sparse_feature_with_values.append(
|
||||
@ -183,11 +187,10 @@ class SDCAOptimizer(object):
|
||||
array_ops.reshape(id_tensor.values, [-1]),
|
||||
array_ops.reshape(weight_tensor.values, [-1])))
|
||||
sparse_feature_with_values_weights.append(
|
||||
columns_to_variables[column][0])
|
||||
columns_to_variables[column][0])
|
||||
else:
|
||||
raise ValueError('SDCAOptimizer does not support column type %s.' %
|
||||
type(column).__name__)
|
||||
# pylint: enable=protected-access
|
||||
|
||||
example_weights = array_ops.reshape(
|
||||
features[weight_column_name],
|
||||
@ -195,12 +198,13 @@ class SDCAOptimizer(object):
|
||||
example_ids = features[self._example_id_column]
|
||||
sparse_feature_with_values.extend(sparse_features)
|
||||
sparse_feature_with_values_weights.extend(sparse_feature_weights)
|
||||
examples = dict(sparse_features=sparse_feature_with_values,
|
||||
dense_features=dense_features,
|
||||
example_labels=math_ops.to_float(array_ops.reshape(
|
||||
targets, shape=[-1])),
|
||||
example_weights=example_weights,
|
||||
example_ids=example_ids)
|
||||
examples = dict(
|
||||
sparse_features=sparse_feature_with_values,
|
||||
dense_features=dense_features,
|
||||
example_labels=math_ops.to_float(
|
||||
array_ops.reshape(targets, shape=[-1])),
|
||||
example_weights=example_weights,
|
||||
example_ids=example_ids)
|
||||
sdca_variables = dict(
|
||||
sparse_features_weights=sparse_feature_with_values_weights,
|
||||
dense_features_weights=dense_feature_weights)
|
||||
|
@ -11,6 +11,7 @@ tensorflow/core/protobuf/cluster.pb.cc
|
||||
tensorflow/core/protobuf/config.pb.cc
|
||||
tensorflow/core/protobuf/rewriter_config.pb.cc
|
||||
tensorflow/core/protobuf/debug.pb.cc
|
||||
tensorflow/core/protobuf/device_properties.pb.cc
|
||||
tensorflow/core/lib/core/error_codes.pb.cc
|
||||
tensorflow/core/framework/versions.pb.cc
|
||||
tensorflow/core/framework/variable.pb.cc
|
||||
@ -36,3 +37,4 @@ tensorflow/core/framework/attr_value.pb.cc
|
||||
tensorflow/core/framework/allocation_description.pb.cc
|
||||
tensorflow/core/example/feature.pb.cc
|
||||
tensorflow/core/example/example.pb.cc
|
||||
tensorflow/core/grappler/costs/op_performance_data.pb.cc
|
||||
|
@ -10,6 +10,7 @@ tensorflow/core/protobuf/meta_graph.pb.h
|
||||
tensorflow/core/protobuf/cluster.pb.h
|
||||
tensorflow/core/protobuf/config.pb.h
|
||||
tensorflow/core/protobuf/debug.pb.h
|
||||
tensorflow/core/protobuf/device_properties.pb.h
|
||||
tensorflow/core/protobuf/rewriter_config.pb.h
|
||||
tensorflow/core/protobuf/tensor_bundle.pb.h
|
||||
tensorflow/core/lib/core/error_codes.pb.h
|
||||
@ -37,3 +38,4 @@ tensorflow/core/framework/attr_value.pb.h
|
||||
tensorflow/core/framework/allocation_description.pb.h
|
||||
tensorflow/core/example/feature.pb.h
|
||||
tensorflow/core/example/example.pb.h
|
||||
tensorflow/core/grappler/costs/op_performance_data.pb.h
|
||||
|
@ -10,6 +10,7 @@ tensorflow/core/protobuf/meta_graph.proto
|
||||
tensorflow/core/protobuf/cluster.proto
|
||||
tensorflow/core/protobuf/config.proto
|
||||
tensorflow/core/protobuf/debug.proto
|
||||
tensorflow/core/protobuf/device_properties.proto
|
||||
tensorflow/core/protobuf/rewriter_config.proto
|
||||
tensorflow/core/protobuf/tensor_bundle.proto
|
||||
tensorflow/core/lib/core/error_codes.proto
|
||||
|
@ -8,11 +8,13 @@ exports_files(["LICENSE"])
|
||||
package(default_visibility = ["//tensorflow:__subpackages__"])
|
||||
|
||||
load("//tensorflow:tensorflow.bzl", "py_test")
|
||||
load("//tensorflow:tensorflow.bzl", "tf_py_test")
|
||||
|
||||
py_library(
|
||||
name = "opt_py",
|
||||
srcs = [
|
||||
"__init__.py",
|
||||
"python/training/drop_stale_gradient_optimizer.py",
|
||||
"python/training/external_optimizer.py",
|
||||
"python/training/lazy_adam_optimizer.py",
|
||||
"python/training/moving_average_optimizer.py",
|
||||
@ -104,6 +106,22 @@ py_test(
|
||||
],
|
||||
)
|
||||
|
||||
tf_py_test(
|
||||
name = "drop_stale_gradient_optimizer_test",
|
||||
srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
|
||||
additional_deps = [
|
||||
":opt_py",
|
||||
"//third_party/py/numpy",
|
||||
"//tensorflow/python:client",
|
||||
"//tensorflow/python:client_testlib",
|
||||
"//tensorflow/python:dtypes",
|
||||
"//tensorflow/python:framework_ops",
|
||||
"//tensorflow/python:math_ops",
|
||||
"//tensorflow/python:training",
|
||||
"//tensorflow/python:variables",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all_files",
|
||||
srcs = glob(
|
||||
|
@ -19,6 +19,7 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
# pylint: disable=wildcard-import
|
||||
from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.external_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
|
||||
@ -27,7 +28,8 @@ from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
|
||||
|
||||
from tensorflow.python.util.all_util import remove_undocumented
|
||||
|
||||
_allowed_symbols = ['ExternalOptimizerInterface',
|
||||
_allowed_symbols = ['DropStaleGradientOptimizer',
|
||||
'ExternalOptimizerInterface',
|
||||
'LazyAdamOptimizer',
|
||||
'MovingAverageOptimizer',
|
||||
'ScipyOptimizerInterface',
|
||||
|
@ -0,0 +1,112 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
"""Wrapper optimizer for checking and dropping stale gradients."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import control_flow_ops
|
||||
from tensorflow.python.ops import gen_array_ops
|
||||
from tensorflow.python.ops import gen_math_ops
|
||||
from tensorflow.python.ops import init_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import variable_scope
|
||||
from tensorflow.python.summary import summary
|
||||
from tensorflow.python.training import optimizer
|
||||
from tensorflow.python.training import training_util
|
||||
|
||||
|
||||
class DropStaleGradientOptimizer(optimizer.Optimizer):
|
||||
"""Wrapper optimizer that checks and drops stale gradient.
|
||||
|
||||
This optimizer records the global step for each worker before computing
|
||||
gradients and compares it with the global step at the time of applying the
|
||||
gradients. If the difference is larger than a threshold, it will drop all
|
||||
the computed gradients.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
opt,
|
||||
staleness,
|
||||
use_locking=False,
|
||||
name="DropStaleGradient"):
|
||||
"""Constructs a new DropStaleGradientOptimizer.
|
||||
|
||||
Args:
|
||||
opt: The actual optimizer that will be used to compute and apply the
|
||||
gradients. Must be one of the Optimizer classes.
|
||||
staleness: The maximum staleness allowed for the optimizer.
|
||||
use_locking: If `True`, use locks for the update operations.
|
||||
name: Optional name prefix for the operations created when applying
|
||||
gradients. Defaults to "DropStaleGradient".
|
||||
"""
|
||||
super(DropStaleGradientOptimizer, self).__init__(use_locking, name)
|
||||
self._opt = opt
|
||||
self._staleness = staleness
|
||||
|
||||
def compute_gradients(self, loss, *args, **kwargs):
|
||||
# Record current global step for worker.
|
||||
with ops.colocate_with(loss):
|
||||
self._local_step = training_util.get_global_step() + 0
|
||||
|
||||
with ops.control_dependencies([self._local_step]):
|
||||
loss = gen_array_ops.identity(loss)
|
||||
return self._opt.compute_gradients(loss, *args, **kwargs)
|
||||
|
||||
def get_slot(self, *args, **kwargs):
|
||||
return self._opt.get_slot(*args, **kwargs)
|
||||
|
||||
def get_slot_names(self, *args, **kwargs):
|
||||
return self._opt.get_slot_names(*args, **kwargs)
|
||||
|
||||
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
|
||||
gradients = []
|
||||
# Number of stale gradients.
|
||||
stale_counter = variable_scope.get_variable(
|
||||
"stale_counter", [],
|
||||
initializer=init_ops.zeros_initializer(),
|
||||
trainable=False)
|
||||
|
||||
def _AcceptGradientOp():
|
||||
with ops.control_dependencies(
|
||||
[self._opt.apply_gradients(
|
||||
grads_and_vars, global_step=global_step, name=name)]):
|
||||
return gen_array_ops.identity(0.0)
|
||||
|
||||
def _DropGradientOp():
|
||||
return gen_array_ops.identity(1.0)
|
||||
|
||||
for grad_and_var in grads_and_vars:
|
||||
grad = grad_and_var[0]
|
||||
if isinstance(grad, ops.Tensor):
|
||||
gradients.append(grad)
|
||||
else:
|
||||
gradients.append(grad.op)
|
||||
|
||||
with ops.control_dependencies(gradients), ops.colocate_with(global_step):
|
||||
staleness = gen_array_ops.reshape(
|
||||
global_step - self._local_step, shape=())
|
||||
conditional_update = stale_counter.assign_add(control_flow_ops.cond(
|
||||
gen_math_ops.less_equal(staleness, self._staleness),
|
||||
_AcceptGradientOp, _DropGradientOp))
|
||||
|
||||
summary.scalar(
|
||||
"Gradient staleness percentage",
|
||||
stale_counter / (math_ops.cast(global_step + 1, dtypes.float32)))
|
||||
return conditional_update
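A hedged usage sketch for the wrapper above, assuming the contrib import path shown in the BUILD changes; the variable names and constants are illustrative only:

```python
import tensorflow as tf
from tensorflow.contrib.opt.python.training import drop_stale_gradient_optimizer
from tensorflow.python.training import training_util

global_step = training_util.create_global_step()
w = tf.Variable(0.0)
loss = tf.square(w - 3.0)

# Wrap a plain SGD optimizer; updates whose recorded step lags the global
# step by more than `staleness` are dropped instead of applied.
opt = drop_stale_gradient_optimizer.DropStaleGradientOptimizer(
    tf.train.GradientDescentOptimizer(0.1), staleness=1)

grads_and_vars = opt.compute_gradients(loss)     # records the worker-local step
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)   # applied or dropped based on the measured staleness
```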
|
@ -0,0 +1,297 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Tests for DropStaleGradientOptimizer."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import portpicker
|
||||
|
||||
from tensorflow.contrib.opt.python.training import drop_stale_gradient_optimizer
|
||||
from tensorflow.python.client import session
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import data_flow_ops
|
||||
from tensorflow.python.ops import variables
|
||||
from tensorflow.python.platform import test
|
||||
from tensorflow.python.training import gradient_descent
|
||||
from tensorflow.python.training import server_lib
|
||||
from tensorflow.python.training import training_util
|
||||
|
||||
|
||||
# Creates the workers and returns their sessions, graphs, and train_ops.
|
||||
def _get_workers(num_workers, staleness):
|
||||
worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
|
||||
cluster_dict = {
|
||||
'worker': ['localhost:%s' % port for port in worker_ports],
|
||||
'ps': ['localhost:%s' % portpicker.pick_unused_port()]
|
||||
}
|
||||
cs = server_lib.ClusterSpec(cluster_dict)
|
||||
workers = [
|
||||
server_lib.Server(
|
||||
cs, job_name='worker', task_index=ix, start=True)
|
||||
for ix in range(num_workers)
|
||||
]
|
||||
server_lib.Server(cs, job_name='ps', task_index=0, start=True)
|
||||
|
||||
sessions = []
|
||||
graphs = []
|
||||
train_ops = []
|
||||
|
||||
# To simulate staleness, maintain two queues, one for computing gradients and
# one for applying them. In the computing phase, all workers except the chief
# compute gradients together, and the chief computes only after the other
# workers have finished. In the applying phase, the chief applies its
# gradients first, and the other workers then apply theirs one by one. As a
# result, the chief always has staleness 0, while each of the other workers
# has a unique staleness value in [1, num_workers).
|
||||
for worker_id in range(num_workers):
|
||||
graph = ops.Graph()
|
||||
with graph.as_default():
|
||||
global_step = training_util.create_global_step()
|
||||
var_0 = variables.Variable(0.0, name='v0')
|
||||
var_1 = variables.Variable(1.0, name='v1')
|
||||
compute_gradients_queue = data_flow_ops.FIFOQueue(
|
||||
-1, global_step.dtype.base_dtype, shapes=(),
|
||||
name='compute_gradients_queue', shared_name='compute_gradients_queue')
|
||||
apply_gradients_queue = data_flow_ops.FIFOQueue(
|
||||
-1, global_step.dtype.base_dtype, shapes=(),
|
||||
name='apply_gradients_queue', shared_name='apply_gradients_queue')
|
||||
|
||||
# Gradients for loss on var_0 and var_1 will be 1.0.
|
||||
loss = 0 - var_0 - var_1
|
||||
sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
|
||||
stale_check_opt = (
|
||||
drop_stale_gradient_optimizer.DropStaleGradientOptimizer(
|
||||
sgd_opt, staleness))
|
||||
|
||||
# Compute gradients.
|
||||
if worker_id == 0:
|
||||
with ops.control_dependencies(
|
||||
[compute_gradients_queue.dequeue_many(num_workers - 1)]):
|
||||
grad_and_vars = stale_check_opt.compute_gradients(loss)
|
||||
else:
|
||||
grad_and_vars = stale_check_opt.compute_gradients(loss)
|
||||
with ops.control_dependencies([t[0] for t in grad_and_vars]):
|
||||
worker_enqueue_op = compute_gradients_queue.enqueue(global_step)
|
||||
|
||||
# Apply gradients.
|
||||
if worker_id == 0:
|
||||
with ops.control_dependencies(
|
||||
[stale_check_opt.apply_gradients(grad_and_vars, global_step)]):
|
||||
train_op = apply_gradients_queue.enqueue(global_step)
|
||||
else:
|
||||
with ops.control_dependencies([worker_enqueue_op]):
|
||||
with ops.control_dependencies([apply_gradients_queue.dequeue()]):
|
||||
with ops.control_dependencies(
|
||||
[stale_check_opt.apply_gradients(
|
||||
grad_and_vars, global_step)]):
|
||||
train_op = apply_gradients_queue.enqueue(global_step)
|
||||
|
||||
sess = session.Session(workers[worker_id].target)
|
||||
|
||||
sessions.append(sess)
|
||||
graphs.append(graph)
|
||||
train_ops.append(train_op)
|
||||
|
||||
return sessions, graphs, train_ops
|
||||
|
||||
|
||||
class DropStaleGradientOptimizerTest(test.TestCase):
|
||||
|
||||
def _run(self, train_op, sess):
|
||||
sess.run(train_op)
|
||||
|
||||
def test1Worker(self):
|
||||
num_workers = 1
|
||||
sessions, graphs, train_ops = _get_workers(num_workers, 0)
|
||||
with graphs[0].as_default():
|
||||
sessions[0].run(variables.global_variables_initializer())
|
||||
global_step = training_util.get_global_step(graphs[0])
|
||||
var_0 = graphs[0].get_tensor_by_name('v0:0')
|
||||
var_1 = graphs[0].get_tensor_by_name('v1:0')
|
||||
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
|
||||
# Verify the initialized value.
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
|
||||
sessions[0].run(train_ops[0])
|
||||
|
||||
# Verify the updated value after 1 step.
|
||||
self.assertAllEqual(1, sessions[0].run(global_step))
|
||||
self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(1, sessions[0].run(global_step))
|
||||
|
||||
def test1WorkerNegativeStaleness(self):
|
||||
num_workers = 1
|
||||
sessions, graphs, train_ops = _get_workers(num_workers, -1)
|
||||
with graphs[0].as_default():
|
||||
sessions[0].run(variables.global_variables_initializer())
|
||||
global_step = training_util.get_global_step(graphs[0])
|
||||
var_0 = graphs[0].get_tensor_by_name('v0:0')
|
||||
var_1 = graphs[0].get_tensor_by_name('v1:0')
|
||||
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
|
||||
# Verify the initialized value.
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
|
||||
sessions[0].run(train_ops[0])
|
||||
|
||||
# Verify no updates because max staleness is negative.
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
|
||||
def test2WorkersStaleness0(self):
|
||||
num_workers = 2
|
||||
sessions, graphs, train_ops = _get_workers(num_workers, 0)
|
||||
with graphs[0].as_default():
|
||||
sessions[0].run(variables.global_variables_initializer())
|
||||
global_step = training_util.get_global_step(graphs[0])
|
||||
var_0 = graphs[0].get_tensor_by_name('v0:0')
|
||||
var_1 = graphs[0].get_tensor_by_name('v1:0')
|
||||
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
|
||||
# Verify the initialized value.
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
|
||||
thread_0 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[0], sessions[0]))
|
||||
thread_1 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[1], sessions[1]))
|
||||
thread_0.start()
|
||||
thread_1.start()
|
||||
thread_0.join()
|
||||
thread_1.join()
|
||||
|
||||
# With 2 workers and max staleness set to 0, only the chief worker will update
|
||||
# var_0 and var_1.
|
||||
self.assertAllEqual(1, sessions[0].run(global_step))
|
||||
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
|
||||
|
||||
def test2WorkersStaleness1(self):
|
||||
num_workers = 2
|
||||
sessions, graphs, train_ops = _get_workers(num_workers, 1)
|
||||
with graphs[0].as_default():
|
||||
sessions[0].run(variables.global_variables_initializer())
|
||||
global_step = training_util.get_global_step(graphs[0])
|
||||
var_0 = graphs[0].get_tensor_by_name('v0:0')
|
||||
var_1 = graphs[0].get_tensor_by_name('v1:0')
|
||||
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
|
||||
# Verify the initialized value.
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
|
||||
thread_0 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[0], sessions[0]))
|
||||
thread_1 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[1], sessions[1]))
|
||||
thread_0.start()
|
||||
thread_1.start()
|
||||
thread_0.join()
|
||||
thread_1.join()
|
||||
|
||||
# With 2 workers and max staleness set to 1, both workers will update
|
||||
# var_0 and var_1.
|
||||
self.assertAllEqual(2, sessions[0].run(global_step))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
|
||||
|
||||
def test3WorkersStaleness0(self):
|
||||
num_workers = 3
|
||||
sessions, graphs, train_ops = _get_workers(num_workers, 0)
|
||||
with graphs[0].as_default():
|
||||
sessions[0].run(variables.global_variables_initializer())
|
||||
global_step = training_util.get_global_step(graphs[0])
|
||||
var_0 = graphs[0].get_tensor_by_name('v0:0')
|
||||
var_1 = graphs[0].get_tensor_by_name('v1:0')
|
||||
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
|
||||
# Verify the initialized value.
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
|
||||
thread_0 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[0], sessions[0]))
|
||||
thread_1 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[1], sessions[1]))
|
||||
thread_2 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[2], sessions[2]))
|
||||
thread_0.start()
|
||||
thread_1.start()
|
||||
thread_2.start()
|
||||
thread_0.join()
|
||||
thread_1.join()
|
||||
thread_2.join()
|
||||
|
||||
# With 3 workers and max staleness set to 0, only the chief worker will update
|
||||
# var_0 and var_1.
|
||||
self.assertAllEqual(1, sessions[0].run(global_step))
|
||||
self.assertAllEqual(2.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
|
||||
|
||||
def test3WorkersStaleness1(self):
|
||||
num_workers = 3
|
||||
sessions, graphs, train_ops = _get_workers(num_workers, 1)
|
||||
with graphs[0].as_default():
|
||||
sessions[0].run(variables.global_variables_initializer())
|
||||
global_step = training_util.get_global_step(graphs[0])
|
||||
var_0 = graphs[0].get_tensor_by_name('v0:0')
|
||||
var_1 = graphs[0].get_tensor_by_name('v1:0')
|
||||
stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
|
||||
# Verify the initialized value.
|
||||
self.assertAllEqual(0.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0, sessions[0].run(var_1))
|
||||
self.assertAllEqual(0.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0, sessions[0].run(global_step))
|
||||
|
||||
thread_0 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[0], sessions[0]))
|
||||
thread_1 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[1], sessions[1]))
|
||||
thread_2 = self.checkedThread(
|
||||
target=self._run, args=(train_ops[2], sessions[2]))
|
||||
thread_0.start()
|
||||
thread_1.start()
|
||||
thread_2.start()
|
||||
thread_0.join()
|
||||
thread_1.join()
|
||||
thread_2.join()
|
||||
|
||||
# With 3 workers and max staleness set to 1, chief worker and only one of
|
||||
# the two other workers will update var_0 and var_1.
|
||||
self.assertAllEqual(2, sessions[0].run(global_step))
|
||||
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
|
||||
self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
|
||||
self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test.main()
|
@ -849,14 +849,12 @@ class RNNCellTest(test.TestCase):
|
||||
batch_size = 3
|
||||
input_size = 4
|
||||
expected_state_c = np.array(
|
||||
[[2.954548e-01, 8.354891e-04],
|
||||
[2.834632e-01, 8.158963e-01],
|
||||
[2.291694e-01, 1.325745e-04]],
|
||||
[[0.00072015, 0.00036633], [0.00083481, 0.00047266],
|
||||
[0.00085111, 0.00053054]],
|
||||
dtype=np.float32)
|
||||
expected_state_h = np.array(
|
||||
[[2.116566e-01, 5.985238e-04],
|
||||
[2.137760e-01, 6.153145e-01],
|
||||
[1.742966e-01, 1.008306e-04]],
|
||||
[[0.0005159, 0.00026243], [0.00062958, 0.00035646],
|
||||
[0.00064732, 0.00040351]],
|
||||
dtype=np.float32)
|
||||
with variable_scope.variable_scope(
|
||||
"root", initializer=init_ops.constant_initializer(0.5)):
|
||||
|
@ -11,7 +11,12 @@ Consultants: Jon Shlens, Pete Warden
|
||||
1. Measure model parameters, float operations, tensor shapes.
|
||||
2. Measure op execution times, requested memory size and device placement.
|
||||
3. Inspect checkpoint tensors' shapes and their values.
|
||||
4. Explore model based on name scope or graph structure.
|
||||
4. Three ways to view and explore TensorFlow model profiles
|
||||
|
||||
* Organize by Python code call stack.
|
||||
* Organize by TensorFlow operation name scope hierarchies.
|
||||
* Organize by TensorFlow operation inputs/outputs graph.
|
||||
|
||||
5. Selectively grouping/filtering/accounting/ordering ops.
|
||||
|
||||
tfprof can be used as Python API, Interactive CLI and One-shot Script.
|
||||
@ -28,7 +33,8 @@ param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
|
||||
tfprof_options=tf.contrib.tfprof.model_analyzer.
|
||||
TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
|
||||
|
||||
# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
|
||||
# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
|
||||
# It organizes the statistics
# of each graph node in a tree structure. Let's print the root below.
|
||||
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
|
||||
```
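Since `param_stats` is the root of a tree of `TFGraphNodeProto` messages, the per-scope breakdown can be walked recursively. A small sketch (field names as used by the proto above):

```python
def print_tree(node, indent=0):
  # Each node carries its own totals plus a `children` list.
  print('%s%s: %d params' % (' ' * indent, node.name, node.total_parameters))
  for child in node.children:
    print_tree(child, indent + 2)

print_tree(param_stats)
```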
|
||||
|
@ -21,16 +21,34 @@ py_test(
|
||||
name = "model_analyzer_test",
|
||||
srcs = ["model_analyzer_test.py"],
|
||||
srcs_version = "PY2AND3",
|
||||
tags = ["no_pip"],
|
||||
deps = [
|
||||
":model_analyzer",
|
||||
"//tensorflow/core:protos_all_py",
|
||||
"//tensorflow/python:array_ops",
|
||||
":model_analyzer_testlib",
|
||||
"//tensorflow/python:client",
|
||||
"//tensorflow/python:client_testlib",
|
||||
"//tensorflow/python:framework_for_generated_wrappers",
|
||||
"//tensorflow/python:platform",
|
||||
"//tensorflow/python:variables",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "model_analyzer_testlib",
|
||||
srcs = ["model_analyzer_testlib.py"],
|
||||
srcs_version = "PY2AND3",
|
||||
deps = [
|
||||
":model_analyzer",
|
||||
"//tensorflow/contrib/rnn:rnn_py",
|
||||
"//tensorflow/core:protos_all_py",
|
||||
"//tensorflow/python:array_ops",
|
||||
"//tensorflow/python:framework_for_generated_wrappers",
|
||||
"//tensorflow/python:init_ops",
|
||||
"//tensorflow/python:math_ops",
|
||||
"//tensorflow/python:nn_ops",
|
||||
"//tensorflow/python:platform",
|
||||
"//tensorflow/python:rnn",
|
||||
"//tensorflow/python:training",
|
||||
"//tensorflow/python:variable_scope",
|
||||
"//tensorflow/python:variables",
|
||||
],
|
||||
|
@ -123,7 +123,7 @@ def print_model_analysis(graph,
|
||||
"""Print model statistics.
|
||||
|
||||
Prints the model statistics to stdout. Also returns the results
|
||||
in a TFProfNode proto. See go/tfprof or run tfprof tool:
|
||||
in a TFGraphNodeProto proto. See go/tfprof or run tfprof tool:
|
||||
'bazel run third_party/tensorflow/tools/tfprof help'
|
||||
|
||||
Examples:
|
||||
@ -142,15 +142,19 @@ def print_model_analysis(graph,
|
||||
'micros' and 'bytes'.
|
||||
op_log: tensorflow::tfprof::OpLog proto. Users can use this proto to
group ops together and use an op_type to select the group.
|
||||
tfprof_cmd: string. Either 'scope' or 'graph'. 'scope' view organize
|
||||
ops using their name scopes. 'graph' view organize ops using
|
||||
their graph inputs.
|
||||
tfprof_cmd: string. Either 'scope', 'graph' or 'code'.
'scope' view organizes outputs using ops' name scopes.
'graph' view organizes outputs using ops' inputs/outputs.
'code' view organizes outputs using the Python call stack.
|
||||
tfprof_options: See 'tfprof help' for details.
|
||||
Returns:
|
||||
TFProfNode proto. Side effect: a formatted output to stdout.
|
||||
If tfprof_cmd is 'scope' or 'graph', returns TFGraphNodeProto proto.
|
||||
If tfprof_cmd is 'code', returns TFCodeNodeProto proto.
|
||||
Side effect: a formatted output to stdout.
|
||||
"""
|
||||
# pylint: disable=protected-access
|
||||
op_log = tfprof_logger._merge_default_with_oplog(graph, op_log, run_meta)
|
||||
op_log = tfprof_logger._merge_default_with_oplog(
|
||||
graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
|
||||
# pylint: enable=protected-access
|
||||
opts = tfprof_options_pb2.OptionsProto()
|
||||
opts.max_depth = tfprof_options['max_depth']
|
||||
@ -178,11 +182,24 @@ def print_model_analysis(graph,
|
||||
opts.dump_to_file = tfprof_options['dump_to_file']
|
||||
|
||||
run_meta_str = run_meta.SerializeToString() if run_meta else b''
|
||||
op_log_str = op_log.SerializeToString() if op_log else b''
|
||||
|
||||
tfprof_node = tfprof_output_pb2.TFProfNode()
|
||||
tfprof_node.ParseFromString(
|
||||
print_mdl.PrintModelAnalysis(
|
||||
graph.as_graph_def().SerializeToString(), run_meta_str, op_log_str,
|
||||
tfprof_cmd.encode('utf-8'), opts.SerializeToString()))
|
||||
if tfprof_cmd == 'code':
|
||||
tfprof_node = tfprof_output_pb2.TFCodeNodeProto()
|
||||
tfprof_node.ParseFromString(
|
||||
print_mdl.PrintModelAnalysis(
|
||||
graph.as_graph_def().SerializeToString(),
|
||||
run_meta_str,
|
||||
op_log.SerializeToString(),
|
||||
tfprof_cmd.encode('utf-8'),
|
||||
opts.SerializeToString()))
|
||||
else:
|
||||
tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
|
||||
tfprof_node.ParseFromString(
|
||||
print_mdl.PrintModelAnalysis(
|
||||
graph.as_graph_def().SerializeToString(),
|
||||
run_meta_str,
|
||||
op_log.SerializeToString(),
|
||||
tfprof_cmd.encode('utf-8'),
|
||||
opts.SerializeToString()))
|
||||
|
||||
return tfprof_node
|
||||
|
@ -18,49 +18,27 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
|
||||
from tensorflow.core.protobuf import config_pb2
|
||||
from tensorflow.python.client import session
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import init_ops
|
||||
from tensorflow.python.ops import nn_ops
|
||||
from tensorflow.python.ops import variable_scope
|
||||
from tensorflow.python.ops import variables
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
# XXX: this depends on pywrap_tensorflow and must come later
|
||||
from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
|
||||
from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer_testlib as lib
|
||||
|
||||
|
||||
class PrintModelAnalysisTest(test.TestCase):
|
||||
|
||||
def _BuildSmallModel(self):
|
||||
image = array_ops.zeros([2, 6, 6, 3])
|
||||
_ = variable_scope.get_variable(
|
||||
'ScalarW', [],
|
||||
dtypes.float32,
|
||||
initializer=init_ops.random_normal_initializer(stddev=0.001))
|
||||
kernel = variable_scope.get_variable(
|
||||
'DW', [3, 3, 3, 6],
|
||||
dtypes.float32,
|
||||
initializer=init_ops.random_normal_initializer(stddev=0.001))
|
||||
x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
|
||||
kernel = variable_scope.get_variable(
|
||||
'DW2', [2, 2, 6, 12],
|
||||
dtypes.float32,
|
||||
initializer=init_ops.random_normal_initializer(stddev=0.001))
|
||||
x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
|
||||
return x
|
||||
|
||||
def testDumpToFile(self):
|
||||
ops.reset_default_graph()
|
||||
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
|
||||
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
|
||||
|
||||
with session.Session() as sess, ops.device('/cpu:0'):
|
||||
_ = self._BuildSmallModel()
|
||||
_ = lib.BuildSmallModel()
|
||||
model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)
|
||||
|
||||
with gfile.Open(opts['dump_to_file'], 'r') as f:
|
||||
@ -71,6 +49,7 @@ class PrintModelAnalysisTest(test.TestCase):
|
||||
f.read())
|
||||
|
||||
def testSelectEverything(self):
|
||||
ops.reset_default_graph()
|
||||
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
|
||||
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
|
||||
opts['account_type_regexes'] = ['.*']
|
||||
@ -78,8 +57,10 @@ class PrintModelAnalysisTest(test.TestCase):
|
||||
'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types'
|
||||
]
|
||||
|
||||
with session.Session() as sess, ops.device('/cpu:0'):
|
||||
x = self._BuildSmallModel()
|
||||
config = config_pb2.ConfigProto(
|
||||
graph_options=config_pb2.GraphOptions(build_cost_model=1))
|
||||
with session.Session(config=config) as sess, ops.device('/cpu:0'):
|
||||
x = lib.BuildSmallModel()
|
||||
|
||||
sess.run(variables.global_variables_initializer())
|
||||
run_meta = config_pb2.RunMetadata()
|
||||
@ -98,6 +79,118 @@ class PrintModelAnalysisTest(test.TestCase):
|
||||
f.read())
|
||||
# pylint: enable=line-too-long
|
||||
|
||||
def testSimpleCodeView(self):
|
||||
ops.reset_default_graph()
|
||||
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
|
||||
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
|
||||
opts['account_type_regexes'] = ['.*']
|
||||
opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
|
||||
opts['account_displayed_op_only'] = False
|
||||
# TODO(xpan): Test 'micros'. Since the execution time changes each run,
|
||||
# it's a bit difficult to test it now.
|
||||
opts['select'] = [
|
||||
'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
|
||||
]
|
||||
|
||||
config = config_pb2.ConfigProto(
|
||||
graph_options=config_pb2.GraphOptions(build_cost_model=1))
|
||||
with session.Session(config=config) as sess, ops.device('/cpu:0'):
|
||||
x = lib.BuildSmallModel()
|
||||
|
||||
sess.run(variables.global_variables_initializer())
|
||||
run_meta = config_pb2.RunMetadata()
|
||||
_ = sess.run(x,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.FULL_TRACE),
|
||||
run_metadata=run_meta)
|
||||
|
||||
model_analyzer.print_model_analysis(
|
||||
sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
|
||||
|
||||
with gfile.Open(opts['dump_to_file'], 'r') as f:
|
||||
# pylint: disable=line-too-long
|
||||
self.assertEqual(
|
||||
'_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB)\n model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops, 0B/864B)\n model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/1 params, 0/0 flops, 0B/0B)\n model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/162 params, 0/0 flops, 0B/1.30KB)\n model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/5.83k flops, 0B/432B)\n model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/288 params, 0/0 flops, 0B/2.30KB)\n model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/4.61k flops, 0B/384B)\n',
|
||||
f.read())
|
||||
# pylint: enable=line-too-long
|
||||
|
||||
def testComplexCodeView(self):
|
||||
ops.reset_default_graph()
|
||||
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
|
||||
opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
|
||||
opts['account_type_regexes'] = ['.*']
|
||||
opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
|
||||
opts['account_displayed_op_only'] = False
|
||||
opts['select'] = ['params', 'float_ops']
|
||||
|
||||
config = config_pb2.ConfigProto(
|
||||
graph_options=config_pb2.GraphOptions(build_cost_model=1))
|
||||
with session.Session(config=config) as sess, ops.device('/cpu:0'):
|
||||
x = lib.BuildFullModel()
|
||||
|
||||
sess.run(variables.global_variables_initializer())
|
||||
run_meta = config_pb2.RunMetadata()
|
||||
_ = sess.run(x,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.FULL_TRACE),
|
||||
run_metadata=run_meta)
|
||||
|
||||
tfprof_node = model_analyzer.print_model_analysis(
|
||||
sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
with gfile.Open(opts['dump_to_file'], 'r') as f:
|
||||
self.assertEqual(
|
||||
'_TFProfRoot (0/2.84k params, 0/54.08k flops)\n model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_... (0/1.80k params, 0/41.76k flops)\n model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/4 params, 0/0 flops)\n model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/648 params, 0/0 flops)\n model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/23.33k flops)\n model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/1.15k params, 0/0 flops)\n model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/18.43k flops)\n model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c... (0/1.04k params, 0/4.13k flops)\n model_analyzer_testlib.py:62:BuildFullModel:target = array_op... (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_... (0/0 params, 0/0 flops)\n model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min... (0/0 params, 0/8.19k flops)\n',
|
||||
f.read())
|
||||
|
||||
self.assertLess(0, tfprof_node.total_exec_micros)
|
||||
self.assertEqual(2844, tfprof_node.total_parameters)
|
||||
self.assertEqual(54080, tfprof_node.total_float_ops)
|
||||
self.assertEqual(5, len(tfprof_node.children))
|
||||
self.assertEqual('_TFProfRoot', tfprof_node.name)
|
||||
self.assertEqual('model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_...',
|
||||
tfprof_node.children[0].name)
|
||||
self.assertEqual('model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c...',
|
||||
tfprof_node.children[1].name)
|
||||
self.assertEqual('model_analyzer_testlib.py:62:BuildFullModel:target = array_op...',
|
||||
tfprof_node.children[2].name)
|
||||
self.assertEqual('model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_...',
|
||||
tfprof_node.children[3].name)
|
||||
self.assertEqual('model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min...',
|
||||
tfprof_node.children[4].name)
|
||||
# pylint: enable=line-too-long
|
||||
|
||||
def testCodeViewLeafGraphNode(self):
|
||||
ops.reset_default_graph()
|
||||
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
|
||||
opts['account_type_regexes'] = ['.*']
|
||||
opts['account_displayed_op_only'] = False
|
||||
opts['select'] = [
|
||||
'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device'
|
||||
]
|
||||
|
||||
config = config_pb2.ConfigProto(
|
||||
graph_options=config_pb2.GraphOptions(build_cost_model=1))
|
||||
with session.Session(config=config) as sess, ops.device('/cpu:0'):
|
||||
x = lib.BuildSmallModel()
|
||||
|
||||
sess.run(variables.global_variables_initializer())
|
||||
run_meta = config_pb2.RunMetadata()
|
||||
_ = sess.run(x,
|
||||
options=config_pb2.RunOptions(
|
||||
trace_level=config_pb2.RunOptions.FULL_TRACE),
|
||||
run_metadata=run_meta)
|
||||
|
||||
tfprof_node = model_analyzer.print_model_analysis(
|
||||
sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
|
||||
|
||||
leaf = tfprof_node
|
||||
while leaf.children:
|
||||
self.assertEqual(0, len(leaf.graph_nodes))
|
||||
leaf = leaf.children[0]
|
||||
self.assertEqual(1, len(leaf.graph_nodes))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test.main()
|
||||
|
@ -0,0 +1,67 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A test lib that defines some models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.contrib.rnn.python.ops.core_rnn_cell import BasicRNNCell
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import variable_scope
from tensorflow.python.training import gradient_descent


def BuildSmallModel():
  """Build a small forward conv model."""
  image = array_ops.zeros([2, 6, 6, 3])
  _ = variable_scope.get_variable(
      'ScalarW', [],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  kernel = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
  kernel = variable_scope.get_variable(
      'DW2', [2, 2, 6, 12],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
  return x


def BuildFullModel():
  """Build the full model with conv,rnn,opt."""
  seq = []
  for i in range(4):
    with variable_scope.variable_scope('inp_%d' % i):
      seq.append(array_ops.reshape(BuildSmallModel(), [2, 1, -1]))

  cell = BasicRNNCell(16, 48)
  out = rnn.dynamic_rnn(
      cell, array_ops.concat(seq, axis=1), dtype=dtypes.float32)[0]

  target = array_ops.ones_like(out)
  loss = nn_ops.l2_loss(math_ops.reduce_mean(target - out))
  sgd_op = gradient_descent.GradientDescentOptimizer(1e-2)
  return sgd_op.minimize(loss)
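As a rough usage sketch, the library above can be profiled with the new tfprof 'code' view. The import paths below are assumptions; the individual calls mirror the test code in model_analyzer_test.py earlier in this change.

```python
# Hedged sketch: profile BuildFullModel() and group stats by source line.
from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer_testlib as lib
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import ops
from tensorflow.python.ops import variables

ops.reset_default_graph()
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['account_type_regexes'] = ['.*']
opts['select'] = ['params', 'float_ops']

config = config_pb2.ConfigProto(
    graph_options=config_pb2.GraphOptions(build_cost_model=1))
with session.Session(config=config) as sess:
  train_op = lib.BuildFullModel()
  sess.run(variables.global_variables_initializer())
  run_meta = config_pb2.RunMetadata()
  sess.run(train_op,
           options=config_pb2.RunOptions(
               trace_level=config_pb2.RunOptions.FULL_TRACE),
           run_metadata=run_meta)
  # 'code' groups the statistics by the Python line that created each op.
  tfprof_node = model_analyzer.print_model_analysis(
      sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
```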
@ -96,12 +96,13 @@ class PrintModelAnalysisTest(test.TestCase):
|
||||
|
||||
with session.Session() as sess, ops.device('/cpu:0'):
|
||||
_ = self._BuildSmallModel()
|
||||
tfprof_pb = tfprof_output_pb2.TFProfNode()
|
||||
tfprof_pb = tfprof_output_pb2.TFGraphNodeProto()
|
||||
tfprof_pb.ParseFromString(
|
||||
print_mdl.PrintModelAnalysis(sess.graph.as_graph_def(
|
||||
).SerializeToString(), b'', b'', b'scope', opts.SerializeToString()))
|
||||
print_mdl.PrintModelAnalysis(
|
||||
sess.graph.as_graph_def().SerializeToString(),
|
||||
b'', b'', b'scope', opts.SerializeToString()))
|
||||
|
||||
expected_pb = tfprof_output_pb2.TFProfNode()
|
||||
expected_pb = tfprof_output_pb2.TFGraphNodeProto()
|
||||
text_format.Merge(r"""name: "_TFProfRoot"
|
||||
exec_micros: 0
|
||||
requested_bytes: 0
|
||||
|
@ -62,12 +62,13 @@ def _fill_missing_graph_shape(graph, run_meta):
|
||||
return graph
|
||||
|
||||
|
||||
def _get_logged_ops(graph, run_meta=None):
|
||||
def _get_logged_ops(graph, run_meta=None, add_trace=False):
|
||||
"""Extract trainable model parameters and FLOPs for ops from a Graph.
|
||||
|
||||
Args:
|
||||
graph: tf.Graph.
|
||||
run_meta: RunMetadata proto used to complete shape information.
|
||||
add_trace: Whether to add op trace information.
|
||||
Returns:
|
||||
logged_ops: dict mapping from op_name to OpLogEntry.
|
||||
"""
|
||||
@ -76,21 +77,32 @@ def _get_logged_ops(graph, run_meta=None):
|
||||
|
||||
op_missing_shape = 0
|
||||
logged_ops = {}
|
||||
graph_def = graph.as_graph_def()
|
||||
for node in graph_def.node:
|
||||
for op in graph.get_operations():
|
||||
try:
|
||||
stats = ops.get_stats_for_node_def(graph, node, REGISTERED_FLOP_STATS)
|
||||
stats = ops.get_stats_for_node_def(
|
||||
graph, op.node_def, REGISTERED_FLOP_STATS)
|
||||
except ValueError:
|
||||
# Catch Exception When shape is incomplete. Skip it.
|
||||
op_missing_shape += 1
|
||||
stats = None
|
||||
|
||||
if not stats or not stats.value:
|
||||
continue
|
||||
if node.name not in logged_ops:
|
||||
entry = tfprof_log_pb2.OpLogEntry()
|
||||
entry.name = node.name
|
||||
entry = tfprof_log_pb2.OpLogEntry()
|
||||
entry.name = op.name
|
||||
add_entry = False
|
||||
if stats and stats.value:
|
||||
entry.float_ops = int(stats.value)
|
||||
add_entry = True
|
||||
|
||||
if add_trace:
|
||||
for tb in op.traceback:
|
||||
trace = entry.code_def.traces.add()
|
||||
trace.file = tb[0] if tb[0] else 'none'
|
||||
trace.lineno = tb[1] if tb[1] else -1
|
||||
trace.function = tb[2] if tb[2] else 'none'
|
||||
trace.line = tb[3] if tb[3] else 'none'
|
||||
add_entry = True
|
||||
|
||||
if add_entry:
|
||||
logged_ops[entry.name] = entry
|
||||
|
||||
for v in graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES):
|
||||
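For reference, each frame consumed by the `tb[0]`..`tb[3]` indexing above is a `(file, lineno, function, line)` tuple from `op.traceback`. A minimal sketch of inspecting one directly, using only the attribute the change itself relies on:

```python
# Each traceback frame is (file, lineno, function, source line text).
import tensorflow as tf

with tf.Graph().as_default():
  x = tf.constant(1.0, name='x')
  for tb in x.op.traceback:
    print(tb[0], tb[1], tb[2], tb[3])
```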
@ -108,18 +120,21 @@ def _get_logged_ops(graph, run_meta=None):
|
||||
return logged_ops
|
||||
|
||||
|
||||
def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
|
||||
def _merge_default_with_oplog(graph, op_log=None,
|
||||
run_meta=None,
|
||||
add_trace=False):
|
||||
"""Merge the tfprof default extra info with caller's op_log.
|
||||
|
||||
Args:
|
||||
graph: tf.Graph.
|
||||
op_log: OpLog proto.
|
||||
run_meta: RunMetadata proto used to complete shape information.
|
||||
add_trace: Whether to add op trace information.
|
||||
Returns:
|
||||
tmp_op_log: Merged OpLog proto.
|
||||
"""
|
||||
tmp_op_log = tfprof_log_pb2.OpLog()
|
||||
logged_ops = _get_logged_ops(graph, run_meta)
|
||||
logged_ops = _get_logged_ops(graph, run_meta, add_trace=add_trace)
|
||||
if not op_log:
|
||||
tmp_op_log.log_entries.extend(logged_ops.values())
|
||||
else:
|
||||
@ -131,13 +146,16 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
|
||||
all_ops[op_name].types.extend(entry.types)
|
||||
if entry.float_ops > 0 and all_ops[op_name].float_ops == 0:
|
||||
all_ops[op_name].float_ops = entry.float_ops
|
||||
if entry.code_def.traces and not all_ops[op_name].code_def.traces:
|
||||
all_ops[op_name].code_def.MergeFrom(entry.code_def)
|
||||
else:
|
||||
all_ops[op_name] = entry
|
||||
tmp_op_log.log_entries.extend(all_ops.values())
|
||||
return tmp_op_log
|
||||
|
||||
|
||||
def write_op_log(graph, log_dir, op_log=None, run_meta=None):
|
||||
def write_op_log(graph, log_dir, op_log=None, run_meta=None,
|
||||
add_trace=False):
|
||||
"""Log provided 'op_log', and add additional model information below.
|
||||
|
||||
The API also assigns ops in tf.trainable_variables() an op type called
|
||||
@ -154,8 +172,9 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None):
|
||||
one is created.
|
||||
run_meta: (Optional) RunMetadata proto that helps flops computation using
|
||||
run time shape information.
|
||||
add_trace: Whether to add op trace information. Used to support "code" view.
|
||||
"""
|
||||
op_log = _merge_default_with_oplog(graph, op_log, run_meta)
|
||||
op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
|
||||
|
||||
with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
|
||||
log.write(op_log.SerializeToString())
|
||||
|
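A hedged usage sketch of the new `add_trace` flag; the `tfprof_logger` module path is an assumption, while the `write_op_log` signature comes from the change above.

```python
# Write an op log that carries Python tracebacks, so the tfprof 'code' view
# can group ops by the source line that created them.
import tensorflow as tf
from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_logger

with tf.Graph().as_default() as g:
  x = tf.Variable(tf.zeros([10, 10]), name='x')
  y = tf.matmul(x, x)
  tfprof_logger.write_op_log(g, log_dir='/tmp/tfprof_log', add_trace=True)
```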
@ -156,6 +156,7 @@ CORE_PROTO_SRCS = [
|
||||
"protobuf/config.proto",
|
||||
"protobuf/cluster.proto",
|
||||
"protobuf/debug.proto",
|
||||
"protobuf/device_properties.proto",
|
||||
"protobuf/queue_runner.proto",
|
||||
"protobuf/rewriter_config.proto",
|
||||
"protobuf/tensor_bundle.proto",
|
||||
|
@ -829,7 +829,8 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
|
||||
// Given a "caller" in "graph", which is a function call of a function
|
||||
// to "fbody". Replaces the "caller" with fbody->graph and connects
|
||||
// edges properly.
|
||||
static void InlineFunctionBody(Graph* g, Node* caller,
|
||||
static void InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
|
||||
Graph* g, Node* caller,
|
||||
const FunctionBody* fbody) {
|
||||
if (!ValidateInlining(caller, fbody)) {
|
||||
LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. "
|
||||
@ -837,6 +838,23 @@ static void InlineFunctionBody(Graph* g, Node* caller,
|
||||
return;
|
||||
}
|
||||
|
||||
// Input edges. For data edges coming into "caller", we first compute the
|
||||
// <src>:<src_output> for the i-th input in "inputs".
|
||||
// If "caller" has any input control dependencies, we add a NoOp
|
||||
// node "input_control_node", which depends on "caller"'s control inputs.
|
||||
std::vector<Endpoint> inputs(caller->num_inputs());
|
||||
Node* input_control_node = nullptr;
|
||||
for (const Edge* e : caller->in_edges()) {
|
||||
if (e->IsControlEdge()) {
|
||||
if (input_control_node == nullptr) {
|
||||
input_control_node = AddNoOp(g);
|
||||
}
|
||||
g->AddControlEdge(e->src(), input_control_node);
|
||||
} else {
|
||||
inputs[e->dst_input()] = {e->src(), e->src_output()};
|
||||
}
|
||||
}
|
||||
|
||||
// Duplicate fbody->graph into 'g'. First, we copy the nodes of
|
||||
// fbody->graph into 'g' except the source and sink nodes. We copy
|
||||
// edges among nodes in 'fbody->graph'.
|
||||
@ -850,8 +868,35 @@ static void InlineFunctionBody(Graph* g, Node* caller,
|
||||
CHECK(n->IsOp());
|
||||
NodeDef ndef = n->def();
|
||||
ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
|
||||
node_map[n->id()] = g->AddNode(ndef, &s);
|
||||
Node* clone = g->AddNode(ndef, &s);
|
||||
TF_CHECK_OK(s);
|
||||
node_map[n->id()] = clone;
|
||||
|
||||
// If there is an input control node, and one of:
|
||||
// a) the node has no data or control inputs, or
|
||||
// b) the node is a function call or SymbolicGradient,
|
||||
// then add a control edge from the input control node to the clone.
|
||||
//
|
||||
// We must not execute any nodes if the original function call would not
|
||||
// have executed. This is especially critical when the function call is
|
||||
// inside a control-flow construct like tf.cond(). Case (a) ensures that
|
||||
// such nodes do not run.
|
||||
//
|
||||
// The purpose of case (b) is to ensure that instances of case (a) created
|
||||
// by further inlining steps also receive the control dependency.
|
||||
if (input_control_node) {
|
||||
bool has_inputs = false;
|
||||
for (const Edge* e : n->in_edges()) {
|
||||
if (!e->src()->IsSource()) {
|
||||
has_inputs = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!has_inputs || flib_def.Find(clone->type_string()) != nullptr ||
|
||||
clone->type_string() == "SymbolicGradient") {
|
||||
g->AddControlEdge(input_control_node, clone);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const Edge* e : fbody->graph->edges()) {
|
||||
if (e->src()->IsSource() || e->src()->IsSink() || e->dst()->IsSource() ||
|
||||
@ -865,29 +910,12 @@ static void InlineFunctionBody(Graph* g, Node* caller,
|
||||
|
||||
// Connect input edges.
|
||||
//
|
||||
// For data edges coming into "caller", we first compute the
|
||||
// <src>:<src_output> for the i-th input in "inputs". We create one
|
||||
// Identity node for each input. Then, we connect inputs[i] to to
|
||||
// the i-th identity node added. The nodes that previously connects
|
||||
// to the j-th output of i-th arg node are reconnected to th i-th
|
||||
// We create one Identity node for each input. Then, we connect inputs[i] to
|
||||
// the i-th identity node added. The nodes that previously connected
|
||||
// to the j-th output of i-th arg node are reconnected to the i-th
|
||||
// identity node.
|
||||
//
|
||||
// If "caller" has any input control dependencies, we add a NoOp
|
||||
// node "input_control_node". This "input_control_node" depends on
|
||||
// what "caller" depends on, and the added identity nodes depend on
|
||||
// "input_control_node".
|
||||
std::vector<Endpoint> inputs(caller->num_inputs());
|
||||
Node* input_control_node = nullptr;
|
||||
for (const Edge* e : caller->in_edges()) {
|
||||
if (e->IsControlEdge()) {
|
||||
if (input_control_node == nullptr) {
|
||||
input_control_node = AddNoOp(g);
|
||||
}
|
||||
g->AddControlEdge(e->src(), input_control_node);
|
||||
} else {
|
||||
inputs[e->dst_input()] = {e->src(), e->src_output()};
|
||||
}
|
||||
}
|
||||
// The added identity nodes depend on "input_control_node".
|
||||
for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) {
|
||||
Node* arg = node_map[fbody->arg_nodes[i]->id()];
|
||||
Node* n = AddIdentity(g, inputs[i]);
|
||||
@ -982,7 +1010,7 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
|
||||
candidates.push_back({node, fbody});
|
||||
}
|
||||
for (const auto& p : candidates) {
|
||||
InlineFunctionBody(graph, p.first, p.second);
|
||||
InlineFunctionBody(*fld, graph, p.first, p.second);
|
||||
}
|
||||
return !candidates.empty();
|
||||
}
|
||||
|
@ -391,6 +391,90 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies that control dependencies on the caller are added as control
|
||||
// dependencies on any function calls created by inlining.
|
||||
TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
|
||||
Init({test::function::XTimesTwo(), test::function::XTimesFour()});
|
||||
|
||||
std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
|
||||
{
|
||||
Scope s = Scope::NewRootScope();
|
||||
TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
|
||||
auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
|
||||
auto c = ops::NoOp(s.WithOpName("c"));
|
||||
auto b = Call(&s, "b", "XTimesFour", {a});
|
||||
s.graph()->AddControlEdge(c.operation.node(), b.node());
|
||||
auto ret = ops::_Retval(s.WithOpName("b_RetVal"), b, 0);
|
||||
TF_ASSERT_OK(s.ToGraph(g.get()));
|
||||
}
|
||||
|
||||
ExpandInlineFunctions(lib_.get(), g.get());
|
||||
{
|
||||
Scope s = Scope::NewRootScope();
|
||||
TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
|
||||
auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
|
||||
auto c = ops::NoOp(s.WithOpName("c"));
|
||||
auto func0 =
|
||||
ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
|
||||
auto func1 = ops::Identity(
|
||||
s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
|
||||
auto b_x2 = Call(&s, "b/x2", "XTimesTwo", {func1});
|
||||
s.graph()->AddControlEdge(func0.operation.node(), b_x2.node());
|
||||
auto b_y = Call(&s, "b/y", "XTimesTwo", {b_x2});
|
||||
s.graph()->AddControlEdge(func0.operation.node(), b_y.node());
|
||||
auto func2 = ops::Identity(s.WithOpName("Func/_2"), b_y);
|
||||
auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
|
||||
GraphDef expected;
|
||||
TF_ASSERT_OK(s.ToGraphDef(&expected));
|
||||
|
||||
GraphDef actual;
|
||||
g->ToGraphDef(&actual);
|
||||
TF_EXPECT_GRAPH_EQ(expected, actual);
|
||||
}
|
||||
|
||||
ExpandInlineFunctions(lib_.get(), g.get());
|
||||
{
|
||||
Scope s = Scope::NewRootScope();
|
||||
TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
|
||||
auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
|
||||
auto c = ops::NoOp(s.WithOpName("c"));
|
||||
auto func0 =
|
||||
ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
|
||||
auto func1 = ops::Identity(
|
||||
s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
|
||||
|
||||
auto func3 =
|
||||
ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies({func0}));
|
||||
auto func4 = ops::Identity(
|
||||
s.WithOpName("Func/_4").WithControlDependencies({func3}), func1);
|
||||
auto b_x2_two = ops::Const(
|
||||
s.WithOpName("b/x2/two").WithControlDependencies({func3}), 2LL);
|
||||
auto b_x2_scale = ops::Cast(s.WithOpName("b/x2/scale"), b_x2_two, DT_FLOAT);
|
||||
auto b_x2_y = ops::Mul(s.WithOpName("b/x2/y"), func4, b_x2_scale);
|
||||
auto func5 = ops::Identity(s.WithOpName("Func/_5"), b_x2_y);
|
||||
|
||||
auto func6 =
|
||||
ops::NoOp(s.WithOpName("Func/_6").WithControlDependencies({func0}));
|
||||
auto func7 = ops::Identity(
|
||||
s.WithOpName("Func/_7").WithControlDependencies({func6}), func5);
|
||||
auto b_y_two = ops::Const(
|
||||
s.WithOpName("b/y/two").WithControlDependencies({func6}), 2LL);
|
||||
auto b_y_scale = ops::Cast(s.WithOpName("b/y/scale"), b_y_two, DT_FLOAT);
|
||||
auto b_y_y = ops::Mul(s.WithOpName("b/y/y"), func7, b_y_scale);
|
||||
auto func8 = ops::Identity(s.WithOpName("Func/_8"), b_y_y);
|
||||
|
||||
auto func2 = ops::Identity(s.WithOpName("Func/_2"), func8);
|
||||
auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
|
||||
|
||||
GraphDef expected;
|
||||
TF_ASSERT_OK(s.ToGraphDef(&expected));
|
||||
|
||||
GraphDef actual;
|
||||
g->ToGraphDef(&actual);
|
||||
TF_EXPECT_GRAPH_EQ(expected, actual);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
|
||||
Init({test::function::XTimesTwo(), test::function::XTimesFour(),
|
||||
test::function::XTimes16()});
|
||||
|
@ -567,15 +567,14 @@ int64 MinSystemMemory(int64 available_memory) {
  // We use the following heuristic for now:
  //
  // If the available_memory is < 2GiB, we allocate 200MiB to system memory.
  // Otherwise, allocate 300MiB to system memory.
  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
  //
  // In the future we could be more sophisticated by using a table of
  // devices.
  // In the future we could be more sophisticated by using a table of devices.
  if (available_memory < (1LL << 31)) {
    // 200MiB
    return 209715200LL;
  } else {
    // max(300 MiB, 0.95 * available_memory)
    // max(300 MiB, 0.05 * available_memory)
    return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
  }
}
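A small Python rendering of the revised heuristic, just to make the numbers concrete; the authoritative logic is the C++ above.

```python
def min_system_memory(available_memory):
  # Mirrors the heuristic above.
  if available_memory < (1 << 31):              # under 2 GiB
    return 200 * 1024 * 1024                    # reserve 200 MiB
  return max(300 * 1024 * 1024, int(available_memory * 0.05))

# For a 12 GiB device, the 5% term wins over the 300 MiB floor:
print(min_system_memory(12 << 30))  # 644245094 bytes, roughly 614 MiB
```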
@ -60,6 +60,7 @@ cc_library(
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
"//tensorflow/core/grappler:utils",
|
||||
"//tensorflow/core/grappler/costs:utils",
|
||||
"//tensorflow/core/kernels:ops_util",
|
||||
],
|
||||
)
|
||||
|
@ -56,5 +56,15 @@ void Cluster::DisableDetailedStats(bool disable) {
|
||||
}
|
||||
}
|
||||
|
||||
const std::vector<string> Cluster::GetDeviceNames() const {
|
||||
std::vector<string> device_names;
|
||||
device_names.reserve(devices_.size());
|
||||
for (const auto& device : devices_) {
|
||||
device_names.push_back(device.first);
|
||||
}
|
||||
std::sort(device_names.begin(), device_names.end());
|
||||
return device_names;
|
||||
}
|
||||
|
||||
} // end namespace grappler
|
||||
} // end namespace tensorflow
|
||||
|
@ -17,13 +17,14 @@ limitations under the License.
|
||||
#define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/core/framework/device_attributes.pb.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/grappler/grappler_item.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/protobuf/device_properties.pb.h"
|
||||
#include "tensorflow/core/public/session_options.h"
|
||||
|
||||
namespace tensorflow {
|
||||
@ -62,18 +63,14 @@ class Cluster {
|
||||
|
||||
// Return the list of TensorFlow devices that are available to execute a
|
||||
// graph. This is empty until provision() is called.
|
||||
const std::vector<DeviceAttributes>& GetDevices() const { return devices_; }
|
||||
|
||||
// Convenience method that returns the set of device names.
|
||||
const std::vector<string> GetDeviceNames() const {
|
||||
std::vector<string> device_names;
|
||||
device_names.reserve(devices_.size());
|
||||
for (const auto& device : devices_) {
|
||||
device_names.push_back(device.name());
|
||||
}
|
||||
return device_names;
|
||||
const std::unordered_map<string, DeviceProperties>& GetDevices() const {
|
||||
return devices_;
|
||||
}
|
||||
|
||||
// Convenience method that returns the set of device names. These names are
|
||||
// sorted alphabetically.
|
||||
const std::vector<string> GetDeviceNames() const;
|
||||
|
||||
// Prepare the session to run the specified grappler item. This includes
|
||||
// initializing all the model variables.
|
||||
virtual Status Initialize(const GrapplerItem& item) = 0;
|
||||
@ -85,7 +82,7 @@ class Cluster {
|
||||
RunMetadata* metadata) = 0;
|
||||
|
||||
protected:
|
||||
std::vector<DeviceAttributes> devices_;
|
||||
std::unordered_map<string, DeviceProperties> devices_;
|
||||
const int timeout_s_;
|
||||
SessionOptions options_;
|
||||
RunOptions run_options_;
|
||||
|
@ -19,6 +19,7 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/cc/training/queue_runner.h"
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/grappler/costs/utils.h"
|
||||
#include "tensorflow/core/grappler/utils.h"
|
||||
#include "tensorflow/core/kernels/ops_util.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
@ -66,16 +67,12 @@ Status SingleMachine::Provision() {
|
||||
return status;
|
||||
}
|
||||
|
||||
DeviceAttributes attr;
|
||||
attr.set_name("/job:localhost/replica:0/task:0/cpu:0");
|
||||
attr.set_device_type("CPU");
|
||||
devices_.push_back(attr);
|
||||
DeviceProperties attr = GetLocalCPUInfo();
|
||||
devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
|
||||
|
||||
for (int i = 0; i < num_gpus_; ++i) {
|
||||
DeviceAttributes attr;
|
||||
attr.set_name(strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i));
|
||||
attr.set_device_type("GPU");
|
||||
devices_.push_back(attr);
|
||||
devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
|
||||
GetLocalGPUInfo(i);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
@ -25,7 +25,9 @@ tf_proto_library(
|
||||
name = "op_performance_data",
|
||||
srcs = ["op_performance_data.proto"],
|
||||
cc_api_version = 2,
|
||||
protodeps = ["//tensorflow/core:protos_all"],
|
||||
protodeps = [
|
||||
"//tensorflow/core:protos_all",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
@ -141,10 +143,10 @@ cc_library(
|
||||
hdrs = ["virtual_placer.h"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":op_performance_data_cc",
|
||||
":utils",
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:framework_lite",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
"//tensorflow/core/grappler:devices",
|
||||
"//tensorflow/core/grappler/clusters:cluster",
|
||||
|
@ -73,7 +73,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
|
||||
std::vector<OpInfo::TensorProperties> inputs =
|
||||
properties.GetInputProperties(node->name());
|
||||
|
||||
OpInfo::DeviceProperties device = placer.get_device(*node);
|
||||
DeviceProperties device = placer.get_device(*node);
|
||||
OpInfo op_info;
|
||||
op_info.set_op(node->op());
|
||||
*op_info.mutable_attr() = node->attr();
|
||||
|
@ -69,7 +69,7 @@ Costs OpLevelCostEstimator::PredictCosts(const OpInfo& op_features) const {
|
||||
}
|
||||
|
||||
std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
|
||||
const OpInfo::DeviceProperties& device) const {
|
||||
const DeviceProperties& device) const {
|
||||
double gflops = -1;
|
||||
double bandwidth = -1;
|
||||
if (device.bandwidth() > 0) {
|
||||
@ -77,7 +77,7 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
|
||||
}
|
||||
|
||||
if (device.type() == "CPU") {
|
||||
const OpInfo::DeviceProperties local_cpu = GetLocalCPUInfo();
|
||||
const DeviceProperties local_cpu = GetLocalCPUInfo();
|
||||
// Check if vector instructions are available, and refine performance
|
||||
// prediction based on this.
|
||||
// Frequencies are stored in MHz in the DeviceProperties.
|
||||
@ -90,7 +90,7 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
|
||||
}
|
||||
}
|
||||
} else if (device.type() == "GPU") {
|
||||
const OpInfo::DeviceProperties local_gpu = GetLocalGPUInfo(0);
|
||||
const DeviceProperties local_gpu = GetLocalGPUInfo(0);
|
||||
const string architecture = local_gpu.environment().at("architecture");
|
||||
int cores_per_multiprocessor;
|
||||
if (architecture < "3") {
|
||||
|
@ -40,7 +40,7 @@ class OpLevelCostEstimator {
|
||||
// executed per second) and memory bandwidth (in GigaBytes/second) for the
|
||||
// specified device.
|
||||
virtual std::pair<double, double> GetDeviceInfo(
|
||||
const OpInfo::DeviceProperties& device) const;
|
||||
const DeviceProperties& device) const;
|
||||
|
||||
// For operations for which we haven't yet built estimates, returns a dummy
|
||||
// value based on input size.
|
||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/framework/tensor_shape.pb.h"
|
||||
#include "tensorflow/core/framework/types.h"
|
||||
#include "tensorflow/core/platform/test.h"
|
||||
#include "tensorflow/core/protobuf/device_properties.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace grappler {
|
||||
|
@ -22,6 +22,7 @@ import "tensorflow/core/framework/tensor.proto";
|
||||
import "tensorflow/core/framework/tensor_shape.proto";
|
||||
import "tensorflow/core/framework/types.proto";
|
||||
import "tensorflow/core/framework/attr_value.proto";
|
||||
import "tensorflow/core/protobuf/device_properties.proto";
|
||||
|
||||
// Description of an operation as well as the parameters expected to impact its
|
||||
// performance.
|
||||
@ -41,36 +42,6 @@ message OpInfo {
|
||||
repeated TensorProperties inputs = 3;
|
||||
|
||||
// Device on which the operation is run.
|
||||
message DeviceProperties {
|
||||
// Device type (CPU, GPU, ...)
|
||||
string type = 1;
|
||||
// Vendor (Intel, nvidia, ...)
|
||||
string vendor = 2;
|
||||
// Model (Haswell, K40, ...)
|
||||
string model = 3;
|
||||
// Core Frequency in Mhz
|
||||
int64 frequency = 4;
|
||||
// Number of cores
|
||||
int64 num_cores = 5;
|
||||
// Version of the tools and libraries used with this device (e.g. gcc 4.9,
|
||||
// cudnn 5.1)
|
||||
map<string, string> environment = 6;
|
||||
// Number of registers per core.
|
||||
int64 num_registers = 7;
|
||||
// L1 cache size in bytes
|
||||
int64 l1_cache_size = 8;
|
||||
// L2 cache size in bytes
|
||||
int64 l2_cache_size = 9;
|
||||
// L3 cache size in bytes
|
||||
int64 l3_cache_size = 10;
|
||||
// Shared memory size per multiprocessor in bytes. This field is
|
||||
// applicable to GPUs only.
|
||||
int64 shared_memory_size_per_multiprocessor = 11;
|
||||
// Memory size in bytes
|
||||
int64 memory_size = 12;
|
||||
// Memory bandwidth in KB/s
|
||||
int64 bandwidth = 13;
|
||||
}
|
||||
DeviceProperties device = 4;
|
||||
}
|
||||
|
||||
|
@ -125,7 +125,7 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
|
||||
return inputs;
|
||||
}
|
||||
|
||||
OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
|
||||
DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
|
||||
DeviceNameUtils::ParsedName parsed;
|
||||
if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
|
||||
if (parsed.type == "GPU") {
|
||||
@ -134,13 +134,13 @@ OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
|
||||
return GetLocalCPUInfo();
|
||||
}
|
||||
}
|
||||
OpInfo::DeviceProperties device;
|
||||
DeviceProperties device;
|
||||
device.set_type("UNKNOWN");
|
||||
return device;
|
||||
}
|
||||
|
||||
OpInfo::DeviceProperties GetLocalCPUInfo() {
|
||||
OpInfo::DeviceProperties device;
|
||||
DeviceProperties GetLocalCPUInfo() {
|
||||
DeviceProperties device;
|
||||
device.set_type("CPU");
|
||||
|
||||
device.set_vendor(port::CPUVendorIDString());
|
||||
@ -165,8 +165,8 @@ OpInfo::DeviceProperties GetLocalCPUInfo() {
|
||||
return device;
|
||||
}
|
||||
|
||||
OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id) {
|
||||
OpInfo::DeviceProperties device;
|
||||
DeviceProperties GetLocalGPUInfo(int gpu_id) {
|
||||
DeviceProperties device;
|
||||
device.set_type("GPU");
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
|
@ -25,6 +25,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/graph/types.h"
|
||||
#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/protobuf/device_properties.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace grappler {
|
||||
@ -40,14 +41,14 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
|
||||
const std::unordered_map<string, const NodeDef*>& name_to_node);
|
||||
|
||||
// Returns the DeviceProperties of the device on which 'node' runs.
|
||||
OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
|
||||
DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
|
||||
|
||||
// Returns the DeviceProperties of the CPU on which grappler is running.
|
||||
OpInfo::DeviceProperties GetLocalCPUInfo();
|
||||
DeviceProperties GetLocalCPUInfo();
|
||||
|
||||
// Returns the DeviceProperties for the specified GPU attached to the server on
|
||||
// which grappler is running.
|
||||
OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id);
|
||||
DeviceProperties GetLocalGPUInfo(int gpu_id);
|
||||
|
||||
} // end namespace grappler
|
||||
} // end namespace tensorflow
|
||||
|
@ -18,35 +18,48 @@ limitations under the License.
|
||||
#include "tensorflow/core/grappler/clusters/cluster.h"
|
||||
#include "tensorflow/core/grappler/costs/utils.h"
|
||||
#include "tensorflow/core/grappler/devices.h"
|
||||
#include "tensorflow/core/lib/strings/str_util.h"
|
||||
#include "tensorflow/core/util/device_name_utils.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace grappler {
|
||||
|
||||
VirtualPlacer::VirtualPlacer(Cluster* cluster) : has_gpu_(false) {
|
||||
devices_["CPU"] = GetLocalCPUInfo();
|
||||
if (GetNumAvailableGPUs() > 0) {
|
||||
has_gpu_ = true;
|
||||
devices_["GPU"] = GetLocalGPUInfo(0);
|
||||
devices_ = cluster->GetDevices();
|
||||
for (const auto& device : cluster->GetDevices()) {
|
||||
if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
|
||||
has_gpu_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
unknown_device_.set_type("UNKNOWN");
|
||||
}
|
||||
|
||||
const OpInfo::DeviceProperties& VirtualPlacer::get_device(
|
||||
const NodeDef& node) const {
|
||||
string device_type;
|
||||
const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
|
||||
DeviceNameUtils::ParsedName parsed;
|
||||
if (!node.device().empty() &&
|
||||
DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
|
||||
device_type = parsed.type;
|
||||
} else {
|
||||
if (has_gpu_) {
|
||||
device_type = "GPU";
|
||||
} else {
|
||||
device_type = "CPU";
|
||||
if (!node.device().empty()) {
|
||||
auto it = devices_.find(node.device());
|
||||
if (it != devices_.end()) {
|
||||
return it->second;
|
||||
}
|
||||
if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
|
||||
string device_name =
|
||||
strings::StrCat("/job:localhost/replica:0/task:0/",
|
||||
str_util::Lowercase(parsed.type), ":", parsed.id);
|
||||
it = devices_.find(device_name);
|
||||
if (it != devices_.end()) {
|
||||
return it->second;
|
||||
}
|
||||
}
|
||||
return unknown_device_;
|
||||
}
|
||||
auto it = devices_.find(device_type);
|
||||
string device;
|
||||
if (has_gpu_) {
|
||||
device = "/job:localhost/replica:0/task:0/gpu:0";
|
||||
} else {
|
||||
device = "/job:localhost/replica:0/task:0/cpu:0";
|
||||
}
|
||||
auto it = devices_.find(device);
|
||||
if (it == devices_.end()) {
|
||||
return unknown_device_;
|
||||
}
|
||||
|
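The lookup order above (exact device name, then a canonicalized local name, then a default device, otherwise UNKNOWN) can be sketched in Python. The regex-based parsing below is only a stand-in for DeviceNameUtils::ParseFullName; it is an assumption, not the real helper.

```python
import re

UNKNOWN_DEVICE = {'type': 'UNKNOWN'}

def get_device(node_device, devices, has_gpu):
  """Mirrors the lookup order of VirtualPlacer::get_device (a sketch)."""
  if node_device:
    if node_device in devices:          # exact match on the assigned device
      return devices[node_device]
    # Canonicalize a partially specified name to the local job/replica/task.
    m = re.search(r'(cpu|gpu):(\d+)', node_device.lower())
    if m:
      canonical = '/job:localhost/replica:0/task:0/%s:%s' % m.groups()
      if canonical in devices:
        return devices[canonical]
    return UNKNOWN_DEVICE
  # No device requested: fall back to gpu:0 if the cluster has one, else cpu:0.
  default = '/job:localhost/replica:0/task:0/%s:0' % ('gpu' if has_gpu else 'cpu')
  return devices.get(default, UNKNOWN_DEVICE)

devices = {'/job:localhost/replica:0/task:0/cpu:0': {'type': 'CPU'}}
print(get_device('/cpu:0', devices, has_gpu=False))  # -> {'type': 'CPU'}
```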
@ -17,8 +17,8 @@ limitations under the License.
|
||||
#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
|
||||
|
||||
#include <unordered_map>
|
||||
#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/protobuf/device_properties.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
class NodeDef;
|
||||
@ -31,12 +31,12 @@ class VirtualPlacer {
|
||||
public:
|
||||
VirtualPlacer(Cluster* cluster);
|
||||
|
||||
const OpInfo::DeviceProperties& get_device(const NodeDef& node) const;
|
||||
const DeviceProperties& get_device(const NodeDef& node) const;
|
||||
|
||||
private:
|
||||
std::unordered_map<string, OpInfo::DeviceProperties> devices_;
|
||||
std::unordered_map<string, DeviceProperties> devices_;
|
||||
bool has_gpu_;
|
||||
OpInfo::DeviceProperties unknown_device_;
|
||||
DeviceProperties unknown_device_;
|
||||
};
|
||||
|
||||
} // namespace grappler
|
||||
|
@ -38,9 +38,9 @@ using Eigen::GpuDevice;
|
||||
// in NHWC format.
|
||||
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
|
||||
int kKnownDepthMultiplier>
|
||||
__global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
|
||||
const T* input, const T* filter,
|
||||
T* output, int num_outputs) {
|
||||
__global__ void __launch_bounds__(1024, 2)
|
||||
DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
|
||||
const T* filter, T* output, int num_outputs) {
|
||||
const int in_rows = args.in_rows;
|
||||
const int in_cols = args.in_cols;
|
||||
const int in_depth = args.in_depth;
|
||||
@ -120,9 +120,9 @@ __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
|
||||
// in NCHW format.
|
||||
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
|
||||
int kKnownDepthMultiplier>
|
||||
__global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
|
||||
const T* input, const T* filter,
|
||||
T* output, int num_outputs) {
|
||||
__global__ void __launch_bounds__(1024, 2)
|
||||
DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
|
||||
const T* filter, T* output, int num_outputs) {
|
||||
const int in_rows = args.in_rows;
|
||||
const int in_cols = args.in_cols;
|
||||
const int in_depth = args.in_depth;
|
||||
@ -250,17 +250,34 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
|
||||
TensorFormat data_format) {
|
||||
const int num_outputs =
|
||||
args.batch * args.out_rows * args.out_cols * args.out_depth;
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d);
|
||||
// The compile-time constant version runs faster with a single block.
|
||||
const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
|
||||
kKnownDepthMultiplier < 0 ||
|
||||
args.out_rows * args.out_cols <= 256
|
||||
? std::numeric_limits<int>::max()
|
||||
: d.getNumCudaMultiProcessors();
|
||||
if (data_format == FORMAT_NHWC) {
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(
|
||||
num_outputs, d,
|
||||
DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
|
||||
kKnownDepthMultiplier>,
|
||||
0);
|
||||
DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
|
||||
kKnownDepthMultiplier>
|
||||
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
args, input, filter, output, num_outputs);
|
||||
<<<std::min(max_block_count, config.block_count),
|
||||
config.thread_per_block, 0, d.stream()>>>(args, input, filter,
|
||||
output, num_outputs);
|
||||
} else if (data_format == FORMAT_NCHW) {
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(
|
||||
num_outputs, d,
|
||||
DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
|
||||
kKnownDepthMultiplier>,
|
||||
0);
|
||||
DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
|
||||
kKnownDepthMultiplier>
|
||||
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
args, input, filter, output, num_outputs);
|
||||
<<<std::min(max_block_count, config.block_count),
|
||||
config.thread_per_block, 0, d.stream()>>>(args, input, filter,
|
||||
output, num_outputs);
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
@ -288,9 +305,11 @@ template struct DepthwiseConv2dGPULaunch<double>;
|
||||
// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
|
||||
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
|
||||
int kKnownDepthMultiplier>
|
||||
__global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
|
||||
const DepthwiseArgs args, const T* out_backprop, const T* filter,
|
||||
T* in_backprop, int num_in_backprop) {
|
||||
__global__ void __launch_bounds__(640, 2)
|
||||
DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
|
||||
const T* out_backprop,
|
||||
const T* filter, T* in_backprop,
|
||||
int num_in_backprop) {
|
||||
const int in_rows = args.in_rows;
|
||||
const int in_cols = args.in_cols;
|
||||
const int in_depth = args.in_depth;
|
||||
@ -350,7 +369,7 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
|
||||
|
||||
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
|
||||
int kKnownDepthMultiplier>
|
||||
__global__ void __launch_bounds__(1024)
|
||||
__global__ void __launch_bounds__(640, 2)
|
||||
DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
|
||||
const T* out_backprop,
|
||||
const T* filter, T* in_backprop,
|
||||
@ -428,17 +447,22 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
|
||||
TensorFormat data_format) {
|
||||
const int num_in_backprop =
|
||||
args.batch * args.in_rows * args.in_cols * args.in_depth;
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d);
|
||||
// Increase block count for when there are more warps/SM than threads/SM.
|
||||
// TODO(csigg): this is pretty arbitrary and should be generalized using
|
||||
// cudaOccupancyMaxPotentialBlockSize().
|
||||
config.block_count *= 4;
|
||||
if (data_format == FORMAT_NHWC) {
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(
|
||||
num_in_backprop, d,
|
||||
DepthwiseConv2dBackpropInputGPUKernelNHWC<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
|
||||
0);
|
||||
DepthwiseConv2dBackpropInputGPUKernelNHWC<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
|
||||
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
args, out_backprop, filter, in_backprop, num_in_backprop);
|
||||
} else if (data_format == FORMAT_NCHW) {
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(
|
||||
num_in_backprop, d,
|
||||
DepthwiseConv2dBackpropInputGPUKernelNCHW<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
|
||||
0);
|
||||
DepthwiseConv2dBackpropInputGPUKernelNCHW<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
|
||||
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
@ -475,9 +499,12 @@ template struct DepthwiseConv2dBackpropInputGPULaunch<double>;
|
||||
// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
|
||||
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
|
||||
int kKnownDepthMultiplier>
|
||||
__global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
|
||||
const DepthwiseArgs args, const T* out_backprop, const T* input,
|
||||
T* filter_backprop, int num_out_backprop) {
|
||||
__global__ void __launch_bounds__(640, 2)
|
||||
DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
|
||||
const T* out_backprop,
|
||||
const T* input,
|
||||
T* filter_backprop,
|
||||
int num_out_backprop) {
|
||||
const int in_rows = args.in_rows;
|
||||
const int in_cols = args.in_cols;
|
||||
const int in_depth = args.in_depth;
|
||||
@ -566,9 +593,12 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
|
||||
// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
|
||||
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
|
||||
int kKnownDepthMultiplier>
|
||||
__global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
|
||||
const DepthwiseArgs args, const T* out_backprop, const T* input,
|
||||
T* filter_backprop, int num_out_backprop) {
|
||||
__global__ void __launch_bounds__(640, 2)
|
||||
DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
|
||||
const T* out_backprop,
|
||||
const T* input,
|
||||
T* filter_backprop,
|
||||
int num_out_backprop) {
|
||||
const int in_rows = args.in_rows;
|
||||
const int in_cols = args.in_cols;
|
||||
const int in_depth = args.in_depth;
|
||||
@ -669,13 +699,22 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
|
||||
TensorFormat data_format) {
|
||||
const int num_out_backprop =
|
||||
args.batch * args.out_rows * args.out_cols * args.out_depth;
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d);
|
||||
if (data_format == FORMAT_NHWC) {
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(
|
||||
num_out_backprop, d,
|
||||
DepthwiseConv2dBackpropFilterGPUKernelNHWC<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
|
||||
0);
|
||||
DepthwiseConv2dBackpropFilterGPUKernelNHWC<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
|
||||
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
args, out_backprop, input, filter_backprop, num_out_backprop);
|
||||
} else if (data_format == FORMAT_NCHW) {
|
||||
CudaLaunchConfig config = GetCudaLaunchConfig(
|
||||
num_out_backprop, d,
|
||||
DepthwiseConv2dBackpropFilterGPUKernelNCHW<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
|
||||
0);
|
||||
DepthwiseConv2dBackpropFilterGPUKernelNCHW<
|
||||
T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
|
||||
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
|
||||
|
@ -87,6 +87,12 @@ class SplitVOpBase : public OpKernel {
|
||||
// Special case 1: num_split == 1. Nothing to do.
|
||||
if (num_split == 1) {
|
||||
context->set_output(0, context->input(0));
|
||||
OP_REQUIRES(
|
||||
context, (*split_sizes_vec)[0] == input_size_split_dim,
|
||||
errors::InvalidArgument("If there is only one output, it must have "
|
||||
"the same size as the input. Input size: ",
|
||||
input_size_split_dim,
|
||||
" output size: ", (*split_sizes_vec)[0]));
|
||||
*done = true;
|
||||
return;
|
||||
}
|
||||
|
@ -127,6 +127,16 @@ class TemporaryVariableOp : public OpKernel {
|
||||
OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
|
||||
var_name_, tmp_var));
|
||||
context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
|
||||
if (context->track_allocations()) {
|
||||
AllocatorAttributes attr;
|
||||
if (context->allocate_on_host(attr)) {
|
||||
context->record_host_persistent_memory_allocation(
|
||||
tmp_var->val.AllocatedBytes());
|
||||
} else {
|
||||
context->record_device_persistent_memory_allocation(
|
||||
tmp_var->val.AllocatedBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -518,7 +518,17 @@ REGISTER_OP("SplitV")
|
||||
} else if (rank == 0) {
|
||||
// Throw error if input is a scalar.
|
||||
return errors::InvalidArgument("Can't split scalars");
|
||||
} else if (size_splits == nullptr || !c->ValueKnown(split_dimension)) {
|
||||
} else if (size_splits == nullptr && c->ValueKnown(split_dimension)) {
|
||||
// If split dimension is known, but the sizes are unknown, then
|
||||
// only the split dimension is unknown
|
||||
output_shape = input;
|
||||
TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
|
||||
c->Value(split_dimension),
|
||||
c->UnknownDim(), &output_shape));
|
||||
for (int i = 0; i < num_outputs; ++i) {
|
||||
c->set_output(i, output_shape);
|
||||
}
|
||||
} else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
|
||||
// If split dimension or tensor containing the split sizes is unknown,
|
||||
// then return unknown shapes of same rank as input.
|
||||
output_shape = c->UnknownShapeOfRank(rank);
|
||||
@ -540,12 +550,37 @@ REGISTER_OP("SplitV")
|
||||
return errors::InvalidArgument(
|
||||
"Length of size_splits should be equal to num_outputs");
|
||||
}
|
||||
int64_t cumsum_outputs = 0;
|
||||
bool has_neg_one = false;
|
||||
// If the sizes of the splits are known, then
|
||||
// make sure that the sizes add up to the expected
|
||||
// dimension size, with the possibility of a -1.
|
||||
// Specify the full output shapes.
|
||||
for (int i = 0; i < num_outputs; ++i) {
|
||||
output_shape = c->UnknownShapeOfRank(rank);
|
||||
TF_RETURN_IF_ERROR(c->ReplaceDim(input, split_dim,
|
||||
c->MakeDim(data[i]), &output_shape));
|
||||
c->set_output(i, output_shape);
|
||||
if (data[i] == -1 && !has_neg_one)
|
||||
has_neg_one = true;
|
||||
else if (data[i] == -1 && has_neg_one)
|
||||
return errors::InvalidArgument("size_splits can only have one -1");
|
||||
else
|
||||
cumsum_outputs += data[i];
|
||||
}
|
||||
auto split_dim_size = c->Value(c->Dim(input, split_dim));
|
||||
if (has_neg_one) {
|
||||
if (cumsum_outputs < split_dim_size)
|
||||
cumsum_outputs = split_dim_size;
|
||||
else
|
||||
cumsum_outputs = split_dim_size + 1;
|
||||
}
|
||||
if (cumsum_outputs != c->Value(c->Dim(input, split_dim)))
|
||||
return errors::InvalidArgument(
|
||||
"Sum of output sizes must match "
|
||||
"the size of the original Tensor along the split dimension "
|
||||
"or the sum of the positive sizes must be less if it contains a "
|
||||
"-1");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
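A Python sketch of the size_splits validation this shape function now performs: the entries must sum to the split dimension, and at most one entry may be -1, in which case its size is inferred.

```python
def validate_size_splits(size_splits, split_dim_size):
  """Checks size_splits the way the SplitV shape function above does."""
  neg_ones = [i for i, s in enumerate(size_splits) if s == -1]
  if len(neg_ones) > 1:
    raise ValueError('size_splits can only have one -1')
  total = sum(s for s in size_splits if s != -1)
  if neg_ones:
    if total >= split_dim_size:
      raise ValueError('positive sizes must sum to less than the split dim')
    return [split_dim_size - total if s == -1 else s for s in size_splits]
  if total != split_dim_size:
    raise ValueError('sum of output sizes must match the split dimension')
  return list(size_splits)

print(validate_size_splits([2, -1, 3], 10))  # -> [2, 5, 3]
```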
51  tensorflow/core/protobuf/device_properties.proto  Normal file
@ -0,0 +1,51 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "DevicePropertiesProtos";

message DeviceProperties {
  // Device type (CPU, GPU, ...)
  string type = 1;
  // Vendor (Intel, nvidia, ...)
  string vendor = 2;
  // Model (Haswell, K40, ...)
  string model = 3;
  // Core Frequency in Mhz
  int64 frequency = 4;
  // Number of cores
  int64 num_cores = 5;
  // Version of the tools and libraries used with this device (e.g. gcc 4.9,
  // cudnn 5.1)
  map<string, string> environment = 6;
  // Number of registers per core.
  int64 num_registers = 7;
  // L1 cache size in bytes
  int64 l1_cache_size = 8;
  // L2 cache size in bytes
  int64 l2_cache_size = 9;
  // L3 cache size in bytes
  int64 l3_cache_size = 10;
  // Shared memory size per multiprocessor in bytes. This field is
  // applicable to GPUs only.
  int64 shared_memory_size_per_multiprocessor = 11;
  // Memory size in bytes
  int64 memory_size = 12;
  // Memory bandwidth in KB/s
  int64 bandwidth = 13;
}
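A hedged example of filling in the new message from Python. The `device_properties_pb2` module name is inferred from the proto path and is an assumption; the field names and units come from the definition above, and the values are illustrative only. The 'architecture' environment key is the one the grappler cost estimator reads for GPUs.

```python
from tensorflow.core.protobuf import device_properties_pb2

gpu = device_properties_pb2.DeviceProperties()
gpu.type = 'GPU'
gpu.vendor = 'nvidia'
gpu.model = 'K40'
gpu.frequency = 745                 # MHz, per the field comment above
gpu.num_cores = 15                  # streaming multiprocessors
gpu.memory_size = 12 << 30          # bytes
gpu.bandwidth = 288 * 1000 * 1000   # KB/s, roughly 288 GB/s
gpu.environment['architecture'] = '3.5'
```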
@ -63,6 +63,28 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
  return config;
}

// Calculate the Cuda launch config we should use for a kernel launch. This
// variant takes the resource limits of func into account to maximize occupancy.
template <typename DeviceFunc>
inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
                                            const GPUDevice& d, DeviceFunc func,
                                            size_t dynamic_shared_memory_size) {
  int block_count = 0;
  int thread_per_block = 0;
  cudaOccupancyMaxPotentialBlockSize(&block_count, &thread_per_block, func,
                                     dynamic_shared_memory_size,
                                     work_element_count);
  block_count =
      std::min(block_count,
               (work_element_count + thread_per_block - 1) / thread_per_block);

  CudaLaunchConfig config;
  config.virtual_thread_count = work_element_count;
  config.thread_per_block = thread_per_block;
  config.block_count = block_count;
  return config;
}

struct Cuda2DLaunchConfig {
  dim3 virtual_thread_count;
  dim3 thread_per_block;
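The clamping the new overload applies can be stated in a few lines of Python: the block count suggested by the occupancy query is capped at the number of blocks actually needed to cover the work elements.

```python
def launch_config(work_element_count, occupancy_block_count, thread_per_block):
  """Caps the occupancy-suggested block count at the blocks actually needed."""
  blocks_needed = (work_element_count + thread_per_block - 1) // thread_per_block
  return min(occupancy_block_count, blocks_needed), thread_per_block

# 10000 elements with 1024-thread blocks only needs 10 blocks, even if the
# occupancy query would allow 80 resident blocks:
print(launch_config(10000, 80, 1024))  # -> (10, 1024)
```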
1  tensorflow/opensource_only/eigen.threadpool  Normal file
@ -0,0 +1 @@
#include "unsupported/Eigen/CXX11/ThreadPool"
@ -393,6 +393,17 @@ def bucketized_column(source_column, boundaries):
|
||||
`boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
|
||||
`[1., 2.)`, and `[2., +inf)`.
|
||||
|
||||
For example, if the inputs are
|
||||
`boundaries` = [0, 10, 100]
|
||||
input tensor = [[-5, 10000]
|
||||
[150, 10]
|
||||
[5, 100]]
|
||||
|
||||
then the output will be
|
||||
output = [[0, 3]
|
||||
[3, 2]
|
||||
[1, 3]]
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
|
@ -324,6 +324,48 @@ class FunctionTest(test.TestCase):
|
||||
"assertion"):
|
||||
_ = MyFn(100.0).eval()
|
||||
|
||||
def testControlFlowStrictness(self):
|
||||
"""Inlined functions must not execute in a untaken control flow branch."""
|
||||
|
||||
@function.Defun(dtypes.int32)
|
||||
def AssertFail(x):
|
||||
# Assertion that always fails and does not have a data dependency on `x`.
|
||||
assert_false = control_flow_ops.Assert(False, [42])
|
||||
with ops.control_dependencies([assert_false]):
|
||||
return array_ops.identity(x)
|
||||
|
||||
with ops.device("CPU"):
|
||||
pred = array_ops.placeholder(dtypes.bool)
|
||||
x = array_ops.placeholder(dtypes.int32)
|
||||
cond = control_flow_ops.cond(pred, lambda: x + 1, lambda: AssertFail(x))
|
||||
# pylint: disable=unnecessary-lambda
|
||||
loop = control_flow_ops.while_loop(lambda y: pred,
|
||||
lambda y: AssertFail(y), [x])
|
||||
# pylint: enable=unnecessary-lambda
|
||||
|
||||
# Enables inlining.
|
||||
config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
|
||||
optimizer_options=config_pb2.OptimizerOptions(
|
||||
opt_level=config_pb2.OptimizerOptions.L0,
|
||||
do_common_subexpression_elimination=True,
|
||||
do_function_inlining=True,
|
||||
do_constant_folding=True)))
|
||||
|
||||
with session.Session(config=config) as sess:
|
||||
# Since the 'False' branch is not taken, the assertion should not fire.
|
||||
self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))
|
||||
|
||||
# The assertion should still fire if the False branch is taken.
|
||||
with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
|
||||
"assertion"):
|
||||
sess.run(cond, {pred: False, x: 3})
|
||||
|
||||
# Similarly for loops.
|
||||
self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
|
||||
with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
|
||||
"assertion"):
|
||||
sess.run(loop, {pred: True, x: 3})
|
||||
|
||||
def testVar(self):
|
||||
|
||||
@function.Defun(dtypes.float32)
|
||||
|
@ -184,8 +184,11 @@ class BiasAddTest(test.TestCase):
|
||||
if dtype == dtypes.float64:
|
||||
threshold = 1e-10
|
||||
self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
|
||||
self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
|
||||
self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
|
||||
# TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
|
||||
# once we figure out why this check started failing with cuda mavx.
|
||||
if dtype == dtypes.float64 or data_format != "NCHW":
|
||||
self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
|
||||
self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
|
||||
|
||||
def testGradientTensor(self):
|
||||
for (data_format, use_gpu) in GetTestConfigs():
|
||||
|
@ -22,6 +22,7 @@ import numpy as np
|
||||
|
||||
from tensorflow.python.framework import constant_op
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import errors_impl
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import gradients_impl
|
||||
@ -40,6 +41,42 @@ class SplitOpTest(test.TestCase):
|
||||
data -= 1j * data
|
||||
return data
|
||||
|
||||
def testShapeInference(self):
|
||||
model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
|
||||
|
||||
# check that we fail during static shape inference if sizes are known
|
||||
with self.assertRaises(ValueError):
|
||||
# pylint: disable=expression-not-assigned
|
||||
array_ops.split(model_input, [4], axis=1)[0]
|
||||
# pylint: enable=expression-not-assigned
|
||||
|
||||
model_input = array_ops.placeholder(dtypes.float32)
|
||||
inp = np.zeros((1, 10))
|
||||
# check that we still fail at runtime if the shapes were unknown
|
||||
with self.test_session(use_gpu=False) as sess:
|
||||
with self.assertRaises(errors_impl.InvalidArgumentError):
|
||||
sess.run(array_ops.split(model_input, [4]), {model_input: inp})
|
||||
|
||||
# test that we can pass a scalar Tensor as num_splits
|
||||
with self.test_session(use_gpu=False) as sess:
|
||||
result = sess.run(
|
||||
array_ops.split(
|
||||
array_ops.ones([4, 4]),
|
||||
num_or_size_splits=array_ops.ones([2, 2]).get_shape()[1],
|
||||
axis=0))
|
||||
|
||||
self.assertEqual(result[0].shape, (2, 4))
|
||||
self.assertEqual(result[1].shape, (2, 4))
|
||||
|
||||
# test that non-split dimensions remain known, even if we don't know how
|
||||
# the split_dim will be split, but we do know the axis
|
||||
result = array_ops.split(
|
||||
array_ops.ones([5, 2]), array_ops.constant([2, 1, 2]) * 1, axis=0)
|
||||
|
||||
self.assertEqual(result[0].shape[1], 2)
|
||||
self.assertEqual(result[1].shape[1], 2)
|
||||
self.assertEqual(result[2].shape[1], 2)
|
||||
|
||||
def testExplicitNum(self):
|
||||
size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
|
||||
|
||||
|
@ -84,7 +84,6 @@ from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
import six
|
||||
|
||||
from tensorflow.python.framework import common_shapes
|
||||
from tensorflow.python.framework import constant_op
|
||||
@ -1165,13 +1164,14 @@ def sparse_mask(a, mask_indices, name=None):
|
||||
def split(value, num_or_size_splits, axis=0, num=None, name="split"):
|
||||
"""Splits a tensor into sub tensors.
|
||||
|
||||
If `num_or_size_splits` is a scalar, `num_split`, then splits `value` along
|
||||
dimension `axis` into `num_split` smaller tensors.
|
||||
If `num_or_size_splits` is an integer type, `num_split`, then splits `value`
|
||||
along dimension `axis` into `num_split` smaller tensors.
|
||||
Requires that `num_split` evenly divides `value.shape[axis]`.
|
||||
|
||||
If `num_or_size_splits` is a tensor, `size_splits`, then splits `value` into
|
||||
`len(size_splits)` pieces. The shape of the `i`-th piece has the same size as
|
||||
the `value` except along dimension `axis` where the size is `size_splits[i]`.
|
||||
If `num_or_size_splits` is not an integer type, it is presumed to be a Tensor
|
||||
`size_splits`, then splits `value` into `len(size_splits)` pieces. The shape
|
||||
of the `i`-th piece has the same size as the `value` except along dimension
|
||||
`axis` where the size is `size_splits[i]`.
|
||||
|
||||
For example:
|
||||
|
||||
@ -1189,11 +1189,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
|
||||
|
||||
Args:
|
||||
value: The `Tensor` to split.
|
||||
num_or_size_splits: Either an integer indicating the number of splits along
|
||||
split_dim or a 1-D Tensor containing the sizes of each output tensor
|
||||
along split_dim. If an integer then it must evenly divide
|
||||
`value.shape[axis]`; otherwise the sum of sizes along the split
|
||||
dimension must match that of the `value`.
|
||||
num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
|
||||
splits along split_dim or a 1-D integer `Tensor` containing
|
||||
the sizes of each output tensor along split_dim. If a scalar then it must
|
||||
evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
|
||||
split dimension must match that of the `value`.
|
||||
axis: A 0-D `int32` `Tensor`. The dimension along which to split.
|
||||
Must be in the range `[0, rank(value))`. Defaults to 0.
|
||||
num: Optional, used to specify the number of outputs when it cannot be
|
||||
@ -1209,11 +1209,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
|
||||
Raises:
|
||||
ValueError: If `num` is unspecified and cannot be inferred.
|
||||
"""
|
||||
if isinstance(num_or_size_splits, six.integer_types):
|
||||
size_splits = ops.convert_to_tensor(num_or_size_splits)
|
||||
if size_splits.get_shape().ndims == 0 and size_splits.dtype.is_integer:
|
||||
return gen_array_ops._split(
|
||||
split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
|
||||
else:
|
||||
size_splits = ops.convert_to_tensor(num_or_size_splits)
|
||||
if num is None:
|
||||
size_splits_shape = size_splits.get_shape()
|
||||
num = size_splits_shape.dims[0]
|
||||
|
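A short usage sketch of the two forms the docstring describes, using a [5, 30] input; the printed shapes follow from the semantics above.

```python
import tensorflow as tf

value = tf.placeholder(tf.float32, shape=[5, 30])

# Scalar num_or_size_splits: three equal pieces along axis 1.
s0, s1, s2 = tf.split(value, num_or_size_splits=3, axis=1)
print(s0.get_shape())  # (5, 10)

# 1-D num_or_size_splits: explicit sizes along axis 1.
a, b, c = tf.split(value, num_or_size_splits=[4, 15, 11], axis=1)
print(b.get_shape())   # (5, 15)
```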
@ -1622,6 +1622,11 @@ class CondContext(ControlFlowContext):
|
||||
# pylint: enable=protected-access
|
||||
for x in op.outputs:
|
||||
self._values.add(x.name)
|
||||
# pylint: disable=protected-access
|
||||
if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
|
||||
op._add_control_input(self._pivot.op)
|
||||
# pylint: enable=protected-access
|
||||
|
||||
if self._outer_context or not IsLoopExit(op):
|
||||
op.graph.prevent_fetching(op)
|
||||
|
||||
@ -2147,8 +2152,13 @@ class WhileContext(ControlFlowContext):
|
||||
def _MaybeAddControlDependency(self, op):
|
||||
"""Add a control input to the op if it only depends on loop invariants."""
|
||||
def _IsOpFree(op):
|
||||
"""Determines if `op` needs a control dependency."""
|
||||
if op.control_inputs:
|
||||
return False
|
||||
# pylint: disable=protected-access
|
||||
if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
|
||||
return True
|
||||
# pylint: enable=protected-access
|
||||
for x in op.inputs:
|
||||
if not _IsLoopConstantEnter(x.op):
|
||||
return False
|
||||
|
@ -30,6 +30,11 @@ def _IsDirectory(parent, item):
|
||||
return gfile.IsDirectory(os.path.join(parent, item))
|
||||
|
||||
|
||||
def PluginDirectory(logdir, plugin_name):
|
||||
"""Returns the plugin directory for plugin_name."""
|
||||
return os.path.join(logdir, _PLUGINS_DIR, plugin_name)
|
||||
|
||||
|
||||
def ListPlugins(logdir):
|
||||
"""List all the plugins that have registered assets in logdir.
|
||||
|
||||
@ -61,7 +66,7 @@ def ListAssets(logdir, plugin_name):
|
||||
not exist (either because the logdir doesn't exist, or because the plugin
|
||||
didn't register) an empty list is returned.
|
||||
"""
|
||||
plugin_dir = os.path.join(logdir, _PLUGINS_DIR, plugin_name)
|
||||
plugin_dir = PluginDirectory(logdir, plugin_name)
|
||||
if not gfile.IsDirectory(plugin_dir):
|
||||
return []
|
||||
entries = gfile.ListDirectory(plugin_dir)
|
||||
@ -83,7 +88,7 @@ def RetrieveAsset(logdir, plugin_name, asset_name):
|
||||
KeyError: if the asset does not exist.
|
||||
"""
|
||||
|
||||
asset_path = os.path.join(logdir, _PLUGINS_DIR, plugin_name, asset_name)
|
||||
asset_path = os.path.join(PluginDirectory(logdir, plugin_name), asset_name)
|
||||
try:
|
||||
with gfile.Open(asset_path, "r") as f:
|
||||
return f.read()
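For context, the helpers touched here compose as follows. This is a usage sketch only; the import path is an assumption, since the module's package is not shown in this diff (the test below refers to it simply as `plugin_asset_util`):

```python
# Assumed import location; adjust to wherever plugin_asset_util actually lives.
from tensorflow.python.summary import plugin_asset_util

logdir = "/tmp/my_run"   # hypothetical log directory
plugin = "my_plugin"     # hypothetical plugin name

# PluginDirectory() is now the single place that defines the
# "<logdir>/plugins/<plugin_name>" layout used by the callers above.
plugin_dir = plugin_asset_util.PluginDirectory(logdir, plugin)
print(plugin_dir)  # /tmp/my_run/plugins/my_plugin

for name in plugin_asset_util.ListAssets(logdir, plugin):
  contents = plugin_asset_util.RetrieveAsset(logdir, plugin, name)
  print(name, len(contents))
```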
|
||||
|
@ -50,6 +50,11 @@ class PluginGamma(GenericContentPlugin):
|
||||
|
||||
class PluginAssetUtilitiesTest(test.TestCase):
|
||||
|
||||
def testGetPluginDirectory(self):
|
||||
self.assertEqual(
|
||||
os.path.join("logdir", "plugins", "x"),
|
||||
plugin_asset_util.PluginDirectory("logdir", "x"))
|
||||
|
||||
def testNonExistentDirectory(self):
|
||||
tempdir = self.get_temp_dir()
|
||||
fake_dir = os.path.join(tempdir, "nonexistent_dir")
|
||||
|
@ -46,6 +46,7 @@ tensorboard_typescript_genrule(
|
||||
],
|
||||
typings = [
|
||||
"@org_definitelytyped//:d3.d.ts",
|
||||
"@org_definitelytyped//:lodash.d.ts",
|
||||
"//tensorflow/tensorboard/components/vz_sorting:ts_typings",
|
||||
],
|
||||
)
|
||||
|
@ -72,24 +72,31 @@ module Categorizer {
|
||||
if (tags.length === 0) {
|
||||
return [];
|
||||
}
|
||||
let sortedTags = tags.slice().sort(VZ.Sorting.compareTagNames);
|
||||
let categories: Category[] = [];
|
||||
let currentCategory = {
|
||||
name: extractor(sortedTags[0]),
|
||||
tags: [],
|
||||
};
|
||||
sortedTags.forEach((t: string) => {
|
||||
let topLevel = extractor(t);
|
||||
if (currentCategory.name !== topLevel) {
|
||||
categories.push(currentCategory);
|
||||
currentCategory = {
|
||||
|
||||
// Maps between top-level name and category. We use the mapping to avoid
|
||||
// duplicating categories per run.
|
||||
const categoryMapping: {[key: string]: Category} = {};
|
||||
|
||||
tags.forEach((t: string) => {
|
||||
const topLevel = extractor(t);
|
||||
if (!categoryMapping[topLevel]) {
|
||||
const newCategory = {
|
||||
name: topLevel,
|
||||
tags: [],
|
||||
};
|
||||
categoryMapping[topLevel] = newCategory;
|
||||
}
|
||||
currentCategory.tags.push(t);
|
||||
|
||||
categoryMapping[topLevel].tags.push(t);
|
||||
});
|
||||
|
||||
// Sort categories into alphabetical order.
|
||||
const categories =
|
||||
_.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
|
||||
_.forEach(categories, (category) => {
|
||||
// Sort the tags within each category.
|
||||
category.tags.sort(VZ.Sorting.compareTagNames);
|
||||
});
|
||||
categories.push(currentCategory);
|
||||
return categories;
|
||||
};
|
||||
}
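In other words, the categorizer now builds exactly one category per top-level name via a map, instead of the previous sort-then-scan pass, which could split a run into multiple categories when unrelated tags sorted in between. A rough Python sketch of the same grouping idea, for illustration only (the real implementation is the TypeScript above, and it sorts tags with `compareTagNames` rather than plain lexicographic order):

```python
def categorize(tags, extractor):
  """Group tags into one category per extracted top-level name."""
  mapping = {}  # top-level name -> list of tags
  for tag in tags:
    mapping.setdefault(extractor(tag), []).append(tag)
  # Categories in alphabetical order, tags sorted within each category.
  return [{"name": name, "tags": sorted(mapping[name])}
          for name in sorted(mapping)]

print(categorize(["foo/bar", "foo_in_between_run/baz", "foo/quux"],
                 lambda tag: tag.split("/")[0]))
# [{'name': 'foo', 'tags': ['foo/bar', 'foo/quux']},
#  {'name': 'foo_in_between_run', 'tags': ['foo_in_between_run/baz']}]
```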
|
||||
|
@ -62,6 +62,18 @@ module Categorizer {
|
||||
assert.deepEqual(
|
||||
topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
|
||||
});
|
||||
|
||||
it('only create 1 category per run', () => {
|
||||
// TensorBoard separates runs from tags using the / and _ characters
|
||||
// *only* during sorting. The categorizer should group all tags under
|
||||
// their correct categories - and create only 1 category per run.
|
||||
const tags = ['foo/bar', 'foo_in_between_run/baz', 'foo/quux'];
|
||||
const expected = [
|
||||
{name: 'foo', tags: ['foo/bar', 'foo/quux']},
|
||||
{name: 'foo_in_between_run', tags: ['foo_in_between_run/baz']},
|
||||
];
|
||||
assert.deepEqual(topLevelNamespaceCategorizer(tags), expected);
|
||||
});
|
||||
});
|
||||
|
||||
describe('customCategorizer', () => {
|
||||
|
@ -73,24 +73,31 @@ function extractorToCategorizer(extractor: (s: string) => string): Categorizer {
|
||||
if (tags.length === 0) {
|
||||
return [];
|
||||
}
|
||||
let sortedTags = tags.slice().sort(compareTagNames);
|
||||
let categories: Category[] = [];
|
||||
let currentCategory = {
|
||||
name: extractor(sortedTags[0]),
|
||||
tags: [],
|
||||
};
|
||||
sortedTags.forEach((t: string) => {
|
||||
let topLevel = extractor(t);
|
||||
if (currentCategory.name !== topLevel) {
|
||||
categories.push(currentCategory);
|
||||
currentCategory = {
|
||||
|
||||
// Maps between top-level name and category. We use the mapping to avoid
|
||||
// duplicating categories per run.
|
||||
const categoryMapping: {[key: string]: Category} = {};
|
||||
|
||||
tags.forEach((t: string) => {
|
||||
const topLevel = extractor(t);
|
||||
if (!categoryMapping[topLevel]) {
|
||||
const newCategory = {
|
||||
name: topLevel,
|
||||
tags: [],
|
||||
};
|
||||
categoryMapping[topLevel] = newCategory;
|
||||
}
|
||||
currentCategory.tags.push(t);
|
||||
|
||||
categoryMapping[topLevel].tags.push(t);
|
||||
});
|
||||
|
||||
// Sort categories into alphabetical order.
|
||||
const categories =
|
||||
_.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
|
||||
_.forEach(categories, (category) => {
|
||||
// Sort the tags within each category.
|
||||
category.tags.sort(compareTagNames);
|
||||
});
|
||||
categories.push(currentCategory);
|
||||
return categories;
|
||||
};
|
||||
}
|
||||
@ -180,4 +187,4 @@ Polymer({
|
||||
this._setCategories(categories);
|
||||
})
|
||||
},
|
||||
});
|
||||
});
|
||||
|
tensorflow/tensorboard/components/tf_graph_loader/BUILD (new file, 46 lines)
@ -0,0 +1,46 @@
|
||||
package(default_visibility = ["//tensorflow:internal"])
|
||||
|
||||
load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
|
||||
load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
|
||||
load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
webfiles(
|
||||
name = "tf_graph_loader",
|
||||
srcs = [
|
||||
"tf-graph-loader.html",
|
||||
],
|
||||
path = "/tf-graph-loader",
|
||||
deps = [
|
||||
"//tensorflow/tensorboard/components/tf_graph_common",
|
||||
"@org_polymer",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all_files",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["notsan"],
|
||||
)
|
||||
|
||||
################################################################################
|
||||
# MARKED FOR DELETION
|
||||
|
||||
tensorboard_webcomponent_library(
|
||||
name = "legacy",
|
||||
srcs = [
|
||||
"tf-graph-loader.html",
|
||||
],
|
||||
destdir = "tf-graph-loader",
|
||||
deps = [
|
||||
"//tensorflow/tensorboard/components/tf_graph_common:legacy",
|
||||
],
|
||||
)
|
||||
|
||||
# This is needed despite how this component lacks TypeScript files because
|
||||
# components/BUILD seeks a legacy_ts rule in this package.
|
||||
tensorboard_ts_library(
|
||||
name = "legacy_ts",
|
||||
srcs = [],
|
||||
)
|
tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD (new file, 24 lines)
@ -0,0 +1,24 @@
|
||||
package(default_visibility = ["//tensorflow:internal"])
|
||||
|
||||
load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
|
||||
webfiles(
|
||||
name = "demo",
|
||||
srcs = ["index.html"] + glob(["data/**"]),
|
||||
path = "/tf-graph-loader/demo",
|
||||
deps = [
|
||||
"//tensorflow/tensorboard/components/tf_graph_loader",
|
||||
"@org_polymer_iron_demo_helpers",
|
||||
"@org_polymer_paper_styles",
|
||||
"@org_polymer_webcomponentsjs",
|
||||
],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all_files",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["notsan"],
|
||||
)
|
File diff suppressed because it is too large
@ -0,0 +1,78 @@
|
||||
<!doctype html>
|
||||
<!--
|
||||
@license
|
||||
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
|
||||
<link rel="import" href="../tf-graph-loader.html">
|
||||
<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
|
||||
<title>TF Graph Loader Demo</title>
|
||||
</head>
|
||||
<body>
|
||||
<demo-snippet>
|
||||
<template>
|
||||
<dom-module id="tf-graph-loader-demo">
|
||||
<template>
|
||||
<tf-graph-loader id="loader"
|
||||
datasets="[[_datasets]]"
|
||||
selected-dataset="[[_selectedDataset]]"
|
||||
progress="{{_progress}}"></tf-graph-loader>
|
||||
</template>
|
||||
<script>
|
||||
Polymer({
|
||||
is: "tf-graph-loader-demo",
|
||||
properties: {
|
||||
// We tell the graph loader to load a specific pbtxt file.
|
||||
_datasets: {
|
||||
type: Array,
|
||||
value: [{
|
||||
"name": "Graph with XLA Clusters Specified",
|
||||
"path": "data/graph.pbtxt"
|
||||
}],
|
||||
},
|
||||
_selectedDataset: {
|
||||
type: Number,
|
||||
value: 0,
|
||||
},
|
||||
|
||||
// This property will be updated by the graph loader.
|
||||
_progress: {
|
||||
type: Object,
|
||||
},
|
||||
},
|
||||
observers: [
|
||||
'_progressUpdated(_progress)',
|
||||
],
|
||||
_progressUpdated(progress) {
|
||||
// console.log the progress.
|
||||
console.log('Progress updated.', progress);
|
||||
|
||||
// The graph has loaded. console.log it.
|
||||
if (progress.value == 100) {
|
||||
console.log('graph', this.$.loader.outGraph);
|
||||
}
|
||||
},
|
||||
});
|
||||
</script>
|
||||
</dom-module>
|
||||
<!-- The graph loader lacks visual elements. -->
|
||||
<tf-graph-loader-demo></tf-graph-loader-demo>
|
||||
</template>
|
||||
</demo-snippet>
|
||||
</body>
|
||||
</html>
|
@ -16,6 +16,7 @@ limitations under the License.
|
||||
-->
|
||||
|
||||
<link rel="import" href="../polymer/polymer.html">
|
||||
<link rel="import" href="../tf-graph-common/tf-graph-common.html">
|
||||
|
||||
<!--
|
||||
An element which provides a filter parsing for pbtxt to graph output.
|
||||
|
@ -10,12 +10,17 @@ Consultants: Jon Shlens, Pete Warden
|
||||
1. Measure model parameters, float operations, tensor shapes.
|
||||
2. Measure op execution times, requested memory size and device placement.
|
||||
3. Inspect checkpoint tensors' shapes and their values.
|
||||
4. Explore model based on name scope or graph structure.
|
||||
4. 3 ways to view and explore TensorFlow model profiles
|
||||
|
||||
* Organize by Python code call stack.
|
||||
* Organize by TensorFlow operation name scope hierarchies.
|
||||
* Organize by TensorFlow operation inputs/outputs graph.
|
||||
|
||||
5. Selectively grouping/filtering/accounting/ordering ops.
|
||||
|
||||
[Python API Tutorials](#python-api-tutorials): It can be called directly from
|
||||
Python codes. Results are either printed
|
||||
to stdout or dumped to file. tensorflow.tfprof.TFProfNode proto is returned from
|
||||
to stdout or dumped to file. tensorflow.tfprof.TFGraphNodeProto proto is returned from
|
||||
the API to allow users to perform further analysis.
|
||||
|
||||
[CLI Tutorials](#cli-tutorials):
|
||||
@ -33,13 +38,23 @@ tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
|
||||
### Examine the shapes and sizes of all trainable Variables.
|
||||
```python
|
||||
# Print trainable variable parameter statistics to stdout.
|
||||
# By default, statistics are associated with each graph node.
|
||||
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
|
||||
tf.get_default_graph(),
|
||||
tfprof_options=tf.contrib.tfprof.model_analyzer.
|
||||
TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
|
||||
|
||||
# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
|
||||
# of each graph node in tree scructure. Let's print the root below.
|
||||
|
||||
# Set tfprof_cmd='code' to associate statistics with Python codes.
|
||||
opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
|
||||
opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
|
||||
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
|
||||
tf.get_default_graph(),
|
||||
tfprof_cmd='code',
|
||||
tfprof_options=opts)
|
||||
|
||||
# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
|
||||
# Let's print the root below.
|
||||
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
|
||||
```
|
||||
|
||||
@ -84,8 +99,20 @@ Finally, you may run `print_model_analysis` to explore the timing and memory
|
||||
demands of the model.
|
||||
|
||||
``` python
|
||||
# See model_analyzer_test.py for more examples.
|
||||
#
|
||||
# Print to stdout an analysis of the memory usage and the timing information
|
||||
# from running the graph broken down by operations.
|
||||
# broken down by python codes.
|
||||
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
|
||||
opts['show_name_regexes'] = ['.*my_code.py.*']
|
||||
tf.contrib.tfprof.model_analyzer.print_model_analysis(
|
||||
tf.get_default_graph(),
|
||||
run_meta=run_metadata,
|
||||
tfprof_cmd='code',
|
||||
tfprof_options=opts)
|
||||
|
||||
# Print to stdout an analysis of the memory usage and the timing information
|
||||
# broken down by operations.
|
||||
tf.contrib.tfprof.model_analyzer.print_model_analysis(
|
||||
tf.get_default_graph(),
|
||||
run_meta=run_metadata,
|
||||
@ -138,9 +165,9 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
|
||||
--run_meta_path=run_meta \
|
||||
--checkpoint_path=model.ckpt
|
||||
#
|
||||
# tfprof_log is used to define customized op types and float ops.
|
||||
# tfprof_log is used to define customized op types, float ops and code traces.
|
||||
# Use tfprof_logger.write_op_log() to create tfprof_log.
|
||||
# See 11) in Examples section on generating tfprof_log file.
|
||||
# See 12) in Examples section on generating tfprof_log file.
|
||||
bazel-bin/tensorflow/tools/tfprof/tfprof \
|
||||
--graph_path=graph.pbtxt \
|
||||
--run_meta_path=run_meta \
|
||||
@ -174,7 +201,28 @@ tfprof>
|
||||
-dump_to_file
|
||||
```
|
||||
|
||||
3) I want to see the `BatchNorm`'s gamma value in checkpoint.
|
||||
3) I want to see which line of my Python code costs the most time!
|
||||
|
||||
```shell
|
||||
# Requires --graph_path --op_log_path
|
||||
tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
|
||||
_TFProfRoot (0us/22.44ms)
|
||||
model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
|
||||
model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
|
||||
model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
|
||||
model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
|
||||
model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
|
||||
model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
|
||||
model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
|
||||
model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
|
||||
model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
|
||||
...
|
||||
model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
|
||||
model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
|
||||
model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
|
||||
```
|
||||
|
||||
4) I want to see the `BatchNorm`'s gamma value in checkpoint.
|
||||
|
||||
```shell
|
||||
# Requires --graph_path, --checkpoint_path.
|
||||
@ -186,7 +234,7 @@ _TFProfRoot ()
|
||||
[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
|
||||
```
|
||||
|
||||
4) I want to see my checkpoint tensors shape and number of parameters.
|
||||
5) I want to see my checkpoint tensors shape and number of parameters.
|
||||
|
||||
```shell
|
||||
# Requires --graph_path, --checkpoint_path.
|
||||
@ -205,7 +253,7 @@ _TFProfRoot (--/930.58k params)
|
||||
unit_last/final_bn/moving_variance (64, 64/64 params)
|
||||
```
|
||||
|
||||
5) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
|
||||
6) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
|
||||
it depends on take a long time to run. Hint: Use the ‘graph’ command to explore
|
||||
graph dependencies.
|
||||
|
||||
@ -221,7 +269,7 @@ _TFProfRoot (0us/3.61sec)
|
||||
unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
|
||||
```
|
||||
|
||||
6) I want to know the expensive operations during the back propagation.
|
||||
7) I want to know the expensive operations during the back propagation.
|
||||
Hint: TensorFlow prepends ‘gradient’ to your defined name scopes. Use the ‘scope’
|
||||
command to explore based on name scope hierarchies.
|
||||
|
||||
@ -238,7 +286,7 @@ _TFProfRoot (0us/2.29sec)
|
||||
...
|
||||
```
|
||||
|
||||
7) Show the number of float operations in the model.
|
||||
8) Show the number of float operations in the model.
|
||||
Note: float operations calculation depends on
|
||||
1) op.RegisterStatistics. If an op doesn’t
|
||||
have RegisterStatistics defined, its float operations cannot be counted.
|
||||
@ -263,7 +311,7 @@ _TFProfRoot (0/17.63b flops)
|
||||
...
|
||||
```
|
||||
|
||||
8) Show the number of parameters of all `tf.trainable_variables()` in the model.
|
||||
9) Show the number of parameters of all `tf.trainable_variables()` in the model.
|
||||
|
||||
```shell
|
||||
# Requires --graph_path --op_log_path.
|
||||
@ -283,7 +331,7 @@ generated by write_op_log() Python API. write_op_log() help users create some
|
||||
common op types implicitly. Users can define their own op types and log it
|
||||
through the write_op_log() API.
|
||||
|
||||
9) What if I’m lazy and don’t want to define op type? I have given my ops
|
||||
10) What if I’m lazy and don’t want to define op type? I have given my ops
|
||||
well-defined names in my model’s code. And want to use names to select a group
|
||||
of ops. Let’s try it!
|
||||
|
||||
@ -301,7 +349,7 @@ in terminal. Otherwise, tfprof accounts all ops matched by
|
||||
`-account_type_regexes` recursively even if they are hidden due to some
|
||||
options such as -max_depth.
|
||||
|
||||
10) TensorFlow has built-in op types. For example, built-in op type `Variable`
|
||||
11) TensorFlow has built-in op types. For example, built-in op type `Variable`
|
||||
seems to include `Variable's` created by your model. However, be careful when
|
||||
depending on it because TensorFlow creates extra `Variable` ops implicitly and
|
||||
the implicitly created ops can have the same prefix as the `Variable's` you
|
||||
@ -327,7 +375,7 @@ _TFProfRoot (--/930.58k params)
|
||||
```
|
||||
|
||||
|
||||
11) A example of defining extra op type for ops using `OpLog`
|
||||
12) An example of defining an extra op type for ops using `OpLog`
|
||||
|
||||
First, in Python code, create an `OpLog` proto and add op type
|
||||
information to it:
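The OpLog snippet itself falls outside this hunk. For orientation, here is a hedged sketch of what such a construction typically looks like; the proto module path and field names are assumptions, not taken from this diff:

```python
import tensorflow as tf
# Assumed location of the generated proto module for tfprof_log.proto.
from tensorflow.tools.tfprof import tfprof_log_pb2

op_log = tfprof_log_pb2.OpLog()
entry = op_log.log_entries.add()
entry.name = 'pool_mixed_3a'   # name of an op defined in the model
entry.types.append('pool')     # custom op type to account the op under

# Persist the op log so the tfprof CLI can consume it via --op_log_path.
tf.contrib.tfprof.tfprof_logger.write_op_log(
    tf.get_default_graph(), '/tmp/tfprof_logdir', op_log=op_log)
```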
|
||||
|
@ -15,6 +15,7 @@ cc_library(
|
||||
srcs = ["tfprof_stats.cc"],
|
||||
hdrs = ["tfprof_stats.h"],
|
||||
deps = [
|
||||
":tfprof_code",
|
||||
":tfprof_graph",
|
||||
":tfprof_node",
|
||||
":tfprof_options",
|
||||
@ -61,6 +62,27 @@ cc_library(
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "tfprof_code",
|
||||
srcs = ["tfprof_code.cc"],
|
||||
hdrs = ["tfprof_code.h"],
|
||||
deps = [
|
||||
":tfprof_constants",
|
||||
":tfprof_node",
|
||||
":tfprof_options",
|
||||
":tfprof_show_code",
|
||||
":tfprof_tensor",
|
||||
":tfprof_utils",
|
||||
"//tensorflow/c:c_api",
|
||||
"//tensorflow/c:checkpoint_reader",
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
"//tensorflow/core:regexp_internal",
|
||||
"//tensorflow/tools/tfprof:protos_all_cc",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "tfprof_graph",
|
||||
srcs = ["tfprof_graph.cc"],
|
||||
@ -98,6 +120,26 @@ cc_library(
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "tfprof_show_code",
|
||||
srcs = ["tfprof_show_code.cc"],
|
||||
hdrs = ["tfprof_show_code.h"],
|
||||
deps = [
|
||||
":tfprof_constants",
|
||||
":tfprof_node",
|
||||
":tfprof_options",
|
||||
":tfprof_scope",
|
||||
":tfprof_show",
|
||||
":tfprof_tensor",
|
||||
":tfprof_utils",
|
||||
"//tensorflow/c:checkpoint_reader",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
"//tensorflow/core:regexp_internal",
|
||||
"//tensorflow/tools/tfprof:protos_all_cc",
|
||||
],
|
||||
)
|
||||
|
||||
tf_cc_test(
|
||||
name = "tfprof_show_test",
|
||||
srcs = ["tfprof_show_test.cc"],
|
||||
|
@ -40,13 +40,13 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
|
||||
graph_ptr->ParseFromString(*graph);
|
||||
|
||||
std::unique_ptr<RunMetadata> run_meta_ptr;
|
||||
if (run_meta) {
|
||||
if (run_meta && !run_meta->empty()) {
|
||||
run_meta_ptr.reset(new RunMetadata());
|
||||
run_meta_ptr->ParseFromString(*run_meta);
|
||||
}
|
||||
|
||||
std::unique_ptr<OpLog> op_log_ptr;
|
||||
if (op_log) {
|
||||
if (op_log && !op_log->empty()) {
|
||||
op_log_ptr.reset(new OpLog());
|
||||
op_log_ptr->ParseFromString(*op_log);
|
||||
}
|
||||
@ -58,16 +58,27 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
|
||||
|
||||
Options opts = Options::FromProtoStr(*options);
|
||||
|
||||
// TODO(xpan): We should have dump_to_file/print_stdout/etc to control
|
||||
// side-effects independently instead of one controlling the other.
|
||||
if (opts.dump_to_file.empty()) {
|
||||
printf("\n=========================Options=============================\n");
|
||||
printf("%s", opts.ToString().c_str());
|
||||
printf("\n==================Model Analysis Report======================\n");
|
||||
TFProfNode root(tf_stats.PrintGraph(*command, opts));
|
||||
string ret = "";
|
||||
if (*command == kCmds[2]) {
|
||||
ret = tf_stats.PrintCode(opts).SerializeAsString();
|
||||
} else {
|
||||
ret = tf_stats.PrintGraph(*command, opts).SerializeAsString();
|
||||
}
|
||||
printf("\n======================End of Report==========================\n");
|
||||
fflush(stdout);
|
||||
return root.SerializeAsString();
|
||||
return ret;
|
||||
}
|
||||
if (*command == kCmds[2]) {
|
||||
return tf_stats.PrintCode(opts).SerializeAsString();
|
||||
} else {
|
||||
return tf_stats.PrintGraph(*command, opts).SerializeAsString();
|
||||
}
|
||||
return tf_stats.PrintGraph(*command, opts).SerializeAsString();
|
||||
}
|
||||
} // namespace tfprof
|
||||
} // namespace tensorflow
|
||||
|
Some files were not shown because too many files have changed in this diff