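Micro-optimizations of graph construction code.

EdgeSet::insert now copies the inline pointer array into the std::set with a
single range insert instead of a per-element loop, and GraphConstructor's
node-name and prefix tables use gtl::FlatMap / gtl::FlatSet in place of
std::unordered_map / std::unordered_set.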

Before:
Run on *********** (72 X 2993 MHz CPUs); 2018-12-13T16:09:43.471855971-08:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark                 Time(ns)        CPU(ns)     Iterations
----------------------------------------------------------------
BM_GraphCreation/10/2        67142          86268           8252
BM_GraphCreation/64/2       138640         163264           4262
BM_GraphCreation/512/2      801036         837092            838
BM_GraphCreation/4k/2      7670132        7719032             89
BM_GraphCreation/32k/2    87954443       88133128              8
BM_GraphCreation/10/4        85895         106133           6589
BM_GraphCreation/64/4       176924         202943           3445
BM_GraphCreation/512/4     1092235        1124801            620
BM_GraphCreation/4k/4     10167172       10242199             68
BM_GraphCreation/32k/4   116535329      116863022              6
BM_GraphCreation/10/8       128276         152347           4595
BM_GraphCreation/64/8       290808         322147           2167
BM_GraphCreation/512/8     1995712        2040134            349
BM_GraphCreation/4k/8     17648175       17725397             39
BM_GraphCreation/32k/8   201791945      202232200              3
BM_GraphCreation/10/16      212183         240520           2909
BM_GraphCreation/64/16      474982         506036           1000
BM_GraphCreation/512/16    3590180        3641964            195
BM_GraphCreation/4k/16    32178292       32265093             22
BM_GraphCreation/32k/16  359809818      360593206              2

After:
Run on *********** (72 X 2993 MHz CPUs); 2018-12-13T16:48:26.030782518-08:00
CPU: Intel Skylake Xeon with HyperThreading (36 cores) dL1:32KB dL2:1024KB dL3:24MB
Benchmark                 Time(ns)        CPU(ns)     Iterations
----------------------------------------------------------------
BM_GraphCreation/10/2        65638          84729           8276
BM_GraphCreation/64/2       130192         154173           4579
BM_GraphCreation/512/2      766354         802899            881
BM_GraphCreation/4k/2      6966973        7019842             98
BM_GraphCreation/32k/2    82443771       82643748              8
BM_GraphCreation/10/4        82697         102636           6743
BM_GraphCreation/64/4       171184         197236           3574
BM_GraphCreation/512/4     1000612        1030750            676
BM_GraphCreation/4k/4      9268842        9346867             74
BM_GraphCreation/32k/4   110080002      110330854              7
BM_GraphCreation/10/8       161076         181417           4764
BM_GraphCreation/64/8       300977         331782           2081
BM_GraphCreation/512/8     1781437        1829938            387
BM_GraphCreation/4k/8     16062834       16148914             44
BM_GraphCreation/32k/8   188352170      188727906              4
BM_GraphCreation/10/16      201874         229188           3049
BM_GraphCreation/64/16      445487         479042           1462
BM_GraphCreation/512/16    3173224        3224053            218
BM_GraphCreation/4k/16    29365146       29457557             24
BM_GraphCreation/32k/16  326978055      327510864              2
PiperOrigin-RevId: 225466082
This commit is contained in:
A. Unique TensorFlower 2018-12-13 17:38:47 -08:00 committed by TensorFlower Gardener
parent 2a67515699
commit 09decf56ba
2 changed files with 11 additions and 12 deletions

View File

@@ -38,9 +38,8 @@ std::pair<EdgeSet::const_iterator, bool> EdgeSet::insert(value_type value) {
} }
// array is full. convert to set. // array is full. convert to set.
s = new std::set<const Edge*>; s = new std::set<const Edge*>;
for (int i = 0; i < kInline; i++) { s->insert(reinterpret_cast<const Edge**>(std::begin(ptrs_)),
s->insert(static_cast<const Edge*>(ptrs_[i])); reinterpret_cast<const Edge**>(std::end(ptrs_)));
}
ptrs_[0] = this; ptrs_[0] = this;
ptrs_[1] = s; ptrs_[1] = s;
// fall through. // fall through.

View File

@@ -35,6 +35,8 @@ limitations under the License.
#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/strings/scanner.h" #include "tensorflow/core/lib/strings/scanner.h"
#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/str_util.h"
@@ -268,22 +270,20 @@ class GraphConstructor {
int gdef_index; int gdef_index;
Node* node; // nullptr until the NodeDef is converted to a Node. Node* node; // nullptr until the NodeDef is converted to a Node.
}; };
// TODO(vrv): Profile this data structure to see if we should use an gtl::FlatMap<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
// alternative implementation of std::unordered_map.
std::unordered_map<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
// Prefixes already used in the GraphDef being imported. // Prefixes already used in the GraphDef being imported.
std::unordered_set<StringPiece, StringPieceHasher> gdef_prefixes_; gtl::FlatSet<StringPiece, StringPieceHasher> gdef_prefixes_;
// Mapping from node name to the existing node in g_. // Mapping from node name to the existing node in g_.
std::unordered_map<StringPiece, Node*, StringPieceHasher> existing_nodes_; gtl::FlatMap<StringPiece, Node*, StringPieceHasher> existing_nodes_;
// Prefixes already used in the graph. // Prefixes already used in the graph.
std::unordered_set<StringPiece, StringPieceHasher> existing_prefixes_; gtl::FlatSet<StringPiece, StringPieceHasher> existing_prefixes_;
// Imported node names that have been uniquified. The key is the original // Imported node names that have been uniquified. The key is the original
// name, the value is the new unique name. // name, the value is the new unique name.
std::unordered_map<string, string> uniquified_names_; gtl::FlatMap<string, string> uniquified_names_;
// Index of NodeDefs in node_defs_ with all inputs already converted. We use a // Index of NodeDefs in node_defs_ with all inputs already converted. We use a
// (sorted) set so nodes are created in the order defined in the GraphDef. // (sorted) set so nodes are created in the order defined in the GraphDef.
@@ -360,7 +360,7 @@ bool NodeNameInValues(const std::vector<string>& control_dependencies,
// Adds any prefixes of `node_name` (not including the full name itself) to // Adds any prefixes of `node_name` (not including the full name itself) to
// `prefixes`. // `prefixes`.
void AddPrefixes(StringPiece node_name, void AddPrefixes(StringPiece node_name,
std::unordered_set<StringPiece, StringPieceHasher>* prefixes) { gtl::FlatSet<StringPiece, StringPieceHasher>* prefixes) {
size_t idx = -1; size_t idx = -1;
while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) { while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) {
prefixes->insert(node_name.substr(0, idx)); prefixes->insert(node_name.substr(0, idx));
@@ -857,7 +857,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
for (int i = 0; i < coloc_values.size(); ++i) { for (int i = 0; i < coloc_values.size(); ++i) {
StringPiece val(coloc_values[i]); StringPiece val(coloc_values[i]);
if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) { if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
const auto& name_pair = uniquified_names_.find(string(val)); auto name_pair = uniquified_names_.find(string(val));
if (name_pair == uniquified_names_.end()) continue; if (name_pair == uniquified_names_.end()) continue;
updated = true; updated = true;
coloc_values[i] = coloc_values[i] =