From 7f12fa50f1615e4a2e78482a39e1a5d2db809734 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Mon, 22 Jun 2020 09:46:09 -0700 Subject: [PATCH] Update NCCL to v2.7.3. PiperOrigin-RevId: 317672859 Change-Id: Ice6b6ac0875f1b6f8daa6ad9d9539621b22e6666 --- tensorflow/workspace.bzl | 8 +- third_party/nccl/archive.BUILD | 1 + third_party/nccl/archive.patch | 181 ++++++++++++++++++++++++++++++++- 3 files changed, 183 insertions(+), 7 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index e6a15b422eb..73698987a08 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -802,11 +802,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "nccl_archive", build_file = clean_dep("//third_party:nccl/archive.BUILD"), patch_file = clean_dep("//third_party/nccl:archive.patch"), - sha256 = "7ff66aca18392b162829612e02c00b123a58ec35869334f72d7e5afaf5ea4a13", - strip_prefix = "nccl-3701130b3c1bcdb01c14b3cb70fe52498c1e82b7", + sha256 = "67e15ce3d12ba9ea1e0cb239599202b0f61c146149699341043c072de388e90a", + strip_prefix = "nccl-5949d96f36d050e59d05872f8bbffd2549318e95", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nvidia/nccl/archive/3701130b3c1bcdb01c14b3cb70fe52498c1e82b7.tar.gz", - "https://github.com/nvidia/nccl/archive/3701130b3c1bcdb01c14b3cb70fe52498c1e82b7.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nvidia/nccl/archive/5949d96f36d050e59d05872f8bbffd2549318e95.tar.gz", + "https://github.com/nvidia/nccl/archive/5949d96f36d050e59d05872f8bbffd2549318e95.tar.gz", ], ) diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD index 9b0c9bdda1d..028b348caff 100644 --- a/third_party/nccl/archive.BUILD +++ b/third_party/nccl/archive.BUILD @@ -46,6 +46,7 @@ gen_device_srcs( "src/collectives/device/broadcast.cu.cc", "src/collectives/device/reduce.cu.cc", "src/collectives/device/reduce_scatter.cu.cc", + "src/collectives/device/sendrecv.cu.cc", ], ) diff --git a/third_party/nccl/archive.patch b/third_party/nccl/archive.patch index ea0ead71fb2..94ef48d00e8 100644 --- a/third_party/nccl/archive.patch +++ b/third_party/nccl/archive.patch @@ -22,6 +22,10 @@ diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/r similarity index 100% rename from src/collectives/device/reduce_scatter.cu rename to src/collectives/device/reduce_scatter.cu.cc +diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu.cc +similarity index 100% +rename from src/collectives/device/sendrecv.cu +rename to src/collectives/device/sendrecv.cu.cc diff --git a/src/nccl.h.in b/src/nccl.h similarity index 98% rename from src/nccl.h.in @@ -38,12 +42,183 @@ index 985274e..7ebb1e1 100644 -#define NCCL_PATCH ${nccl:Patch} -#define NCCL_SUFFIX "${nccl:Suffix}" +#define NCCL_MAJOR 2 -+#define NCCL_MINOR 5 -+#define NCCL_PATCH 7 ++#define NCCL_MINOR 7 ++#define NCCL_PATCH 3 +#define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE ${nccl:Version} -+#define NCCL_VERSION_CODE 2507 ++#define NCCL_VERSION_CODE 2703 #define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) #ifdef __cplusplus +See https://github.com/NVIDIA/nccl/pull/322.patch +From 410d341bd4569f60282576daa5c991717dbd560e Mon Sep 17 00:00:00 2001 +From: Danilo +Date: Tue, 14 Apr 2020 14:52:42 +0200 +Subject: [PATCH 1/2] Fix memory leak in xml.cc. + +This patch fixes the memory leak documented in +https://github.com/NVIDIA/nccl/issues/321, where one of the buffers +allocated by realpath(), inside getPciPath() is not freed. + +The memory management aspect of this function also seemed odd and +unecessary, as the realpath() function is documented to only write up to +PATH_MAX bytes to the buffer passed to it, meaning we don't need dynamic +memory allocation at all. I also changed the function signature of +getPciPath to enforce the use of a fixed-size buffer. +--- + src/graph/xml.cc | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/src/graph/xml.cc b/src/graph/xml.cc +index 550cfcd0c..8fea91950 100644 +--- a/src/graph/xml.cc ++++ b/src/graph/xml.cc +@@ -323,12 +323,14 @@ ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml + static void memcpylower(char* dst, const char* src, const size_t size) { + for (int i=0; iparent; + if (parent == NULL) { +- if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); ++ NCCLCHECK(getPciPath(busId, path)); + + // Save that for later in case next step is a CPU + char numaIdStr[MAX_STR_LEN]; +@@ -544,7 +546,6 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* + } else if (strcmp(parent->name, "cpu") == 0) { + NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); + } +- free(path); + return ncclSuccess; + } + +@@ -640,8 +641,8 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm + if (index == -1) { + const char* busId; + NCCLCHECK(xmlGetAttr(sub, "target", &busId)); +- char* path; +- NCCLCHECK(getPciPath(busId, &path)); ++ char path[PATH_MAX+1]; ++ NCCLCHECK(getPciPath(busId, path)); + NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); + } + } + +From f02d51952ac587237ea5f7c607a5b379381d09d7 Mon Sep 17 00:00:00 2001 +From: Danilo +Date: Tue, 14 Apr 2020 22:17:49 +0200 +Subject: [PATCH 2/2] Performance tweaks in ncclTopoGetXmlFromSys. + +Reduce the number of getPciPath calls to a single one per invocation +and split the function in two so that the large `path` buffer does +not linger the in the stack during recursive calls. +--- + src/graph/xml.cc | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/src/graph/xml.cc b/src/graph/xml.cc +index 8fea91950..42eb68a4b 100644 +--- a/src/graph/xml.cc ++++ b/src/graph/xml.cc +@@ -460,20 +460,21 @@ int checkBDFFormat(char* bdf) { + return 1; + } + +-ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) { ++ncclResult_t ncclTopoGetXmlNodeFromSys(struct ncclXmlNode* pciNode, ++ struct ncclXml* xml, ++ struct ncclXmlNode** return_parent) { + // Fill info, then parent + const char* busId; + NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); + char path[PATH_MAX+1]; ++ NCCLCHECK(getPciPath(busId, path)); + int index; + NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index)); + if (index == -1) { +- NCCLCHECK(getPciPath(busId, path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); + if (index == -1) { +- NCCLCHECK(getPciPath(busId, path)); + char deviceSpeedStr[MAX_STR_LEN]; + float deviceSpeed; + NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr)); +@@ -486,7 +487,6 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index)); + if (index == -1) { +- NCCLCHECK(getPciPath(busId, path)); + char strValue[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue)); + int deviceWidth = strtol(strValue, NULL, 0); +@@ -496,8 +496,6 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* + } + struct ncclXmlNode* parent = pciNode->parent; + if (parent == NULL) { +- NCCLCHECK(getPciPath(busId, path)); +- + // Save that for later in case next step is a CPU + char numaIdStr[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr)); +@@ -541,6 +539,13 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* + pciNode->parent = parent; + parent->subs[parent->nSubs++] = pciNode; + } ++ *return_parent = parent; ++ return ncclSuccess; ++} ++ ++ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) { ++ struct ncclXmlNode* parent; ++ ncclTopoGetXmlNodeFromSys(pciNode, xml, &parent); + if (strcmp(parent->name, "pci") == 0) { + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } else if (strcmp(parent->name, "cpu") == 0) {