Merge pull request #40305 from ROCmSoftwarePlatform:google-upstream-kernel-helper-test
PiperOrigin-RevId: 317684035
Change-Id: I743fb0d2c8ba73cbe66a399c921b28d09762ef39
commit 809e482247
@@ -53,6 +53,8 @@ using gpuEvent_t = cudaEvent_t;
 #define gpuEventCreate cudaEventCreate
 #define gpuEventCreateWithFlags cudaEventCreateWithFlags
 #define gpuEventDisableTiming cudaEventDisableTiming
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuFree cudaFree
 #elif TENSORFLOW_USE_ROCM
 using gpuFloatComplex = hipFloatComplex;
 using gpuDoubleComplex = hipDoubleComplex;
@@ -68,6 +70,8 @@ using cudaError_t = int;
 #define gpuEventCreate hipEventCreate
 #define gpuEventCreateWithFlags hipEventCreateWithFlags
 #define gpuEventDisableTiming hipEventDisableTiming
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuFree hipFree
 static std::string cudaGetErrorString(int err) { return std::to_string(err); }
 #endif
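The two hunks above extend the gpu* aliasing layer so shared test code can call one name and get the matching CUDA or HIP runtime function. A minimal sketch of the intent (the helper below is illustrative, not part of the patch):

```cpp
// Illustrative only (SyncAndRelease is not from the patch): shared test
// code calls the gpu* alias and the preprocessor resolves it to the CUDA
// or HIP runtime function selected at build time.
void SyncAndRelease(int* buf) {
  gpuDeviceSynchronize();  // -> cudaDeviceSynchronize() / hipDeviceSynchronize()
  gpuFree(buf);            // -> cudaFree(buf) / hipFree(buf)
}
```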
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
 
+#include <time.h>
+
 #include <numeric>
 
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,14 +27,14 @@ limitations under the License.
 
 #define CUDA_EXPECT_SUCCESS                                 \
   {                                                         \
-    cudaDeviceSynchronize();                                \
+    gpuDeviceSynchronize();                                 \
     cudaError_t err = cudaGetLastError();                   \
     EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
   }
 
 #define CUDA_ASSERT_SUCCESS                                 \
   {                                                         \
-    cudaDeviceSynchronize();                                \
+    gpuDeviceSynchronize();                                 \
     cudaError_t err = cudaGetLastError();                   \
     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
   }
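Both macros follow the standard pattern for checking asynchronous GPU work: a kernel launch returns to the host immediately, so a launch failure or in-kernel fault only becomes observable once the stream is drained. Expanded, a use of CUDA_EXPECT_SUCCESS amounts to the sketch below (SomeKernel and its launch shape are hypothetical):

```cpp
SomeKernel<<<1, 256>>>(outbuf);          // returns immediately
{
  gpuDeviceSynchronize();                // drain the stream first
  cudaError_t err = cudaGetLastError();  // then read the sticky error state
  EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
}
```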
@@ -94,8 +96,7 @@ __global__ void Count3D(Gpu3DLaunchConfig config, int bufsize,
   }
 }
 
-__global__ void CudaShuffleGetSrcLaneTest(
-    unsigned* __restrict__ failure_count) {
+__global__ void GpuShuffleGetSrcLaneTest(unsigned* __restrict__ failure_count) {
   unsigned lane_id = GpuLaneId();
   for (int width = warpSize; width > 1; width /= 2) {
     auto check_result = [&](const char* op_name, int param, unsigned actual,
@@ -103,31 +104,38 @@ __global__ void CudaShuffleGetSrcLaneTest(
       if (actual != expected) {
         printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
                op_name, param, width, lane_id, actual, expected);
-        CudaAtomicAdd(failure_count, 1);
+        GpuAtomicAdd(failure_count, 1);
       }
     };
 
     for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
-      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
+#if TENSORFLOW_USE_ROCM
+      if (src_lane < 0 || src_lane >= width) continue;
+#endif
+      unsigned actual_lane = detail::GpuShuffleGetSrcLane(src_lane, width);
       unsigned expect_lane =
-          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
+          GpuShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
       check_result("Shuffle", src_lane, actual_lane, expect_lane);
     }
 
     for (unsigned delta = 0; delta <= warpSize; ++delta) {
-      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
+      unsigned actual_lane = detail::GpuShuffleUpGetSrcLane(delta, width);
       unsigned expect_lane =
-          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
+          GpuShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
       check_result("ShuffleUp", delta, actual_lane, expect_lane);
     }
 
     for (unsigned delta = 0; delta <= warpSize; ++delta) {
-      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
+      unsigned actual_lane = detail::GpuShuffleDownGetSrcLane(delta, width);
       unsigned expect_lane =
-          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
+          GpuShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
       check_result("ShuffleDown", delta, actual_lane, expect_lane);
     }
 
     for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
-      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
+      unsigned actual_lane = detail::GpuShuffleXorGetSrcLane(lane_lane, width);
       unsigned expect_lane =
-          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
+          GpuShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
       check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
     }
   }
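The kernel cross-checks TensorFlow's detail::GpuShuffle*GetSrcLane source-lane predictions against the hardware shuffle intrinsics for every sub-warp width; under ROCm the out-of-range source lanes are now skipped, presumably because HIP's out-of-range shuffle behavior differs from the CUDA reference the test was written against. A standalone sketch of the intrinsic semantics being exercised (plain CUDA, hypothetical kernel, outside the TF harness):

```cpp
#include <cstdio>

// Plain-CUDA sketch: lanes are partitioned into width-sized segments, and
// a shuffle-down whose source would cross a segment boundary returns the
// caller's own value.
__global__ void ShuffleDemo() {
  unsigned lane = threadIdx.x & 31;  // lane id within the warp
  const int width = 16;              // sub-warp width (power of two)
  const unsigned delta = 4;
  unsigned got = __shfl_down_sync(0xffffffffu, lane, delta, width);
  unsigned expect = (lane % width) + delta < width ? lane + delta : lane;
  if (got != expect) printf("lane %u: got %u, want %u\n", lane, got, expect);
}

int main() {
  ShuffleDemo<<<1, 32>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```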
@@ -137,19 +145,32 @@ __global__ void CudaShuffleGetSrcLaneTest(
 
 class GpuLaunchConfigTest : public ::testing::Test {
  protected:
-  const int bufsize = 1024;
+  static const int bufsize = 1024;
   int* outbuf = nullptr;
+  int* outbuf_host = nullptr;
+  int hostbuf[bufsize];
   Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
 
+  void copyToHost() {
+#if TENSORFLOW_USE_ROCM
+    hipMemcpy(hostbuf, outbuf, sizeof(int) * bufsize, hipMemcpyDeviceToHost);
+#endif
+  }
   virtual void SetUp() {
+#if GOOGLE_CUDA
     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
+    outbuf_host = outbuf;
+#else
+    cudaError_t err = hipMalloc(&outbuf, sizeof(int) * bufsize);
+    outbuf_host = hostbuf;
+#endif
     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
   }
 
   virtual void TearDown() {
-    cudaDeviceSynchronize();
-    cudaFree(outbuf);
+    gpuDeviceSynchronize();
+    gpuFree(outbuf);
     outbuf = nullptr;
   }
 };
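The fixture's new copyToHost() is a no-op under CUDA because cudaMallocManaged returns memory the host can dereference directly, while the ROCm path allocates plain device memory with hipMalloc and must copy results back explicitly. A sketch of that asymmetry (ReadResults is a hypothetical helper, not from the patch):

```cpp
// Hypothetical helper showing the asymmetry the fixture handles: managed
// memory is host-visible under CUDA, but the ROCm path must stage results
// through an explicit device-to-host copy.
void ReadResults(const int* device_buf, int* host_buf, size_t n) {
#if GOOGLE_CUDA
  cudaDeviceSynchronize();        // make kernel writes visible to the host
  for (size_t i = 0; i < n; ++i)
    host_buf[i] = device_buf[i];  // legal: buffer came from cudaMallocManaged
#else
  hipMemcpy(host_buf, device_buf, n * sizeof(int), hipMemcpyDeviceToHost);
#endif
}
```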
@@ -158,28 +179,32 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
   GpuLaunchConfig cfg;
 
   // test valid inputs
 #define TEST_LAUNCH_PARAMETER(work_element_count)                             \
   cfg = GetGpuLaunchConfig(bufsize, d);                                       \
   TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count,                 \
                               cfg.thread_per_block, 0, d.stream(), cfg,       \
                               outbuf));                                       \
   CUDA_ASSERT_SUCCESS                                                         \
   cfg = GetGpuLaunchConfig(work_element_count, d);                            \
   TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
                               0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                         \
-  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
+  copyToHost();                                                               \
+  EXPECT_EQ(work_element_count,                                               \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0));          \
                                                                               \
   cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
   TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count,                 \
                               cfg.thread_per_block, 0, d.stream(), cfg,       \
                               outbuf));                                       \
   CUDA_ASSERT_SUCCESS                                                         \
   cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
   TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
                               0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                         \
-  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
+  copyToHost();                                                               \
+  EXPECT_EQ(work_element_count,                                               \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0));
 
   TEST_LAUNCH_PARAMETER(128);
   TEST_LAUNCH_PARAMETER(129);
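Each TEST_LAUNCH_PARAMETER expansion zeroes the buffer, launches Count1D over work_element_count virtual threads, and checks that the buffer sums to exactly that count, i.e. every work element was visited once. A sketch of the kernel-side invariant (assumed to mirror the test's Count1D):

```cpp
// Assumed to mirror the test's Count1D: every virtual thread increments
// exactly one slot, so the buffer's sum equals work_element_count.
__global__ void Count1DSketch(GpuLaunchConfig config, int bufsize,
                              int* outbuf) {
  GPU_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
    GpuAtomicAdd(&outbuf[x % bufsize], 1);  // one increment per work element
  }
}
```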
@@ -221,7 +246,9 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
   TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
                                0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));         \
+  copyToHost();                                                                \
+  EXPECT_EQ(dimx* dimy,                                                        \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0));           \
                                                                                \
   cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                 \
   TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count,               \
@@ -232,7 +259,8 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
   TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
                                0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
+  copyToHost();                                                                \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf_host, outbuf_host + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128);
   TEST_LAUNCH_PARAMETER(129, 64);
@@ -263,7 +291,9 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
   TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \
                                0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
+  copyToHost();                                                                \
+  EXPECT_EQ(dimx* dimy* dimz,                                                  \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128, 128);
   TEST_LAUNCH_PARAMETER(129, 64, 1024);
@@ -282,15 +312,19 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
 
 TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
   unsigned* failure_count;
+#if GOOGLE_CUDA
   ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
+#else
+  ASSERT_EQ(hipHostMalloc(&failure_count, sizeof(unsigned), 0), cudaSuccess);
+#endif
   *failure_count = 0;
-  TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
-                               failure_count));
-  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  TF_EXPECT_OK(GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, TF_RED_WARPSIZE, 0,
+                               nullptr, failure_count));
+  ASSERT_EQ(gpuDeviceSynchronize(), cudaSuccess);
   ASSERT_EQ(*failure_count, 0);
-  cudaFree(failure_count);
+  gpuFree(failure_count);
 }
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
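Two ROCm-specific substitutions stand out in this last hunk: hipHostMalloc replaces cudaMallocManaged so the host can read failure_count after the kernel runs, and the hard-coded launch width of 32 becomes TF_RED_WARPSIZE, since a warp is 32 lanes on NVIDIA hardware while a wavefront is 64 lanes on AMD GCN. A sketch of the constant's role (the values reflect the hardware; the kWarpSize name is illustrative):

```cpp
// Warp/wavefront width differs per vendor; launching a "full warp" test
// kernel with a literal 32 would cover only half an AMD wavefront.
#if GOOGLE_CUDA
constexpr int kWarpSize = 32;  // NVIDIA warp
#else                          // TENSORFLOW_USE_ROCM
constexpr int kWarpSize = 64;  // AMD GCN wavefront
#endif
// e.g. GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, kWarpSize, 0, nullptr, ...);
```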