Adding ROCm support for compare_and_bitpack op

Deven Desai 2019-05-29 22:13:31 +00:00
parent b211c7a053
commit bf65e06819
2 changed files with 13 additions and 13 deletions

View File

@@ -164,7 +164,7 @@ struct CompareAndBitpack<CPUDevice, T> {
 }  // namespace functor
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_COMPARE_AND_BITPACK(type) \
   REGISTER_KERNEL_BUILDER(                 \
@@ -193,6 +193,6 @@ TF_CALL_bool(DECLARE_GPU_SPEC)
 }  // namespace functor
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace tensorflow
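
For context, the guard this first hunk widens wraps the op's GPU kernel registration, so that registration is now compiled for ROCm builds as well as CUDA builds. A minimal sketch of that pattern, reconstructed from the identifiers visible in the hunk (the exact macro body and the CompareAndBitpackOp class template are assumptions, not text copied from this commit):

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// GPU registration for one dtype; with the widened guard this now builds
// under both CUDA and ROCm. (Sketch only; see note above.)
#define REGISTER_COMPARE_AND_BITPACK(type)                                    \
  REGISTER_KERNEL_BUILDER(                                                    \
      Name("CompareAndBitpack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
      CompareAndBitpackOp<GPUDevice, type>)

TF_CALL_GPU_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK);
TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK);
#undef REGISTER_COMPARE_AND_BITPACK

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM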

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -41,7 +41,7 @@ __global__ void CompareAndBitpackKernel(const int size, const T* threshold,
   // result for 4 blocks) followed by an appropriate shift and mask to
   // get the 8-bits of interest.
   const T thresh = ldg(threshold);
-  CUDA_1D_KERNEL_LOOP(i, size) {
+  GPU_1D_KERNEL_LOOP(i, size) {
     const T* block = input + 8 * i;
     output[i] =
         ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |
@@ -61,7 +61,7 @@ __global__ void CompareAndBitpackKernel<bool>(const int size,
   // TODO(ebrevdo): Erich said: I think you could again have multiple
   // threads work on one block and use the ballot instruction to the
   // bit packing in one instruction.
-  CUDA_1D_KERNEL_LOOP(i, size) {
+  GPU_1D_KERNEL_LOOP(i, size) {
     const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));
     // NOTE(ebrevdo): This assumes memory is little-endian.
     output[i] =
@@ -81,7 +81,7 @@ __global__ void CompareAndBitpackKernel<float>(const int size,
                                                const float* input,
                                                uint8* output) {
   const float thresh = ldg(threshold);
-  CUDA_1D_KERNEL_LOOP(i, size) {
+  GPU_1D_KERNEL_LOOP(i, size) {
     const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));
     const float4 block1 =
         ldg(reinterpret_cast<const float4*>(input + 8 * i + 4));
@@ -98,7 +98,7 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
                                                 const double* input,
                                                 uint8* output) {
   const double thresh = ldg(threshold);
-  CUDA_1D_KERNEL_LOOP(i, size) {
+  GPU_1D_KERNEL_LOOP(i, size) {
     const double2 block0 = ldg(reinterpret_cast<const double2*>(input + 8 * i));
     const double2 block1 =
         ldg(reinterpret_cast<const double2*>(input + 8 * i + 2));
@@ -121,12 +121,12 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
                        TTypes<uint8>::Matrix output) {                        \
     const GPUDevice& d = c->eigen_device<GPUDevice>();                        \
     int64 total_count = output.size();                                        \
-    GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d);             \
+    GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);              \
                                                                               \
-    TF_CHECK_OK(CudaLaunchKernel(CompareAndBitpackKernel<T>,                  \
-                                 config.block_count, config.thread_per_block, \
-                                 0, d.stream(), total_count, threshold.data(),\
-                                 input.data(), output.data()));               \
+    TF_CHECK_OK(GpuLaunchKernel(CompareAndBitpackKernel<T>,                   \
+                                config.block_count, config.thread_per_block,  \
+                                0, d.stream(), total_count, threshold.data(), \
+                                input.data(), output.data()));                \
   }
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
@@ -138,4 +138,4 @@ TF_CALL_bool(DEFINE_GPU_SPECS)
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
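
Taken together, the second file's changes swap the CUDA-only helpers (CUDA_1D_KERNEL_LOOP, GetCudaLaunchConfig, CudaLaunchKernel) for their platform-neutral counterparts (GPU_1D_KERNEL_LOOP, GetGpuLaunchConfig, GpuLaunchKernel), which resolve to CUDA or ROCm/HIP primitives depending on how TensorFlow is built. A minimal sketch of that launch pattern is below; DoubleKernel and LaunchDouble are made-up illustrative names rather than code from this commit, and the include path is an assumption about where these helpers normally live in the tree.

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

// Assumed location of GPU_1D_KERNEL_LOOP, GpuLaunchConfig, GetGpuLaunchConfig
// and GpuLaunchKernel in the TensorFlow tree.
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// Illustrative kernel (not part of this commit): a grid-stride loop over
// `size` elements. GPU_1D_KERNEL_LOOP expands to the right loop for either a
// CUDA or a ROCm (HIP) build, unlike the older CUDA_1D_KERNEL_LOOP.
template <typename T>
__global__ void DoubleKernel(const int size, const T* in, T* out) {
  GPU_1D_KERNEL_LOOP(i, size) { out[i] = in[i] + in[i]; }
}

// Host-side launch using the platform-neutral helpers this commit adopts.
template <typename T>
void LaunchDouble(const GPUDevice& d, int size, const T* in, T* out) {
  // GetGpuLaunchConfig (replacing GetCudaLaunchConfig) picks a block/grid
  // shape for the active device.
  GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
  // GpuLaunchKernel (replacing CudaLaunchKernel) takes the kernel, grid and
  // block sizes, shared-memory bytes, the stream, then the kernel arguments,
  // and returns a Status checked here with TF_CHECK_OK.
  TF_CHECK_OK(GpuLaunchKernel(DoubleKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(), size, in,
                              out));
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM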