Adding ROCm support for compare_and_bitpack op
This commit is contained in:
parent
b211c7a053
commit
bf65e06819
@ -164,7 +164,7 @@ struct CompareAndBitpack<CPUDevice, T> {
|
||||
|
||||
} // namespace functor
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#define REGISTER_COMPARE_AND_BITPACK(type) \
|
||||
REGISTER_KERNEL_BUILDER( \
|
||||
@ -193,6 +193,6 @@ TF_CALL_bool(DECLARE_GPU_SPEC)
|
||||
|
||||
} // namespace functor
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
@ -41,7 +41,7 @@ __global__ void CompareAndBitpackKernel(const int size, const T* threshold,
|
||||
// result for 4 blocks) followed by an appropriate shift and mask to
|
||||
// get the 8-bits of interest.
|
||||
const T thresh = ldg(threshold);
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const T* block = input + 8 * i;
|
||||
output[i] =
|
||||
((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |
|
||||
@ -61,7 +61,7 @@ __global__ void CompareAndBitpackKernel<bool>(const int size,
|
||||
// TODO(ebrevdo): Erich said: I think you could again have multiple
|
||||
// threads work on one block and use the ballot instruction to the
|
||||
// bit packing in one instruction.
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));
|
||||
// NOTE(ebrevdo): This assumes memory is little-endian.
|
||||
output[i] =
|
||||
@ -81,7 +81,7 @@ __global__ void CompareAndBitpackKernel<float>(const int size,
|
||||
const float* input,
|
||||
uint8* output) {
|
||||
const float thresh = ldg(threshold);
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));
|
||||
const float4 block1 =
|
||||
ldg(reinterpret_cast<const float4*>(input + 8 * i + 4));
|
||||
@ -98,7 +98,7 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
|
||||
const double* input,
|
||||
uint8* output) {
|
||||
const double thresh = ldg(threshold);
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const double2 block0 = ldg(reinterpret_cast<const double2*>(input + 8 * i));
|
||||
const double2 block1 =
|
||||
ldg(reinterpret_cast<const double2*>(input + 8 * i + 2));
|
||||
@ -121,12 +121,12 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
|
||||
TTypes<uint8>::Matrix output) { \
|
||||
const GPUDevice& d = c->eigen_device<GPUDevice>(); \
|
||||
int64 total_count = output.size(); \
|
||||
GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); \
|
||||
GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d); \
|
||||
\
|
||||
TF_CHECK_OK(CudaLaunchKernel(CompareAndBitpackKernel<T>, \
|
||||
config.block_count, config.thread_per_block, \
|
||||
0, d.stream(), total_count, threshold.data(), \
|
||||
input.data(), output.data())); \
|
||||
TF_CHECK_OK(GpuLaunchKernel(CompareAndBitpackKernel<T>, \
|
||||
config.block_count, config.thread_per_block, \
|
||||
0, d.stream(), total_count, threshold.data(), \
|
||||
input.data(), output.data())); \
|
||||
}
|
||||
|
||||
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
|
||||
@ -138,4 +138,4 @@ TF_CALL_bool(DEFINE_GPU_SPECS)
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
Loading…
Reference in New Issue
Block a user