From 1c2527fd1134753431796831dcf225ce0846862d Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 27 May 2020 17:17:39 +0000 Subject: [PATCH] [ROCm] Fix for ROCm CSB breakage - 200527 The following commit introduces a new unit-test which fails on ROCm. https://github.com/tensorflow/tensorflow/commit/dbef0933ebe4d3d85be73e88cfe5f83cac0ae1d6 I think that this unit-test is for checking the reduced memory usage of the gradient checkpointing method. The sub-test `test_does_not_raise_oom_exception` fails on ROCm, because on the ROCm platform the scratch space required for doing backward convolution pushes the total memory allocation just beyond the 1GB limit imposed by the testcase. This fix moves up the threshold by 128MB (from 1024 MB to 1152 MB). This still preserves the intent of the unit-test, i.e. the `test_raises_oom_exception` continues to raise the exception, while also allowing the `test_does_not_raise_oom_exception` sub-test to pass on the ROCm platform. --- .../python/keras/integration_test/gradient_checkpoint_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py index 9d9e0a062b3..100f3ca2022 100644 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -75,7 +75,7 @@ def _limit_gpu_memory(): if gpus: tf.config.experimental.set_virtual_device_configuration( gpus[0], - [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)]) + [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1152)]) return True return False