Prevent test failures by manually triggering the Python garbage collector before resetting the server def.

Due to the current implementation of set_server_def, resources might be leaked and then destroyed only after the device manager (and its devices) have been released. When there are multiple set_server_def calls, this leads to non-deterministic segfaults when the Python GC starts to clean up the hanging resources.

PiperOrigin-RevId: 259668467
This commit is contained in:
Haoyu Zhang 2019-07-23 20:56:05 -07:00 committed by TensorFlower Gardener
parent 150a6c06b2
commit 805b28132e

View File

@ -25,6 +25,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gc
import os
import time
@ -1114,8 +1115,7 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark):
wall_time=mean_us,
extras={"examples_per_sec": num_iters / total_time})
# TODO(b/136184459): Re-enable once crash is fixed
def _DISABLED_benchmark_send_mirroring_off(self):
def benchmark_send_mirroring_off(self):
remote.connect_to_remote_host(self._cached_server_target1)
x = random_ops.random_uniform((2, 2)).cpu()
@ -1130,9 +1130,12 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark):
context.context().mirroring_policy = context.MIRRORING_NONE
self._run(lambda: func(x))
# NOTE(b/136184459): Force garbage collecting hanging resources before
# subsequent calls to set_server_def, to ensure the destroy resource ops are
# executed when their corresponding device and manager are still available.
gc.collect()
# TODO(b/136184459): Re-enable once crash is fixed
def _DISABLED_benchmark_send_mirroring_on(self):
def benchmark_send_mirroring_on(self):
remote.connect_to_remote_host(self._cached_server_target1)
x = random_ops.random_uniform((2, 2)).cpu()
@ -1147,9 +1150,12 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark):
context.context().mirroring_policy = context.MIRRORING_ALL
self._run(lambda: func(x))
# NOTE(b/136184459): Force garbage collecting hanging resources before
# subsequent calls to set_server_def, to ensure the destroy resource ops are
# executed when their corresponding device and manager are still available.
gc.collect()
# TODO(b/136184459): Re-enable once crash is fixed
def _DISABLED_benchmark_worker_mirroring_off(self):
def benchmark_worker_mirroring_off(self):
remote.connect_to_remote_host(
[self._cached_server_target1, self._cached_server_target2])
@ -1166,9 +1172,12 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark):
context.context().mirroring_policy = context.MIRRORING_NONE
self._run(func)
# NOTE(b/136184459): Force garbage collecting hanging resources before
# subsequent calls to set_server_def, to ensure the destroy resource ops are
# executed when their corresponding device and manager are still available.
gc.collect()
# TODO(b/136184459): Re-enable once crash is fixed
def _DISABLED_benchmark_worker_mirroring_on(self):
def benchmark_worker_mirroring_on(self):
remote.connect_to_remote_host(
[self._cached_server_target1, self._cached_server_target2])
@ -1185,6 +1194,10 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark):
context.context().mirroring_policy = context.MIRRORING_ALL
self._run(func)
# NOTE(b/136184459): Force garbage collecting hanging resources before
# subsequent calls to set_server_def, to ensure the destroy resource ops are
# executed when their corresponding device and manager are still available.
gc.collect()
if __name__ == "__main__":