Work around PyTorch internal fragmentation in L3 SSD test. (#3343)
Signed-off-by: Michal Zientkiewicz <michalz@nvidia.com>
mzient authored and Rafal committed Sep 16, 2021
1 parent 96d1c94 commit 54034c4
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions qa/TL3_SSD_convergence/test_pytorch.sh
@@ -18,6 +18,8 @@ NUM_GPUS=$(nvidia-smi -L | wc -l)
 LOG=dali.log
 
 SECONDS=0
+# Prevent OOM due to fragmentation on 16G machines
+export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096
 python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.25 2>&1 | tee $LOG
 
 RET=${PIPESTATUS[0]}
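Note (not part of the commit): max_split_size_mb tells PyTorch's CUDA caching allocator not to split cached blocks larger than the given size (4096 MB here), which limits fragmentation of large allocations and helps avoid spurious out-of-memory errors on 16 GB GPUs. Below is a minimal, illustrative sketch for checking that the setting actually reaches the training process; it assumes a local PyTorch install with CUDA available and is not part of the test script itself.

# Export the allocator option before any CUDA allocation happens;
# the caching allocator reads PYTORCH_CUDA_ALLOC_CONF once, at initialization.
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096

python - <<'EOF'
import os
import torch

# Confirm the environment variable is visible to the Python process.
print("PYTORCH_CUDA_ALLOC_CONF =", os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))

if torch.cuda.is_available():
    # Trigger allocator initialization and print its statistics;
    # heavy fragmentation shows up as many small cached blocks in the summary.
    _ = torch.empty(1024, 1024, device="cuda")
    print(torch.cuda.memory_summary(abbreviated=True))
EOF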
