NVIDIA · JanuszL · Dec 5, 2023 · Dec 4, 2023
diff --git a/dali/test/python/test_RN50_external_source_parallel_utils.py b/dali/test/python/test_RN50_external_source_parallel_utils.py
@@ -396,17 +396,17 @@ def parse_test_arguments(supports_distributed):
         "with tensor.gpu()",
     )
 
-    if supports_distributed:
-        parser.add_argument(
-            "--local_rank",
-            default=0,
-            type=int,
-            help="Id of the local rank in distributed scenario.",
-        )
-    else:
+    if not supports_distributed:
         parser.add_argument("-g", "--gpus", default=1, type=int, metavar="N", help="number of GPUs")
     args = parser.parse_args()
 
+    if "WORLD_SIZE" in os.environ:
+        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
+    if "LOCAL_RANK" in os.environ:
+        args.local_rank = int(os.environ["LOCAL_RANK"])
+    else:
+        args.local_rank = 0
+
     if supports_distributed:
         print(
             "GPU ID: {}, batch: {}, epochs: {}, workers: {}, py_workers: {}, prefetch depth: {}, "

diff --git a/qa/TL2_FW_iterators_perf/test_pytorch.sh b/qa/TL2_FW_iterators_perf/test_pytorch.sh
@@ -13,10 +13,10 @@ test_body() {
         python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
             --workers 3 --prefetch 2 --epochs 3
     done
-    python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
-    python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
-    python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
-    python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 250 --test_pipes scalar
+    torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
+    torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
+    torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
+    torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 250 --test_pipes scalar
 }
 
 pushd ../..