Skip to content

Commit

Permalink
Update test_RN50_external_source_parallel_train_ddp.py to work with the latest PyTorch
Browse files Browse the repository at this point in the history

- updates the invocation of the test_RN50_external_source_parallel_train_ddp.py test
  to use torchrun and to propagate the local rank through the LOCAL_RANK environment variable

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
  • Loading branch information
JanuszL committed Dec 4, 2023
1 parent d18879c commit 31ca1f3
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 12 deletions.
16 changes: 8 additions & 8 deletions dali/test/python/test_RN50_external_source_parallel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,17 @@ def parse_test_arguments(supports_distributed):
"with tensor.gpu()",
)

if supports_distributed:
parser.add_argument(
"--local_rank",
default=0,
type=int,
help="Id of the local rank in distributed scenario.",
)
else:
if not supports_distributed:
parser.add_argument("-g", "--gpus", default=1, type=int, metavar="N", help="number of GPUs")
args = parser.parse_args()

if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
if 'LOCAL_RANK' in os.environ:
args.local_rank = int(os.environ['LOCAL_RANK'])
else:
args.local_rank = 0

if supports_distributed:
print(
"GPU ID: {}, batch: {}, epochs: {}, workers: {}, py_workers: {}, prefetch depth: {}, "
Expand Down
8 changes: 4 additions & 4 deletions qa/TL2_FW_iterators_perf/test_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ test_body() {
python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
--workers 3 --prefetch 2 --epochs 3
done
python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 250 --test_pipes scalar
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 250 --test_pipes scalar
}

pushd ../..
Expand Down

0 comments on commit 31ca1f3

Please sign in to comment.