Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fix and refine cloud training for DS2 #201

Merged
merged 3 commits into from
Aug 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions deep_speech_2/cloud/_init_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path
import sys


def add_path(path):
    """Prepend a directory to ``sys.path`` unless it is already listed.

    :param path: Directory to make importable.
    :type path: basestring
    """
    if path not in sys.path:
        # Insert at the front so this directory is searched before the
        # entries that are already on the module search path.
        sys.path.insert(0, path)


# Put the DS2 project root (the parent of this directory) on sys.path.
this_dir = os.path.dirname(__file__)
proj_path = os.path.join(this_dir, os.pardir)
add_path(proj_path)
37 changes: 21 additions & 16 deletions deep_speech_2/cloud/pcloud_submit.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
# Configure input data set in local filesystem
TRAIN_MANIFEST="/home/work/demo/ds2/pcloud/models/deep_speech_2/datasets/manifest.dev"
TEST_MANIFEST="/home/work/demo/ds2/pcloud/models/deep_speech_2/datasets/manifest.dev"
VOCAB_FILE="/home/work/demo/ds2/pcloud/models/deep_speech_2/datasets/vocab/eng_vocab.txt"
MEAN_STD_FILE="/home/work/demo/ds2/pcloud/models/deep_speech_2/mean_std.npz"

TRAIN_MANIFEST="../datasets/manifest.train"
DEV_MANIFEST="../datasets/manifest.dev"
VOCAB_FILE="../datasets/vocab/eng_vocab.txt"
MEAN_STD_FILE="../mean_std.npz"
# Configure output path in PaddleCloud filesystem
CLOUD_DATA_DIR="/pfs/dlnel/home/demo/deepspeech2/data"
CLOUD_MODEL_DIR="/pfs/dlnel/home/demo/deepspeech2/model"
CLOUD_DATA_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/data"
CLOUD_MODEL_DIR="/pfs/dlnel/home/sunxinghai@baidu.com/deepspeech2/model"
# Configure cloud resources
NUM_CPU=8
NUM_GPU=8
NUM_NODE=1
MEMORY="10Gi"
IS_LOCAL="True"

# Pack and upload local data to PaddleCloud filesystem
python upload_data.py \
--train_manifest_path=${TRAIN_MANIFEST} \
--test_manifest_path=${TEST_MANIFEST} \
--dev_manifest_path=${DEV_MANIFEST} \
--vocab_file=${VOCAB_FILE} \
--mean_std_file=${MEAN_STD_FILE} \
--cloud_data_path=${CLOUD_DATA_DIR}
Expand All @@ -21,23 +26,23 @@ then
exit 1
fi

JOB_NAME=deepspeech`date +%Y%m%d%H%M%S`
# Submit job to PaddleCloud
JOB_NAME=deepspeech-`date +%Y%m%d%H%M%S`
DS2_PATH=${PWD%/*}
cp -f pcloud_train.sh ${DS2_PATH}

# Configure computation resource and submit job to PaddleCloud
paddlecloud submit \
-image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \
-jobname ${JOB_NAME} \
-cpu 4 \
-gpu 4 \
-memory 10Gi \
-parallelism 2 \
-cpu ${NUM_CPU} \
-gpu ${NUM_GPU} \
-memory ${MEMORY} \
-parallelism ${NUM_NODE} \
-pscpu 1 \
-pservers 1 \
-psmemory 10Gi \
-psmemory ${MEMORY} \
-passes 1 \
-entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR}" \
-entry "sh pcloud_train.sh ${CLOUD_DATA_DIR} ${CLOUD_MODEL_DIR} ${NUM_CPU} ${NUM_GPU} ${IS_LOCAL}" \
${DS2_PATH}

rm ${DS2_PATH}/pcloud_train.sh
32 changes: 20 additions & 12 deletions deep_speech_2/cloud/pcloud_train.sh
Original file line number Diff line number Diff line change
@@ -1,28 +1,36 @@
DATA_PATH=$1
MODEL_PATH=$2
NUM_CPU=$3
NUM_GPU=$4
IS_LOCAL=$5

TRAIN_MANI=${DATA_PATH}/cloud.train.manifest
DEV_MANI=${DATA_PATH}/cloud.test.manifest
DEV_MANI=${DATA_PATH}/cloud.dev.manifest
TRAIN_TAR=${DATA_PATH}/cloud.train.tar
DEV_TAR=${DATA_PATH}/cloud.test.tar
DEV_TAR=${DATA_PATH}/cloud.dev.tar
VOCAB_PATH=${DATA_PATH}/vocab.txt
MEAN_STD_FILE=${DATA_PATH}/mean_std.npz

# split train data for each pcloud node
python ./cloud/split_data.py \
--in_manifest_path=$TRAIN_MANI \
--data_tar_path=$TRAIN_TAR \
--out_manifest_path='./local.train.manifest'
--in_manifest_path=${TRAIN_MANI} \
--data_tar_path=${TRAIN_TAR} \
--out_manifest_path='/local.train.manifest'

# split dev data for each pcloud node
python ./cloud/split_data.py \
--in_manifest_path=$DEV_MANI \
--data_tar_path=$DEV_TAR \
--out_manifest_path='./local.test.manifest'
--in_manifest_path=${DEV_MANI} \
--data_tar_path=${DEV_TAR} \
--out_manifest_path='/local.dev.manifest'

# run train
python train.py \
--use_gpu=1 \
--mean_std_filepath=$MEAN_STD_FILE \
--train_manifest_path='./local.train.manifest' \
--dev_manifest_path='./local.test.manifest' \
--vocab_filepath=$VOCAB_PATH \
--trainer_count=${NUM_GPU} \
--num_threads_data=${NUM_CPU} \
--is_local=${IS_LOCAL} \
--mean_std_filepath=${MEAN_STD_FILE} \
--train_manifest_path='/local.train.manifest' \
--dev_manifest_path='/local.dev.manifest' \
--vocab_filepath=${VOCAB_PATH} \
--output_model_dir=${MODEL_PATH}
1 change: 1 addition & 0 deletions deep_speech_2/cloud/split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import json
import argparse
Expand Down
107 changes: 51 additions & 56 deletions deep_speech_2/cloud/upload_data.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
"""This tool is used for preparing data for DeepSpeech2 training on paddle cloud.
"""This script is used for preparing data for DeepSpeech2 training on paddle
cloud.

Steps:
1. Read original manifest and get the local path of sound files.
2. Tar all local sound files into one tar file.
3. Modify original manifest to remove the local path information.

Finally, we will get a tar file and a manifest with sound file name, duration
and text.
Finally, we will get a tar file and a new manifest.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os
import tarfile
import sys
import argparse
import shutil
sys.path.append('../')
from data_utils.utils import read_manifest
from subprocess import call
import _init_paths
from data_utils.utils import read_manifest

TRAIN_TAR = "cloud.train.tar"
TRAIN_MANIFEST = "cloud.train.manifest"
TEST_TAR = "cloud.test.tar"
TEST_MANIFEST = "cloud.test.manifest"
DEV_TAR = "cloud.dev.tar"
DEV_MANIFEST = "cloud.dev.manifest"
VOCAB_FILE = "vocab.txt"
MEAN_STD_FILE = "mean_std.npz"

Expand All @@ -33,41 +34,41 @@
"--train_manifest_path",
default="../datasets/manifest.train",
type=str,
help="Manifest file of train data. (default: %(default)s)")
help="Manifest file path for train data. (default: %(default)s)")
parser.add_argument(
"--test_manifest_path",
default="../datasets/manifest.test",
"--dev_manifest_path",
default="../datasets/manifest.dev",
type=str,
help="Manifest file of test data. (default: %(default)s)")
help="Manifest file path for validation data. (default: %(default)s)")
parser.add_argument(
"--vocab_file",
default="../datasets/vocab/eng_vocab.txt",
type=str,
help="Vocab file to be uploaded to paddlecloud. (default: %(default)s)")
help="Vocabulary file to be uploaded to paddlecloud. "
"(default: %(default)s)")
parser.add_argument(
"--mean_std_file",
default="../mean_std.npz",
type=str,
help="mean_std file to be uploaded to paddlecloud. (default: %(default)s)")
help="Normalizer's statistics (mean and stddev) file to be uploaded to "
"paddlecloud. (default: %(default)s)")
parser.add_argument(
"--cloud_data_path",
required=True,
type=str,
help="Destination path on paddlecloud. (default: %(default)s)")
args = parser.parse_args()

help="Destination path on paddlecloud. (default: %(default)s)")
parser.add_argument(
"--local_tmp_path",
default="./tmp/",
type=str,
help="Local directory for storing temporary data. (default: %(default)s)")
help="Local directory for storing temporary data. (default: %(default)s)")
args = parser.parse_args()


def pack_data(manifest_path, out_tar_path, out_manifest_path):
'''1. According to the manifest, tar sound files into out_tar_path
2. Generate a new manifest for output tar file
'''
"""1. According to the manifest, tar sound files into out_tar_path.
2. Generate a new manifest for output tar file.
"""
out_tar = tarfile.open(out_tar_path, 'w')
manifest = read_manifest(manifest_path)
results = []
Expand All @@ -83,11 +84,19 @@ def pack_data(manifest_path, out_tar_path, out_manifest_path):
out_tar.close()


def pcloud_mkdir(dir):
    """Make directory in PaddleCloud filesystem.

    :param dir: Directory path handed to ``paddlecloud mkdir``.
    :type dir: basestring
    :raises IOError: If the ``paddlecloud mkdir`` command exits with a
                     non-zero status.
    """
    if call(['paddlecloud', 'mkdir', dir]) != 0:
        raise IOError("PaddleCloud mkdir failed: %s." % dir)


def pcloud_cp(src, dst):
    """Copy src from local filesystem to dst in PaddleCloud filesystem,
    or download src from PaddleCloud filesystem to dst in local filesystem.

    :param src: Source path handed to ``paddlecloud cp``.
    :type src: basestring
    :param dst: Destination path handed to ``paddlecloud cp``.
    :type dst: basestring
    :raises IOError: If the ``paddlecloud cp`` command exits with a non-zero
                     status.
    """
    # Fail loudly instead of returning the exit status, so that a failed
    # transfer cannot be silently ignored by the caller.
    if call(['paddlecloud', 'cp', src, dst]) != 0:
        raise IOError("PaddleCloud cp failed: from [%s] to [%s]." % (src, dst))


def pcloud_exist(path):
Expand All @@ -100,48 +109,34 @@ def pcloud_exist(path):
if __name__ == '__main__':
cloud_train_manifest = os.path.join(args.cloud_data_path, TRAIN_MANIFEST)
cloud_train_tar = os.path.join(args.cloud_data_path, TRAIN_TAR)
cloud_test_manifest = os.path.join(args.cloud_data_path, TEST_MANIFEST)
cloud_test_tar = os.path.join(args.cloud_data_path, TEST_TAR)
cloud_dev_manifest = os.path.join(args.cloud_data_path, DEV_MANIFEST)
cloud_dev_tar = os.path.join(args.cloud_data_path, DEV_TAR)
cloud_vocab_file = os.path.join(args.cloud_data_path, VOCAB_FILE)
cloud_mean_file = os.path.join(args.cloud_data_path, MEAN_STD_FILE)

local_train_manifest = os.path.join(args.local_tmp_path, TRAIN_MANIFEST)
local_train_tar = os.path.join(args.local_tmp_path, TRAIN_TAR)
local_test_manifest = os.path.join(args.local_tmp_path, TEST_MANIFEST)
local_test_tar = os.path.join(args.local_tmp_path, TEST_TAR)
local_dev_manifest = os.path.join(args.local_tmp_path, DEV_MANIFEST)
local_dev_tar = os.path.join(args.local_tmp_path, DEV_TAR)

# prepare local and cloud dir
if os.path.exists(args.local_tmp_path):
shutil.rmtree(args.local_tmp_path)
os.makedirs(args.local_tmp_path)
pcloud_mkdir(args.cloud_data_path)

# pack and upload train data
pack_data(args.train_manifest_path, local_train_tar, local_train_manifest)
pcloud_cp(local_train_manifest, cloud_train_manifest)
pcloud_cp(local_train_tar, cloud_train_tar)

# pack and upload validation data
pack_data(args.dev_manifest_path, local_dev_tar, local_dev_manifest)
pcloud_cp(local_dev_manifest, cloud_dev_manifest)
pcloud_cp(local_dev_tar, cloud_dev_tar)

# train data
if args.train_manifest_path != "":
ret = pcloud_exist(cloud_train_manifest)
if ret != 0:
pack_data(args.train_manifest_path, local_train_tar,
local_train_manifest)
pcloud_cp(local_train_manifest, cloud_train_manifest)
pcloud_cp(local_train_tar, cloud_train_tar)

# test data
if args.test_manifest_path != "":
ret = pcloud_exist(cloud_test_manifest)
if ret != 0:
pack_data(args.test_manifest_path, local_test_tar,
local_test_manifest)
pcloud_cp(local_test_manifest, cloud_test_manifest)
pcloud_cp(local_test_tar, cloud_test_tar)

# vocab file
if args.vocab_file != "":
ret = pcloud_exist(cloud_vocab_file)
if ret != 0:
pcloud_cp(args.vocab_file, cloud_vocab_file)

# mean_std file
if args.mean_std_file != "":
ret = pcloud_exist(cloud_mean_file)
if ret != 0:
pcloud_cp(args.mean_std_file, cloud_mean_file)
# upload vocab file and mean_std file
pcloud_cp(args.vocab_file, cloud_vocab_file)
pcloud_cp(args.mean_std_file, cloud_mean_file)

shutil.rmtree(args.local_tmp_path)
6 changes: 5 additions & 1 deletion deep_speech_2/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def train(self,
gradient_clipping,
num_passes,
output_model_dir,
is_local=True,
num_iterations_print=100):
"""Train the model.

Expand All @@ -65,6 +66,8 @@ def train(self,
:param num_iterations_print: Number of training iterations for printing
a training loss.
:type num_iterations_print: int
:param is_local: Set to False if running with pserver with multi-nodes.
:type is_local: bool
:param output_model_dir: Directory for saving the model (every pass).
:type output_model_dir: basestring
"""
Expand All @@ -79,7 +82,8 @@ def train(self,
trainer = paddle.trainer.SGD(
cost=self._loss,
parameters=self._parameters,
update_equation=optimizer)
update_equation=optimizer,
is_local=is_local)

# create event handler
def event_handler(event):
Expand Down
9 changes: 8 additions & 1 deletion deep_speech_2/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,12 @@
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
parser.add_argument(
"--is_local",
default=True,
type=distutils.util.strtobool,
help="Set to false if running with pserver in paddlecloud. "
"(default: %(default)s)")
args = parser.parse_args()


Expand Down Expand Up @@ -173,7 +179,8 @@ def train():
gradient_clipping=400,
num_passes=args.num_passes,
num_iterations_print=args.num_iterations_print,
output_model_dir=args.output_model_dir)
output_model_dir=args.output_model_dir,
is_local=args.is_local)


def main():
Expand Down