diff --git a/xdl/docs/tutorial/v1/xdl-job.yaml b/xdl/docs/tutorial/v1/xdl-job.yaml
new file mode 100644
index 00000000..ece2121b
--- /dev/null
+++ b/xdl/docs/tutorial/v1/xdl-job.yaml
@@ -0,0 +1,125 @@
+# Resource and checkpoint settings for the XDL MNIST example. Every replica
+# of the XDLJob below mounts the "config" key as /config/config.json.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: xdl-mnist-example-config
+data:
+  config: |-
+    {
+      "worker": {
+        "instance_num": 2,
+        "cpu_cores": 4,
+        "gpu_cores": 0,
+        "memory_m": 4000
+      },
+      "ps": {
+        "instance_num": 1,
+        "cpu_cores": 2,
+        "gpu_cores": 0,
+        "memory_m": 1000
+      },
+      "checkpoint": {
+        "output_dir": "/checkpoint/"
+      }
+    }
+---
+# Distributed MNIST training job: 1 Scheduler, 1 PS and 2 Workers. All three
+# replica types share the same image, command, volumes and ZK_ADDR.
+apiVersion: xdl.kubedl.io/v1alpha1
+kind: XDLJob
+metadata:
+  name: xdl-mnist-example
+spec:
+  cleanPodPolicy: None
+  xdlReplicaSpecs:
+    Scheduler:
+      replicas: 1
+      restartPolicy: ExitCode
+      template:
+        spec:
+          containers:
+            - env:
+                # Address of the "zk" ZooKeeper Service (client port 2181).
+                - name: ZK_ADDR
+                  value: zfs://zk:2181/xdl/
+              command:
+                - bash
+                - -c
+                # TASK_NAME / TASK_INDEX are expected to be injected by the KubeDL operator.
+                - exec python mnist.py --task_name $(TASK_NAME) --task_index $(TASK_INDEX) --run_mode dist --zk_addr $(ZK_ADDR) --config /config/config.json
+              image: kubedl/xdl-mnist-example
+              name: xdl
+              volumeMounts:
+                - mountPath: /checkpoint
+                  name: checkpoint
+                - name: config-volume
+                  mountPath: /config
+          volumes:
+            - emptyDir: {}
+              name: checkpoint
+            - name: config-volume
+              configMap:
+                name: xdl-mnist-example-config
+                items:
+                  - key: config
+                    path: config.json
+    PS:
+      replicas: 1
+      restartPolicy: ExitCode
+      template:
+        spec:
+          containers:
+            - env:
+                - name: ZK_ADDR
+                  value: zfs://zk:2181/xdl/
+              command:
+                - bash
+                - -c
+                - exec python mnist.py --task_name $(TASK_NAME) --task_index $(TASK_INDEX) --run_mode dist --zk_addr $(ZK_ADDR) --config /config/config.json
+              image: kubedl/xdl-mnist-example
+              name: xdl
+              volumeMounts:
+                - mountPath: /checkpoint
+                  name: checkpoint
+                - name: config-volume
+                  mountPath: /config
+          volumes:
+            - emptyDir: {}
+              name: checkpoint
+            - name: config-volume
+              configMap:
+                name: xdl-mnist-example-config
+                items:
+                  - key: config
+                    path: config.json
+    Worker:
+      replicas: 2
+      restartPolicy: ExitCode
+      template:
+        spec:
+          containers:
+            - env:
+                - name: ZK_ADDR
+                  value: zfs://zk:2181/xdl/
+              command:
+                - bash
+                - -c
+                - exec python mnist.py --task_name $(TASK_NAME) --task_index $(TASK_INDEX) --run_mode dist --zk_addr $(ZK_ADDR) --config /config/config.json
+              image: kubedl/xdl-mnist-example
+              name: xdl
+              volumeMounts:
+                - mountPath: /checkpoint
+                  name: checkpoint
+                - name: config-volume
+                  mountPath: /config
+          volumes:
+            - emptyDir: {}
+              name: checkpoint
+            - name: config-volume
+              configMap:
+                name: xdl-mnist-example-config
+                items:
+                  - key: config
+                    path: config.json
+  ttlSecondsAfterFinished: 3600
\ No newline at end of file
diff --git a/xdl/docs/tutorial/v1/xdl-zk.yaml b/xdl/docs/tutorial/v1/xdl-zk.yaml
new file mode 100644
index 00000000..63545330
--- /dev/null
+++ b/xdl/docs/tutorial/v1/xdl-zk.yaml
@@ -0,0 +1,65 @@
+# Headless Service (clusterIP: None) named "zk" that selects the ZooKeeper pod
+# and exposes ports 2181, 2888 and 3888.
+kind: Service
+apiVersion: v1
+metadata:
+  name: zk
+  labels:
+    app: zk
+spec:
+  clusterIP: None
+  ports:
+    - name: port-2181
+      port: 2181
+    - name: port-2888
+      port: 2888
+    - name: port-3888
+      port: 3888
+  selector:
+    app: zk
+---
+# Single-replica ZooKeeper Deployment. Data and log directories are backed by
+# emptyDir volumes mounted under /data.
+kind: Deployment
+apiVersion: apps/v1
+metadata:
+  name: zk
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: zk
+  template:
+    metadata:
+      labels:
+        app: zk
+    spec:
+      hostname: zk
+      volumes:
+        - name: zk-data
+          emptyDir: {}
+        - name: zk-logs
+          emptyDir: {}
+      containers:
+        - name: zk
+          image: zookeeper
+          volumeMounts:
+            - name: zk-data
+              readOnly: false
+              mountPath: "/data/zk-data"
+            - name: zk-logs
+              readOnly: false
+              mountPath: "/data/zk-logs"
+          ports:
+            - containerPort: 2181
+            - containerPort: 2888
+            - containerPort: 3888
+          env:
+            - name: ZOO_MY_ID
+              value: '0'
+            - name: ZOO_SERVERS
+              value: server.0=0.0.0.0:2888:3888;2181
+            - name: ZOO_DATA_DIR
+              value: '/data/zk-data'
+            - name: ZOO_DATA_LOG_DIR
+              value: '/data/zk-logs'
\ No newline at end of file
diff --git a/xdl/docs/tutorial/xdljob.md b/xdl/docs/tutorial/xdljob.md
new file mode 100644
index 00000000..601e6b5e
--- /dev/null
+++ b/xdl/docs/tutorial/xdljob.md
@@ -0,0 +1,49 @@
+# 使用KubeDL Operator运行XDL
+
+这个教程将说明,如何在Kubernetes中运行分布式XDL训练任务。
+
+## 依赖
+
+在开始教程前,我们需要有一个完整的Kubernetes集群,同时需要在Kubernetes集群中安装上[KubeDL Operator](https://github.com/alibaba/kubedl#getting-started),并且[开启XDLJob支持](https://github.com/alibaba/kubedl#optional-enable-workload-kind-selectively)。
+
+## 安装ZooKeeper服务
+
+XDL强依赖ZooKeeper进行服务发现,所以我们需要先安装ZooKeeper服务。
+
+下面的命令将安装一个单节点的ZooKeeper服务。
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/alibaba/x-deeplearning/master/xdl/docs/tutorial/v1/xdl-zk.yaml
+```
+
+对于生产环境,可以遵循Kubernetes[官方文档](https://kubernetes.io/docs/tutorials/stateful-application/zookeeper/)来安装一个3节点ZooKeeper集群。
+
+## 运行XDLJob训练
+
+我们需要生成一个XDLJob的Yaml部署文件,并且在里面设置好ZooKeeper服务地址。对于所有XDLJob的容器,KubeDL Operator会在容器的环境变量中增加```TASK_NAME``` 和 ```TASK_INDEX```来区分每个容器的身份。同时KubeDL Operator会修改环境变量```ZK_ADDR```来加上XDLJob的UUID。
+
+下面的命令就是运行一个XDLJob。
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/alibaba/x-deeplearning/master/xdl/docs/tutorial/v1/xdl-job.yaml
+```
+
+## 查看XDLJob运行情况
+
+查看XDLJob是否正常拉起,所有的Pod是否正常运行。
+
+```bash
+kubectl get xdljob
+
+NAME                STATE     AGE   FINISHED-TTL   MAX-LIFETIME
+xdl-mnist-example   Running   70s   3600
+
+kubectl get po
+
+NAME                            READY   STATUS    RESTARTS   AGE
+xdl-mnist-example-ps-0          1/1     Running   0          116s
+xdl-mnist-example-scheduler-0   1/1     Running   0          116s
+xdl-mnist-example-worker-0      1/1     Running   0          116s
+xdl-mnist-example-worker-1      1/1     Running   0          116s
+zk-c5cc46c8d-s6bkc              1/1     Running   0          2m26s
+```