From f1996bcf57a0927688b26e4d21281f9a1c8a6e60 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 13 Nov 2017 10:26:36 +0000 Subject: [PATCH 1/4] Fix bug in MergeModel.cpp. --- paddle/trainer/MergeModel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp index f3cfd9f97fea8..56c38015fb239 100644 --- a/paddle/trainer/MergeModel.cpp +++ b/paddle/trainer/MergeModel.cpp @@ -27,6 +27,9 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT int main(int argc, char** argv) { + initMain(argc, argv); + initPython(argc, argv); + if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() || FLAGS_model_file.empty()) { LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 " @@ -34,9 +37,6 @@ int main(int argc, char** argv) { return 0; } - initMain(argc, argv); - initPython(argc, argv); - string confFile = FLAGS_config_file; #ifndef PADDLE_WITH_CUDA FLAGS_use_gpu = false; From 0cc1b6cfe5c57a44ce36f50252a898bc82ff191a Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 20 Nov 2017 05:03:03 +0000 Subject: [PATCH 2/4] Add a c-api interface to initialize the thread environment of Paddle and add a GPU example. 
--- paddle/capi/Main.cpp | 12 ++ paddle/capi/Matrix.cpp | 6 +- paddle/capi/error.h | 17 +++ .../multi_thread/CMakeLists.txt | 29 ++++- .../model_inference/multi_thread/main.c | 3 + .../model_inference/multi_thread/main_gpu.c | 106 ++++++++++++++++++ paddle/capi/main.h | 5 + paddle/capi/matrix.h | 8 +- 8 files changed, 175 insertions(+), 11 deletions(-) create mode 100644 paddle/capi/examples/model_inference/multi_thread/main_gpu.c diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp index bb8249a5511c0..85296db9d7d61 100644 --- a/paddle/capi/Main.cpp +++ b/paddle/capi/Main.cpp @@ -43,4 +43,16 @@ paddle_error paddle_init(int argc, char** argv) { isInit = true; return kPD_NO_ERROR; } + +paddle_error paddle_init_thread() { + static bool isInit = false; + if (isInit) return kPD_NO_ERROR; + + if (FLAGS_use_gpu) { + hl_init(FLAGS_gpu_id); + } + + isInit = true; + return kPD_NO_ERROR; +} } diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index d5b55e1c95f24..cbacd1fb71c14 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) { paddle_error paddle_matrix_set_row(paddle_matrix mat, uint64_t rowID, paddle_real* rowArray) { - if (mat == nullptr) return kPD_NULLPTR; + if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE; @@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat, } PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, - paddle_real* value) { + paddle_real* value) { if (mat == nullptr || value == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; @@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, } PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, - paddle_real* result) { + paddle_real* result) { if (mat == nullptr || 
result == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; diff --git a/paddle/capi/error.h b/paddle/capi/error.h index 44d8c2040d1aa..85853c202a1ff 100644 --- a/paddle/capi/error.h +++ b/paddle/capi/error.h @@ -27,4 +27,21 @@ typedef enum { kPD_UNDEFINED_ERROR = -1, } paddle_error; +static const char* paddle_error_string(paddle_error err) { + switch (err) { + case kPD_NULLPTR: + return "nullptr error"; + case kPD_OUT_OF_RANGE: + return "out of range error"; + case kPD_PROTOBUF_ERROR: + return "protobuf error"; + case kPD_NOT_SUPPORTED: + return "not supported error"; + case kPD_UNDEFINED_ERROR: + return "undefined error"; + default: + return ""; + } +} + #endif diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt index 98e411ddc02a4..2fc8debddedea 100644 --- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt +++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt @@ -1,8 +1,29 @@ project(multi_thread) cmake_minimum_required(VERSION 2.8) -aux_source_directory(. 
SRC_LIST) -add_executable(${PROJECT_NAME} ${SRC_LIST}) + find_package (Threads) + +if(NOT PADDLE_ROOT) + set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path") +endif() +if(PADDLE_ROOT) + include_directories(${PADDLE_ROOT}/include) + link_directories(${PADDLE_ROOT}/lib) +endif() + +set(CPU_SRCS main.c) +add_executable(${PROJECT_NAME} ${CPU_SRCS}) set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99) -target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared - ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${PROJECT_NAME} + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) + +find_package(CUDA QUIET) +if(CUDA_FOUND) + set(GPU_SRCS main_gpu.c) + cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS}) + set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99) + target_link_libraries(${PROJECT_NAME}_gpu + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) +endif(CUDA_FOUND) diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/capi/examples/model_inference/multi_thread/main.c index d7675cd80a52f..17eda85e64b59 100644 --- a/paddle/capi/examples/model_inference/multi_thread/main.c +++ b/paddle/capi/examples/model_inference/multi_thread/main.c @@ -10,6 +10,9 @@ pthread_mutex_t mutex; void* thread_main(void* gm_ptr) { + // Initialize the thread environment of Paddle. + CHECK(paddle_init_thread()); + paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr); paddle_arguments in_args = paddle_arguments_create_none(); // Create input matrix. 
diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c new file mode 100644 index 0000000000000..63f2a9eeb09a0 --- /dev/null +++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c @@ -0,0 +1,106 @@ +#include +#include +#include +#include "../common/common.h" + +#define CONFIG_BIN "./trainer_config.bin" +#define NUM_THREAD 4 +#define NUM_ITER 1000 + +pthread_mutex_t mutex; + +void* thread_main(void* gm_ptr) { + // Initialize the thread environment of Paddle. + CHECK(paddle_init_thread()); + + paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr); + // Create input arguments. + paddle_arguments in_args = paddle_arguments_create_none(); + // Create input matrix. + paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1, + /* size */ 784, + /* useGPU */ true); + // Create output arguments. + paddle_arguments out_args = paddle_arguments_create_none(); + // Create output matrix. + paddle_matrix prob = paddle_matrix_create_none(); + + // CPU buffer to cache the input and output. + paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real)); + paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real)); + for (int iter = 0; iter < NUM_ITER; ++iter) { + // There is only one input of this network. 
+ CHECK(paddle_arguments_resize(in_args, 1)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + + for (int i = 0; i < 784; ++i) { + cpu_input[i] = rand() / ((float)RAND_MAX); + } + CHECK(paddle_matrix_set_value(mat, cpu_input)); + + CHECK(paddle_gradient_machine_forward(machine, + in_args, + out_args, + /* isTrain */ false)); + + CHECK(paddle_arguments_get_value(out_args, 0, prob)); + CHECK(paddle_matrix_get_value(prob, cpu_output)); + + pthread_mutex_lock(&mutex); + printf("Prob: "); + for (int i = 0; i < 10; ++i) { + printf("%.2f ", cpu_output[i]); + } + printf("\n"); + pthread_mutex_unlock(&mutex); + } + + CHECK(paddle_matrix_destroy(prob)); + CHECK(paddle_arguments_destroy(out_args)); + CHECK(paddle_matrix_destroy(mat)); + CHECK(paddle_arguments_destroy(in_args)); + CHECK(paddle_gradient_machine_destroy(machine)); + + free(cpu_input); + free(cpu_output); + + return NULL; +} + +int main() { + // Initialize Paddle + char* argv[] = {"--use_gpu=True"}; + CHECK(paddle_init(1, (char**)argv)); + + // Reading config binary file. It is generated by `convert_protobin.sh` + long size; + void* buf = read_config(CONFIG_BIN, &size); + + // Create a gradient machine for inference. + paddle_gradient_machine machine; + CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); + CHECK(paddle_gradient_machine_randomize_param(machine)); + + // Loading parameter. Uncomment the following line and change the directory. 
+ // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, + // "./some_where_to_params")); + srand(time(0)); + pthread_mutex_init(&mutex, NULL); + + pthread_t threads[NUM_THREAD]; + + for (int i = 0; i < NUM_THREAD; ++i) { + paddle_gradient_machine thread_local_machine; + CHECK(paddle_gradient_machine_create_shared_param( + machine, buf, size, &thread_local_machine)); + pthread_create(&threads[i], NULL, thread_main, thread_local_machine); + } + + for (int i = 0; i < NUM_THREAD; ++i) { + pthread_join(threads[i], NULL); + } + + pthread_mutex_destroy(&mutex); + + return 0; +} diff --git a/paddle/capi/main.h b/paddle/capi/main.h index 893ebcbd58dd2..20eb36f3080fc 100644 --- a/paddle/capi/main.h +++ b/paddle/capi/main.h @@ -26,6 +26,11 @@ extern "C" { */ PD_API paddle_error paddle_init(int argc, char** argv); +/** + * Initialize the thread environment of Paddle. + */ +PD_API paddle_error paddle_init_thread(); + #ifdef __cplusplus } #endif diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h index 01b8bad2ee9f5..8cc3e0034e058 100644 --- a/paddle/capi/matrix.h +++ b/paddle/capi/matrix.h @@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat, * @note value should contain enough element of data to init the mat */ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, - paddle_real* value); + paddle_real* value); /** * @brief PDMatGetRow Get raw row buffer from matrix @@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat, paddle_real** rawRowBuffer); /** - * @brief copy data from the matrix + * @brief copy data from the matrix * @param [in] mat Target matrix - * @param [out] result pointer to store the matrix data + * @param [out] result pointer to store the matrix data * @return paddle_error * @note the space of the result should allocated before invoke this API */ PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, - paddle_real* result); + paddle_real* result); /** * @brief PDMatCreateNone 
Create None Matrix * @return From ee5df622c9cf63c0d25a794068333156a8e6e8e6 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 20 Nov 2017 07:56:52 +0000 Subject: [PATCH 3/4] Add some notes for paddle_init_thread and move the implementation of paddle_error_string into a .cpp file. --- paddle/capi/Main.cpp | 5 --- paddle/capi/error.cpp | 32 +++++++++++++++++++ paddle/capi/error.h | 22 ++++--------- .../model_inference/multi_thread/main.c | 3 -- paddle/capi/main.h | 1 + 5 files changed, 39 insertions(+), 24 deletions(-) create mode 100644 paddle/capi/error.cpp diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp index 6f932af16ded1..c038789340033 100644 --- a/paddle/capi/Main.cpp +++ b/paddle/capi/Main.cpp @@ -45,14 +45,9 @@ paddle_error paddle_init(int argc, char** argv) { } paddle_error paddle_init_thread() { - static __thread bool isInit = false; - if (isInit) return kPD_NO_ERROR; - if (FLAGS_use_gpu) { hl_init(FLAGS_gpu_id); } - - isInit = true; return kPD_NO_ERROR; } } diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp new file mode 100644 index 0000000000000..169b65f921043 --- /dev/null +++ b/paddle/capi/error.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "error.h" + +const char* paddle_error_string(paddle_error err) { + switch (err) { + case kPD_NULLPTR: + return "nullptr error"; + case kPD_OUT_OF_RANGE: + return "out of range error"; + case kPD_PROTOBUF_ERROR: + return "protobuf error"; + case kPD_NOT_SUPPORTED: + return "not supported error"; + case kPD_UNDEFINED_ERROR: + return "undefined error"; + default: + return ""; + } +} diff --git a/paddle/capi/error.h b/paddle/capi/error.h index 85853c202a1ff..9d9d0ed63a527 100644 --- a/paddle/capi/error.h +++ b/paddle/capi/error.h @@ -15,6 +15,8 @@ limitations under the License. */ #ifndef __PADDLE_CAPI_ERROR_H__ #define __PADDLE_CAPI_ERROR_H__ +#include "config.h" + /** * Error Type for Paddle API. */ @@ -27,21 +29,9 @@ typedef enum { kPD_UNDEFINED_ERROR = -1, } paddle_error; -static const char* paddle_error_string(paddle_error err) { - switch (err) { - case kPD_NULLPTR: - return "nullptr error"; - case kPD_OUT_OF_RANGE: - return "out of range error"; - case kPD_PROTOBUF_ERROR: - return "protobuf error"; - case kPD_NOT_SUPPORTED: - return "not supported error"; - case kPD_UNDEFINED_ERROR: - return "undefined error"; - default: - return ""; - } -} +/** + * Error string for Paddle API. + */ +PD_API const char* paddle_error_string(paddle_error err); #endif diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/capi/examples/model_inference/multi_thread/main.c index 17eda85e64b59..d7675cd80a52f 100644 --- a/paddle/capi/examples/model_inference/multi_thread/main.c +++ b/paddle/capi/examples/model_inference/multi_thread/main.c @@ -10,9 +10,6 @@ pthread_mutex_t mutex; void* thread_main(void* gm_ptr) { - // Initialize the thread environment of Paddle. - CHECK(paddle_init_thread()); - paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr); paddle_arguments in_args = paddle_arguments_create_none(); // Create input matrix. 
diff --git a/paddle/capi/main.h b/paddle/capi/main.h index 20eb36f3080fc..ffa4caa05a3b4 100644 --- a/paddle/capi/main.h +++ b/paddle/capi/main.h @@ -28,6 +28,7 @@ PD_API paddle_error paddle_init(int argc, char** argv); /** * Initialize the thread environment of Paddle. + * @note it is requisite for GPU runs but optional for CPU runs. */ PD_API paddle_error paddle_init_thread(); From 68f6b80acdb7ed7abf3ce56d17d759d12183c266 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Fri, 8 Dec 2017 02:44:53 +0000 Subject: [PATCH 4/4] Add some comments. --- .../examples/model_inference/multi_thread/main_gpu.c | 9 ++++++++- paddle/capi/main.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c index 63f2a9eeb09a0..6fd376e0d1a2f 100644 --- a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c +++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c @@ -9,6 +9,13 @@ pthread_mutex_t mutex; +/* + * @brief It is a simple inference example that runs multi-threads on a GPU. + * Each thread holds its own local gradient_machine but shares the same + * parameters. + * If you want to run on different GPUs, you need to launch + * multi-processes or set trainer_count > 1. + */ void* thread_main(void* gm_ptr) { // Initialize the thread environment of Paddle. CHECK(paddle_init_thread()); @@ -29,7 +36,7 @@ void* thread_main(void* gm_ptr) { paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real)); paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real)); for (int iter = 0; iter < NUM_ITER; ++iter) { - // There is only one input of this network. + // There is only one input layer of this network. 
CHECK(paddle_arguments_resize(in_args, 1)); CHECK(paddle_arguments_set_value(in_args, 0, mat)); diff --git a/paddle/capi/main.h b/paddle/capi/main.h index ffa4caa05a3b4..99c4e8428dbaa 100644 --- a/paddle/capi/main.h +++ b/paddle/capi/main.h @@ -29,6 +29,7 @@ PD_API paddle_error paddle_init(int argc, char** argv); /** * Initialize the thread environment of Paddle. * @note it is requisite for GPU runs but optional for CPU runs. + * For GPU runs, all threads will run on the same GPU devices. */ PD_API paddle_error paddle_init_thread();