PaddlePaddle · peterzhang2029 · Dec 12, 2017 · Dec 12, 2017 · Dec 12, 2017 · Dec 15, 2017
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -75,6 +75,8 @@ class ChunkEvaluator : public Evaluator {
   std::vector<Segment> labelSegments_;
   std::vector<Segment> outputSegments_;
   std::set<int> excludedChunkTypes_;
+  IVectorPtr cpuOutput_;  // host-side output ids: a copy when the output vector is on GPU, otherwise the output vector itself
+  IVectorPtr cpuLabel_;  // host-side label ids: a copy when the label vector is on GPU, otherwise the label vector itself
   mutable std::unordered_map<std::string, real> values_;
 
 public:
@@ -142,16 +144,27 @@ class ChunkEvaluator : public Evaluator {
     CHECK_EQ(arguments.size(), (size_t)2);
     IVectorPtr& output = arguments[0].ids;
     IVectorPtr& label = arguments[1].ids;
-    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
     auto sequenceStartPositions =
         arguments[1].sequenceStartPositions->getVector(false);
     CHECK_EQ(output->getSize(), label->getSize());
     CHECK(sequenceStartPositions);
     size_t numSequences = sequenceStartPositions->getSize() - 1;
     const int* starts = sequenceStartPositions->getData();
+    if (output->useGpu()) {  // eval1 is handed raw getData() pointers, so GPU ids are first copied into a non-GPU vector
+      IVector::resizeOrCreate(cpuOutput_, output->getSize(), false);
+      cpuOutput_->copyFrom(*output);
+    } else {
+      cpuOutput_ = output;  // not on GPU: share the vector, no copy. NOTE(review): the member keeps referring to this caller-provided vector after the call
+    }
+    if (label->useGpu()) {  // same staging for the label ids
+      IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+      cpuLabel_->copyFrom(*label);
+    } else {
+      cpuLabel_ = label;  // not on GPU: share the vector, no copy
+    }
     for (size_t i = 0; i < numSequences; ++i) {
-      eval1(output->getData() + starts[i],
-            label->getData() + starts[i],
+      eval1(cpuOutput_->getData() + starts[i],  // sequence i covers positions [starts[i], starts[i + 1])
+            cpuLabel_->getData() + starts[i],
             starts[i + 1] - starts[i]);
     }
     return 0;

diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp
@@ -23,42 +23,91 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
   if (!CRFLayer::init(layerMap, parameterMap)) {
     return false;
   }
-  crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
+  // With useGpu_ the CRF is built in forward() over a host copy of the
+  // parameter instead, so it is only created here for the CPU case.
+  if (!useGpu_) {
+    crf_.reset(new LinearChainCRF(
+        numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
+  }
   return true;
 }
 
+// Decodes every sequence of input 0 with crf_ and stores the decoded ids in
+// output_.ids. When a second (label) input is present, output_.value also
+// receives 0 where the decoded id equals the label and 1 otherwise.
+// crf_ is handed host pointers only: with useGpu_ the data is staged through
+// the cpu* members and the results are copied back to the layer output.
 void CRFDecodingLayer::forward(PassType passType) {
   Layer::forward(passType);
 
-  CHECK(!useGpu_) << "GPU is not supported";
-
+  if (useGpu_) {
+    // Refresh the host copy of the parameter and rebuild the CRF over it.
+    cpuParam =
+        Vector::create(parameter_->getBuf(PARAMETER_VALUE)->getSize(), false);
+    cpuParam->copyFrom(*parameter_->getBuf(PARAMETER_VALUE));
+    crf_.reset(new LinearChainCRF(numClasses_, cpuParam->getData()));
+  }
   const Argument& output = getInput(0);
   CHECK(output.sequenceStartPositions);
 
   size_t batchSize = output.getBatchSize();
   size_t numSequences = output.sequenceStartPositions->getSize() - 1;
 
   IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutputArg_,
+                           /* height */ output.value->getHeight(),
+                           /* width */ output.value->getWidth(),
+                           /* trans */ false,
+                           /* useGpu */ false);
+    IVector::resizeOrCreate(cpuOutputId_, batchSize, false);
+    cpuOutputArg_->copyFrom(*output.value);
+  } else {
+    // Already on the host: alias the layer buffers, no copies needed.
+    cpuOutputId_ = output_.ids;
+    cpuOutputArg_ = output.value;
+  }
   const int* starts = output.sequenceStartPositions->getData(false);
   CHECK_EQ(starts[numSequences], (int)batchSize);
 
   for (size_t i = 0; i < numSequences; ++i) {
-    crf_->decode(output.value->getData() + numClasses_ * starts[i],
-                 output_.ids->getData() + starts[i],
+    crf_->decode(cpuOutputArg_->getData() + numClasses_ * starts[i],
+                 cpuOutputId_->getData() + starts[i],
                  starts[i + 1] - starts[i]);
   }
 
   if (inputLayers_.size() == 2) {
     const Argument& label = getInput(1);
     resizeOutput(batchSize, 1);
     CHECK(label.ids);
-    real* error = output_.value->getData();
-    int* ids = label.ids->getData();
-    int* result = output_.ids->getData();
+    if (useGpu_) {
+      // output_.value was just resized to batchSize x 1 and every one of
+      // those entries is written below, so it is not copied in first.
+      Matrix::resizeOrCreate(cpuOutput_,
+                             /* height */ output_.value->getHeight(),
+                             /* width */ output_.value->getWidth(),
+                             /* trans */ false,
+                             /* useGpu */ false);
+      IVector::resizeOrCreate(cpuLabel_, label.ids->getSize(), false);
+      cpuLabel_->copyFrom(*label.ids);
+    } else {
+      cpuOutput_ = output_.value;
+      cpuLabel_ = label.ids;
+    }
+    real* error = cpuOutput_->getData();
+    int* ids = cpuLabel_->getData();
+    int* result = cpuOutputId_->getData();
     for (size_t i = 0; i < batchSize; ++i) {
       error[i] = ids[i] == result[i] ? 0 : 1;
     }
+    if (useGpu_) {
+      output_.value->copyFrom(*cpuOutput_);
+    }
+  }
+  if (useGpu_) {
+    // Only the decoded ids go back to the device; the const input is not
+    // rewritten. NOTE(review): assumes crf_->decode() only reads its input.
+    output_.ids->copyFrom(*cpuOutputId_);
   }
 }
 

diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -39,6 +39,12 @@ class CRFDecodingLayer : public CRFLayer {
 
 protected:
   std::unique_ptr<LinearChainCRF> crf_;
+  // Host-side buffers used by forward(). NOTE(review): the base class CRFLayer also declares cpuOutputArg_, cpuOutput_ and cpuLabel_ in this patch; the redeclarations here hide them - confirm this is intended.
+  MatrixPtr cpuOutputArg_;  // value of input 0: a host copy with useGpu_, otherwise the input matrix itself
+  MatrixPtr cpuOutput_;  // per-position 0/1 error; only set when a label input is present
+  IVectorPtr cpuLabel_;  // label ids; only set when a label input is present
+  IVectorPtr cpuOutputId_;  // decoded ids: a host buffer with useGpu_, otherwise output_.ids itself
+  VectorPtr cpuParam;  // host copy of the parameter value, only set with useGpu_. NOTE(review): lacks the trailing underscore used by the other members
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
@@ -54,8 +54,6 @@ bool CRFLayer::init(const LayerMap& layerMap,
 void CRFLayer::forward(PassType passType) {
   Layer::forward(passType);
 
-  CHECK(!useGpu_) << "GPU is not supported";
-
   const Argument& output = getInput(0);
   const Argument& label = getInput(1);
   CHECK(label.sequenceStartPositions);
@@ -68,16 +66,53 @@ void CRFLayer::forward(PassType passType) {
   const int* starts = label.sequenceStartPositions->getData(false);
   CHECK_EQ(starts[numSequences], batchSize);
 
+  // The CRFs are handed host pointers: with useGpu_ everything is staged
+  // into host buffers, otherwise cpu* just alias the layer's own buffers.
+  MatrixPtr weight_val = weight_->getW();
+  MatrixPtr output_val = output_.value;
+  MatrixPtr output_arg_val = output.value;
+  IVectorPtr label_val = label.ids;
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuWeight_,
+                           /* height */ weight_val->getHeight(),
+                           /* width */ weight_val->getWidth(),
+                           /* trans */ false,
+                           /* useGpu */ false);
+    Matrix::resizeOrCreate(cpuOutput_,
+                           /* height */ output_val->getHeight(),
+                           /* width */ output_val->getWidth(),
+                           /* trans */ false,
+                           /* useGpu */ false);
+    Matrix::resizeOrCreate(cpuOutputArg_,
+                           /* height */ output_arg_val->getHeight(),
+                           /* width */ output_arg_val->getWidth(),
+                           /* trans */ false,
+                           /* useGpu */ false);
+    IVector::resizeOrCreate(cpuLabel_, label_val->getSize(), false);
+    cpuWeight_->copyFrom(*weight_val);
+    cpuOutputArg_->copyFrom(*output_arg_val);
+    cpuOutput_->copyFrom(*output_val);
+    cpuLabel_->copyFrom(*label_val);
+  } else {
+    cpuWeight_ = weight_val;
+    cpuOutputArg_ = output_arg_val;
+    cpuOutput_ = output_val;
+    cpuLabel_ = label_val;
+  }
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
+      crfs_.emplace_back(numClasses_, cpuWeight_->getData());
     }
-    output_.value->getData()[i] =
-        crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
-                         label.ids->getData() + starts[i],
+    cpuOutput_->getData()[i] =
+        crfs_[i].forward(cpuOutputArg_->getData() + numClasses_ * starts[i],
+                         cpuLabel_->getData() + starts[i],
                          starts[i + 1] - starts[i]);
   }
-
+  if (useGpu_) {
+    // Only the output is copied back; the const input is not rewritten.
+    // NOTE(review): assumes crfs_[i].forward() only reads its input.
+    output_val->copyFrom(*cpuOutput_);
+  }
   if (weightLayer_) {
     const MatrixPtr& weight = getInputValue(*weightLayer_);
     getOutputValue()->dotMul(*getOutputValue(), *weight);
@@ -91,9 +126,42 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   int numSequences = label.sequenceStartPositions->getSize() - 1;
 
   bool needWGrad = weight_->getWGrad() ? true : false;
+  MatrixPtr output_arg_grad = output.grad;  // may be null; checked before each use below
+  MatrixPtr weight_grad = weight_->getWGrad();  // may be null; only used when needWGrad is true
+  MatrixPtr output_arg_val = output.value;
+  IVectorPtr label_val = label.ids;
+  if (useGpu_) {  // stage the values and any existing gradients into host buffers for the CRFs
+    cpuOutputArg_->copyFrom(*output_arg_val);  // cpuOutputArg_ and cpuLabel_ are sized in forward()
+    cpuLabel_->copyFrom(*label_val);
+    if (output_arg_grad) {
+      Matrix::resizeOrCreate(cpuOutputArgGrad_,
+                             /* height */ output_arg_grad->getHeight(),
+                             /* width */ output_arg_grad->getWidth(),
+                             /* trans */ false,
+                             /* useGpu */ false);
+      cpuOutputArgGrad_->copyFrom(*output_arg_grad);  // gradients are accumulated with add() below, so start from the current values
+    }
+    if (needWGrad) {
+      Matrix::resizeOrCreate(cpuWeightGrad_,
+                             /* height */ weight_grad->getHeight(),
+                             /* width */ weight_grad->getWidth(),
+                             /* trans */ false,
+                             /* useGpu */ false);
+      cpuWeightGrad_->copyFrom(*weight_grad);  // same: accumulation starts from the current weight gradient
+    }
+  } else {  // CPU: the cpu* members refer to the layer's own matrices, so the CRFs read and accumulate in place
+    cpuOutputArg_ = output_arg_val;
+    cpuLabel_ = label_val;
+    if (output_arg_grad) {
+      cpuOutputArgGrad_ = output_arg_grad;
+    }
+    if (needWGrad) {
+      cpuWeightGrad_ = weight_grad;
+    }
+  }
   for (int i = 0; i < numSequences; ++i) {
-    crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      label.ids->getData() + starts[i],
+    crfs_[i].backward(cpuOutputArg_->getData() + numClasses_ * starts[i],  // rows [starts[i], starts[i + 1]) belong to sequence i
+                      cpuLabel_->getData() + starts[i],
                       starts[i + 1] - starts[i],
                       needWGrad);
     real instanceWeight = weightLayer_
@@ -102,13 +170,24 @@ void CRFLayer::backward(const UpdateCallback& callback) {
     instanceWeight *= coeff_;
 
     if (output.grad) {
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+      MatrixPtr grad =
+          cpuOutputArgGrad_->subRowMatrix(starts[i], starts[i + 1]);
       grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
     }
     if (needWGrad) {
-      weight_->getWGrad()->add(
-          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
+      cpuWeightGrad_->add(*crfs_[i].getWGrad(), real(1.0f), instanceWeight);
+    }
+  }
+  if (useGpu_) {
+    // Copy the accumulated gradients back to the device. The input value is
+    // not copied back: it is only read in this function.
+    // NOTE(review): assumes crfs_[i].backward() does not modify its input.
+    if (output.grad) {
+      output_arg_grad->copyFrom(*cpuOutputArgGrad_);
+    }
+    if (needWGrad) {
+      weight_grad->copyFrom(*cpuWeightGrad_);
     }
   }
 
   parameter_->incUpdate(callback);

diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
@@ -41,6 +41,14 @@ class CRFLayer : public Layer {
   LayerPtr weightLayer_;            // weight for each sequence
   std::unique_ptr<Weight> weight_;  // parameters
   real coeff_;                      // weight for the layer
+
+  // Host-side buffers handed to the per-sequence CRFs: copies of the device data when useGpu_ is set, otherwise the layer's own matrices/vectors.
+  MatrixPtr cpuWeight_;  // weight_->getW()
+  MatrixPtr cpuOutputArg_;  // value of input 0
+  MatrixPtr cpuOutput_;  // output_.value
+  MatrixPtr cpuWeightGrad_;  // weight_->getWGrad(); set in backward() only when a weight gradient exists
+  MatrixPtr cpuOutputArgGrad_;  // gradient of input 0; set in backward() only when it exists
+  IVectorPtr cpuLabel_;  // ids of input 1 (the label)
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -130,34 +130,36 @@ TestConfig initTestConfig(size_t numClasses, bool withWeight) {
 
 TEST(Layer, CRFLayer) {
   size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
+  for (auto useGpu : {false, true}) {  // run the gradient check once without and once with useGpu
+    for (int tries = 0; tries < 5; ++tries) {  // a fresh config is built for every try
+      TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+      for (int length : {1, 3, 100}) {
+        testLayerGrad(config,
+                      "crf",
+                      length,
+                      /* trans= */ false,
+                      /* useGpu= */ useGpu,
+                      /* useWeight= */ false,
+                      epsilon());
+      }
     }
   }
 }
 
 TEST(Layer, CRFLayerUseWeight) {
   size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
+  for (auto useGpu : {false, true}) {  // same two-pass loop as the test above, with the config built using withWeight=true
+    for (int tries = 0; tries < 5; ++tries) {
+      TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+      for (int length : {1, 3, 100}) {
+        testLayerGrad(config,
+                      "crf",
+                      length,
+                      /* trans= */ false,
+                      /* useGpu= */ useGpu,
+                      /* useWeight= */ false,
+                      epsilon());
+      }
     }
   }
 }