From 8ff3a456b9f2d7ed57c8de04d163378a81daf3a0 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Fri, 29 Sep 2017 23:21:37 -0700 Subject: [PATCH 1/9] Implementing the Adamax optimizer step operator --- paddle/operators/adamax_op.cc | 107 ++++++++++++++++++++++++++++++++++ paddle/operators/adamax_op.cu | 20 +++++++ paddle/operators/adamax_op.h | 64 ++++++++++++++++++++ 3 files changed, 191 insertions(+) create mode 100644 paddle/operators/adamax_op.cc create mode 100644 paddle/operators/adamax_op.cu create mode 100644 paddle/operators/adamax_op.h diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc new file mode 100644 index 0000000000000..4e23735b48f3e --- /dev/null +++ b/paddle/operators/adamax_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/adamax_op.h" + +namespace paddle { +namespace operators { + +class AdamaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContextBase *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("param"), + "Input(param) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("grad"), + "Input(grad) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("moment"), + "Input(moment) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("inf_norm"), + "Input(inf_norm) of AdamaxOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("param_out"), + "Output(param_out) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("moment_out"), + "Output(moment_out) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("inf_norm_out"), + "Output(inf_norm_out) of AdamaxOp should not be null."); + + auto param_dim = ctx->GetInputDim("param"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("grad"), + "param and grad input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("moment"), + "param and moment input of AdamaxOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("inf_norm"), + "param and inf_norm input of AdamaxOp should have same dimension"); + + ctx->SetOutputDim("param_out", param_dim); + ctx->SetOutputDim("moment_out", param_dim); + ctx->SetOutputDim("inf_norm_out", param_dim); + } +}; + +class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("param", "Input parameter"); + AddInput("grad", "Input gradient"); + AddInput("moment", "First moment"); + AddInput("inf_norm", "Input exponentially weighted infinity norm"); + + AddOutput("param_out", "Output parameter"); + AddOutput("moment_out", "Output first moment"); + AddOutput("inf_norm_out", "Output exponentially weighted infinity norm"); + + AddAttr("time_step", "Time step"); + 
AddAttr("learning_rate", "Learning rate"); + AddAttr("beta_1", + "exponential decay rate for the 1st moment estimates."); + AddAttr( + "beta_2", + "exponential decay rate for the weighted infinity norm estimates."); + AddAttr("epsilon", "Constant for numerical stability"); + AddComment(R"DOC( +Adamax Updates Operator. + +This implements the Adamax optimizer from Section 7 of the Adam +paper(https://arxiv.org/abs/1412.6980). Adamax is a variant of the +Adam algorithm based on the infinity norm. + +Adamax updates: + +moment_out = beta_1 * moment + (1 - beta_1) * grad +inf_norm_out = max(beta_2 * inf_norm + epsilon, abs(grad)) +param_out = param - (learning_rate/(1 - beta_1^t)) * moment_out/inf_norm_out + +The original paper(https://arxiv.org/abs/1412.6980) does not have an +epsilon attribute. However, it is added here for numerical stability +by preventing divide by 0. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); +REGISTER_OP_CPU_KERNEL(adamax, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu new file mode 100644 index 0000000000000..fee3b6fc6b656 --- /dev/null +++ b/paddle/operators/adamax_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/adamax_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(adamax, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h new file mode 100644 index 0000000000000..9b70a9b951911 --- /dev/null +++ b/paddle/operators/adamax_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +class AdamaxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("param_out"); + auto moment_out = ctx.Output("moment_out"); + auto norm_out = ctx.Output("inf_norm_out"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + norm_out->mutable_data(ctx.GetPlace()); + + float lr = ctx.Attr("learning_rate"); + float beta_1 = ctx.Attr("beta_1"); + float beta_2 = ctx.Attr("beta_2"); + float epsilon = ctx.Attr("epsilon"); + int t = ctx.Attr("time_step"); + + auto p = EigenVector::Flatten(*ctx.Input("param")); + auto g = EigenVector::Flatten(*ctx.Input("grad")); + auto m = EigenVector::Flatten(*ctx.Input("moment")); + auto u = EigenVector::Flatten(*ctx.Input("inf_norm")); + auto p_out = EigenVector::Flatten(*param_out); + auto m_out = EigenVector::Flatten(*moment_out); + auto u_out = EigenVector::Flatten(*norm_out); + auto place = ctx.GetEigenDevice(); + + m_out.device(place) = beta_1 * m + (1 - beta_1) * g; + u_out.device(place) = g.abs().cwiseMax((beta_2 * u) + epsilon); + + float lr_t = lr / (1 - std::pow(beta_1, t)); + p_out.device(place) = p - lr_t * (m_out / u_out); + } +}; + +} // namespace operators +} // namespace paddle From be9986814fd9f52bf5dddd3ca8b3e8423da87af1 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Sun, 1 Oct 2017 21:01:03 -0700 Subject: [PATCH 2/9] Adding unit tests for adamax_op --- .../v2/framework/tests/test_adamax_op.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 python/paddle/v2/framework/tests/test_adamax_op.py diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py new file mode 100644 index 0000000000000..fbc92b9cd498e --- /dev/null +++ b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -0,0 +1,52 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestAdamaxOp(OpTest): + def setUp(self): + self.op_type = "adamax" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The infinity norm is positive + inf_norm = np.random.random((102, 105)).astype("float32") + + time_step = 9 + learning_rate = 0.002 + beta_1 = 0.9 + beta_2 = 0.999 + epsilon = 1e-8 + + self.inputs = { + 'param': param, + 'grad': grad, + 'moment': moment, + 'inf_norm': inf_norm + } + + self.attrs = { + 'time_step': time_step, + 'learning_rate': learning_rate, + 'beta_1': beta_1, + 'beta_2': beta_2, + 'epsilon': epsilon + } + + moment_out = beta_1 * moment + (1 - beta_1) * grad + inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) + lr_t = (learning_rate / (1 - beta_1**time_step)) + param_out = param - lr_t * np.divide(moment_out, inf_norm_out) + + self.outputs = { + 'param_out': param_out, + 'moment_out': moment_out, + 'inf_norm_out': inf_norm_out + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() From abd618161af0a0743b99e6eac3bc4b75391140df Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Mon, 2 Oct 2017 21:01:54 -0700 Subject: [PATCH 3/9] Changing learning rate and time step 
to inputs from attributes --- paddle/operators/adamax_op.cc | 8 ++++++-- paddle/operators/adamax_op.h | 4 ++-- python/paddle/v2/framework/tests/test_adamax_op.py | 12 ++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 4e23735b48f3e..b5283b96ed19f 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -31,6 +31,10 @@ class AdamaxOp : public framework::OperatorWithKernel { "Input(moment) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("inf_norm"), "Input(inf_norm) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("learning_rate"), + "Input(learning_rate) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("time_step"), + "Input(time_step) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("param_out"), "Output(param_out) of AdamaxOp should not be null."); @@ -62,15 +66,15 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("param", "Input parameter"); AddInput("grad", "Input gradient"); + AddInput("learning_rate", "Learning rate"); AddInput("moment", "First moment"); AddInput("inf_norm", "Input exponentially weighted infinity norm"); + AddInput("time_step", "Time step"); AddOutput("param_out", "Output parameter"); AddOutput("moment_out", "Output first moment"); AddOutput("inf_norm_out", "Output exponentially weighted infinity norm"); - AddAttr("time_step", "Time step"); - AddAttr("learning_rate", "Learning rate"); AddAttr("beta_1", "exponential decay rate for the 1st moment estimates."); AddAttr( diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index 9b70a9b951911..0d2c2af565db5 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -37,11 +37,11 @@ class AdamaxOpKernel : public framework::OpKernel { moment_out->mutable_data(ctx.GetPlace()); norm_out->mutable_data(ctx.GetPlace()); - float lr = ctx.Attr("learning_rate"); float beta_1 = ctx.Attr("beta_1"); float beta_2 = ctx.Attr("beta_2"); float epsilon = ctx.Attr("epsilon"); - int t = ctx.Attr("time_step"); + float lr = *ctx.Input("learning_rate"); + int t = *ctx.Input("time_step"); auto p = EigenVector::Flatten(*ctx.Input("param")); auto g = EigenVector::Flatten(*ctx.Input("grad")); diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py index fbc92b9cd498e..d18e93d1d059b 100644 --- a/python/paddle/v2/framework/tests/test_adamax_op.py +++ b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -22,17 +22,13 @@ def setUp(self): 'param': param, 'grad': grad, 'moment': moment, - 'inf_norm': inf_norm - } - - self.attrs = { + 'inf_norm': inf_norm, 'time_step': time_step, - 'learning_rate': learning_rate, - 'beta_1': beta_1, - 'beta_2': beta_2, - 'epsilon': epsilon + 'learning_rate': learning_rate } + self.attrs = {'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon} + moment_out = beta_1 * moment + (1 - beta_1) * grad inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) lr_t = (learning_rate / (1 - beta_1**time_step)) From 0b98f7d8d3a710f566e5f2ce1c442b20a88e3e3e Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 4 Oct 2017 09:30:42 -0700 Subject: [PATCH 4/9] Changing learning rate and time step to input(tensors) --- paddle/operators/adamax_op.cc | 12 ++++++++++++ paddle/operators/adamax_op.h | 4 ++-- python/paddle/v2/framework/tests/test_adamax_op.py | 4 ++-- 3 
files changed, 16 insertions(+), 4 deletions(-) diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index b5283b96ed19f..74a05331c6164 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -43,6 +43,12 @@ class AdamaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("inf_norm_out"), "Output(inf_norm_out) of AdamaxOp should not be null."); + auto lr_dims = ctx->GetInputDim("learning_rate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto t_dims = ctx->GetInputDim("time_step"); + PADDLE_ENFORCE_EQ(framework::product(t_dims), 1, + "Time step should have 1 dimension"); auto param_dim = ctx->GetInputDim("param"); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("grad"), @@ -58,6 +64,12 @@ class AdamaxOp : public framework::OperatorWithKernel { ctx->SetOutputDim("moment_out", param_dim); ctx->SetOutputDim("inf_norm_out", param_dim); } + + // Datatype of operator is determined by Param tensor + framework::DataType IndicateDataType( + const framework::ExecutionContext &ctx) const override { + return framework::ToDataType(ctx.Input("param")->type()); + } }; class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index 0d2c2af565db5..d383d26caeb3a 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -40,8 +40,8 @@ class AdamaxOpKernel : public framework::OpKernel { float beta_1 = ctx.Attr("beta_1"); float beta_2 = ctx.Attr("beta_2"); float epsilon = ctx.Attr("epsilon"); - float lr = *ctx.Input("learning_rate"); - int t = *ctx.Input("time_step"); + auto lr = ctx.Input("learning_rate")->data()[0]; + auto t = ctx.Input("time_step")->data()[0]; auto p = EigenVector::Flatten(*ctx.Input("param")); auto g = EigenVector::Flatten(*ctx.Input("grad")); diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py index d18e93d1d059b..cfcffb013b767 100644 --- a/python/paddle/v2/framework/tests/test_adamax_op.py +++ b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -23,8 +23,8 @@ def setUp(self): 'grad': grad, 'moment': moment, 'inf_norm': inf_norm, - 'time_step': time_step, - 'learning_rate': learning_rate + 'time_step': np.array([time_step]).astype("int32"), + 'learning_rate': np.array([learning_rate]).astype("float32") } self.attrs = {'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon} From 1a4dda00bf2cdfdb660f58861100a08bd8070338 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 4 Oct 2017 10:14:28 -0700 Subject: [PATCH 5/9] Making the Adamax operator conform to naming convention --- paddle/operators/adamax_op.cc | 123 ++++++++++-------- paddle/operators/adamax_op.h | 22 ++-- .../v2/framework/tests/test_adamax_op.py | 64 +++++++-- 3 files changed, 134 insertions(+), 75 deletions(-) diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 74a05331c6164..bff445de6f56f 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -23,52 +23,52 @@ class AdamaxOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContextBase *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("param"), - "Input(param) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("grad"), - "Input(grad) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("moment"), - "Input(moment) of AdamaxOp should not be null."); - 
PADDLE_ENFORCE(ctx->HasInput("inf_norm"), - "Input(inf_norm) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("learning_rate"), - "Input(learning_rate) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("time_step"), - "Input(time_step) of AdamaxOp should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("param_out"), - "Output(param_out) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("moment_out"), - "Output(moment_out) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("inf_norm_out"), - "Output(inf_norm_out) of AdamaxOp should not be null."); - - auto lr_dims = ctx->GetInputDim("learning_rate"); + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InfNorm"), + "Input(InfNorm) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TimeStep"), + "Input(TimeStep) of AdamaxOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"), + "Output(InfNormOut) of AdamaxOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 dimension"); - auto t_dims = ctx->GetInputDim("time_step"); + auto t_dims = ctx->GetInputDim("TimeStep"); PADDLE_ENFORCE_EQ(framework::product(t_dims), 1, "Time step should have 1 dimension"); - auto param_dim = ctx->GetInputDim("param"); + auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("grad"), - "param and grad input of AdamaxOp should have same dimension"); + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamaxOp should have same dimension"); PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("moment"), - "param and moment input of AdamaxOp should have same dimension"); + param_dim, ctx->GetInputDim("Moment"), + "Param and Moment input of AdamaxOp should have same dimension"); PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("inf_norm"), - "param and inf_norm input of AdamaxOp should have same dimension"); + param_dim, ctx->GetInputDim("InfNorm"), + "Param and InfNorm input of AdamaxOp should have same dimension"); - ctx->SetOutputDim("param_out", param_dim); - ctx->SetOutputDim("moment_out", param_dim); - ctx->SetOutputDim("inf_norm_out", param_dim); + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("MomentOut", param_dim); + ctx->SetOutputDim("InfNormOut", param_dim); } // Datatype of operator is determined by Param tensor framework::DataType IndicateDataType( const framework::ExecutionContext &ctx) const override { - return framework::ToDataType(ctx.Input("param")->type()); + return framework::ToDataType(ctx.Input("Param")->type()); } }; @@ -76,28 +76,41 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { public: AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("param", "Input parameter"); - AddInput("grad", "Input gradient"); - 
AddInput("learning_rate", "Learning rate"); - AddInput("moment", "First moment"); - AddInput("inf_norm", "Input exponentially weighted infinity norm"); - AddInput("time_step", "Time step"); - - AddOutput("param_out", "Output parameter"); - AddOutput("moment_out", "Output first moment"); - AddOutput("inf_norm_out", "Output exponentially weighted infinity norm"); - - AddAttr("beta_1", - "exponential decay rate for the 1st moment estimates."); - AddAttr( - "beta_2", - "exponential decay rate for the weighted infinity norm estimates."); - AddAttr("epsilon", "Constant for numerical stability"); + AddInput("Param", "(Tensor, default Tensor) Input parameter"); + AddInput("Grad", "(Tensor, default Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor, default Tensor) Learning rate"); + AddInput("Moment", "(Tensor, default Tensor) First moment"); + AddInput("InfNorm", + "(Tensor, default Tensor) " + "Input exponentially weighted infinity norm"); + AddInput("TimeStep", "(Tensor, default Tensor) Time step"); + + AddOutput("ParamOut", "(Tensor, default Tensor) Output parameter"); + AddOutput("MomentOut", + "(Tensor, default Tensor) Output first moment"); + AddOutput("InfNormOut", + "(Tensor, default Tensor) " + "Output exponentially weighted infinity norm"); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "1st moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the weighted " + "infinity norm estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); AddComment(R"DOC( Adamax Updates Operator. This implements the Adamax optimizer from Section 7 of the Adam -paper(https://arxiv.org/abs/1412.6980). Adamax is a variant of the +paper[1]. Adamax is a variant of the Adam algorithm based on the infinity norm. Adamax updates: @@ -106,10 +119,14 @@ moment_out = beta_1 * moment + (1 - beta_1) * grad inf_norm_out = max(beta_2 * inf_norm + epsilon, abs(grad)) param_out = param - (learning_rate/(1 - beta_1^t)) * moment_out/inf_norm_out -The original paper(https://arxiv.org/abs/1412.6980) does not have an -epsilon attribute. However, it is added here for numerical stability +The original paper does not have an epsilon attribute. +However, it is added here for numerical stability by preventing divide by 0. 
+References: + [1] Adam: A Method for Stochastic Optimization + (https://arxiv.org/abs/1412.6980) + )DOC"); } }; diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index d383d26caeb3a..2be6fc5016b53 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -29,24 +29,24 @@ template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out = ctx.Output("param_out"); - auto moment_out = ctx.Output("moment_out"); - auto norm_out = ctx.Output("inf_norm_out"); + auto param_out = ctx.Output("ParamOut"); + auto moment_out = ctx.Output("MomentOut"); + auto norm_out = ctx.Output("InfNormOut"); param_out->mutable_data(ctx.GetPlace()); moment_out->mutable_data(ctx.GetPlace()); norm_out->mutable_data(ctx.GetPlace()); - float beta_1 = ctx.Attr("beta_1"); - float beta_2 = ctx.Attr("beta_2"); + float beta_1 = ctx.Attr("beta1"); + float beta_2 = ctx.Attr("beta2"); float epsilon = ctx.Attr("epsilon"); - auto lr = ctx.Input("learning_rate")->data()[0]; - auto t = ctx.Input("time_step")->data()[0]; + auto lr = ctx.Input("LearningRate")->data()[0]; + auto t = ctx.Input("TimeStep")->data()[0]; - auto p = EigenVector::Flatten(*ctx.Input("param")); - auto g = EigenVector::Flatten(*ctx.Input("grad")); - auto m = EigenVector::Flatten(*ctx.Input("moment")); - auto u = EigenVector::Flatten(*ctx.Input("inf_norm")); + auto p = EigenVector::Flatten(*ctx.Input("Param")); + auto g = EigenVector::Flatten(*ctx.Input("Grad")); + auto m = EigenVector::Flatten(*ctx.Input("Moment")); + auto u = EigenVector::Flatten(*ctx.Input("InfNorm")); auto p_out = EigenVector::Flatten(*param_out); auto m_out = EigenVector::Flatten(*moment_out); auto u_out = EigenVector::Flatten(*norm_out); diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py index cfcffb013b767..7b2d9d1360e25 100644 --- a/python/paddle/v2/framework/tests/test_adamax_op.py +++ b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -3,7 +3,7 @@ from op_test import OpTest -class TestAdamaxOp(OpTest): +class TestAdamaxOp1(OpTest): def setUp(self): self.op_type = "adamax" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -19,15 +19,15 @@ def setUp(self): epsilon = 1e-8 self.inputs = { - 'param': param, - 'grad': grad, - 'moment': moment, - 'inf_norm': inf_norm, - 'time_step': np.array([time_step]).astype("int32"), - 'learning_rate': np.array([learning_rate]).astype("float32") + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'InfNorm': inf_norm, + 'TimeStep': np.array([time_step]).astype("int32"), + 'LearningRate': np.array([learning_rate]).astype("float32") } - self.attrs = {'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon} + self.attrs = {'beta1': beta_1, 'beta2': beta_2, 'epsilon': epsilon} moment_out = beta_1 * moment + (1 - beta_1) * grad inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) @@ -35,9 +35,51 @@ def setUp(self): param_out = param - lr_t * np.divide(moment_out, inf_norm_out) self.outputs = { - 'param_out': param_out, - 'moment_out': moment_out, - 'inf_norm_out': inf_norm_out + 'ParamOut': param_out, + 'MomentOut': moment_out, + 'InfNormOut': inf_norm_out + } + + def test_check_output(self): + self.check_output() + + +class TestAdamaxOp2(OpTest): + '''Test Adamax operator with default attributes + ''' + + def setUp(self): + self.op_type = "adamax" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = 
np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The infinity norm is positive + inf_norm = np.random.random((102, 105)).astype("float32") + + time_step = 9 + learning_rate = 0.002 + beta_1 = 0.9 + beta_2 = 0.999 + epsilon = 1e-8 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'InfNorm': inf_norm, + 'TimeStep': np.array([time_step]).astype("int32"), + 'LearningRate': np.array([learning_rate]).astype("float32") + } + + moment_out = beta_1 * moment + (1 - beta_1) * grad + inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) + lr_t = (learning_rate / (1 - beta_1**time_step)) + param_out = param - lr_t * np.divide(moment_out, inf_norm_out) + + self.outputs = { + 'ParamOut': param_out, + 'MomentOut': moment_out, + 'InfNormOut': inf_norm_out } def test_check_output(self): From 2e39197dae900a574ec9e7f086754dba65c893e2 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 4 Oct 2017 11:16:11 -0700 Subject: [PATCH 6/9] Removing Tensor from comments --- paddle/operators/adamax_op.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index bff445de6f56f..0ccc4d009ec33 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -76,20 +76,19 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { public: AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Param", "(Tensor, default Tensor) Input parameter"); - AddInput("Grad", "(Tensor, default Tensor) Input gradient"); - AddInput("LearningRate", "(Tensor, default Tensor) Learning rate"); - AddInput("Moment", "(Tensor, default Tensor) First moment"); + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment", "(Tensor) First moment"); AddInput("InfNorm", - "(Tensor, default Tensor) " + "(Tensor) " "Input exponentially weighted infinity norm"); - AddInput("TimeStep", "(Tensor, default Tensor) Time step"); + AddInput("TimeStep", "(Tensor) Time step"); - AddOutput("ParamOut", "(Tensor, default Tensor) Output parameter"); - AddOutput("MomentOut", - "(Tensor, default Tensor) Output first moment"); + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output first moment"); AddOutput("InfNormOut", - "(Tensor, default Tensor) " + "(Tensor) " "Output exponentially weighted infinity norm"); AddAttr("beta1", From 7c921a8a3a8be4cd71ae316fa1fda489dd12da73 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 5 Oct 2017 13:42:07 -0700 Subject: [PATCH 7/9] Rectifying the Adamax implementation --- paddle/operators/adamax_op.cc | 44 ++++++------- paddle/operators/adamax_op.h | 64 +++++++++++-------- .../v2/framework/tests/test_adamax_op.py | 28 ++++---- 3 files changed, 75 insertions(+), 61 deletions(-) diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 0ccc4d009ec33..c348e0a0b2ba1 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -33,8 +33,8 @@ class AdamaxOp : public framework::OperatorWithKernel { "Input(InfNorm) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of AdamaxOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("TimeStep"), - "Input(TimeStep) of AdamaxOp 
should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdamaxOp should not be null."); @@ -42,33 +42,30 @@ class AdamaxOp : public framework::OperatorWithKernel { "Output(MomentOut) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"), "Output(InfNormOut) of AdamaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"), + "Output(Beta1PowOut) of AdamaxOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 dimension"); - auto t_dims = ctx->GetInputDim("TimeStep"); - PADDLE_ENFORCE_EQ(framework::product(t_dims), 1, - "Time step should have 1 dimension"); - auto param_dim = ctx->GetInputDim("Param"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto param_dims = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), + param_dims, ctx->GetInputDim("Grad"), "Param and Grad input of AdamaxOp should have same dimension"); PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Moment"), + param_dims, ctx->GetInputDim("Moment"), "Param and Moment input of AdamaxOp should have same dimension"); PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("InfNorm"), + param_dims, ctx->GetInputDim("InfNorm"), "Param and InfNorm input of AdamaxOp should have same dimension"); - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("MomentOut", param_dim); - ctx->SetOutputDim("InfNormOut", param_dim); - } - - // Datatype of operator is determined by Param tensor - framework::DataType IndicateDataType( - const framework::ExecutionContext &ctx) const override { - return framework::ToDataType(ctx.Input("Param")->type()); + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + ctx->SetOutputDim("InfNormOut", param_dims); + ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims); } }; @@ -83,13 +80,14 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("InfNorm", "(Tensor) " "Input exponentially weighted infinity norm"); - AddInput("TimeStep", "(Tensor) Time step"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("MomentOut", "(Tensor) Output first moment"); AddOutput("InfNormOut", "(Tensor) " "Output exponentially weighted infinity norm"); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); AddAttr("beta1", "(float, default 0.9) " @@ -114,9 +112,11 @@ Adam algorithm based on the infinity norm. Adamax updates: -moment_out = beta_1 * moment + (1 - beta_1) * grad -inf_norm_out = max(beta_2 * inf_norm + epsilon, abs(grad)) -param_out = param - (learning_rate/(1 - beta_1^t)) * moment_out/inf_norm_out +moment_out = beta1 * moment + (1 - beta1) * grad +inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad)) +beta1_pow_out = beta1_pow * beta1 +learning_rate_t = learning_rate/(1 - beta1_pow_out) +param_out = param - learning_rate_t * moment_out/inf_norm_out The original paper does not have an epsilon attribute. 
However, it is added here for numerical stability diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index 2be6fc5016b53..9677b1bb78600 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -19,44 +19,52 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - -template -using EigenVector = framework::EigenVector; - template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out = ctx.Output("ParamOut"); - auto moment_out = ctx.Output("MomentOut"); - auto norm_out = ctx.Output("InfNormOut"); + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + auto inf_norm_out_tensor = ctx.Output("InfNormOut"); + auto beta1_pow_out_tensor = ctx.Output("Beta1PowOut"); - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - norm_out->mutable_data(ctx.GetPlace()); + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + inf_norm_out_tensor->mutable_data(ctx.GetPlace()); + beta1_pow_out_tensor->mutable_data(ctx.GetPlace()); - float beta_1 = ctx.Attr("beta1"); - float beta_2 = ctx.Attr("beta2"); + float beta1 = ctx.Attr("beta1"); + float beta2 = ctx.Attr("beta2"); float epsilon = ctx.Attr("epsilon"); - auto lr = ctx.Input("LearningRate")->data()[0]; - auto t = ctx.Input("TimeStep")->data()[0]; - auto p = EigenVector::Flatten(*ctx.Input("Param")); - auto g = EigenVector::Flatten(*ctx.Input("Grad")); - auto m = EigenVector::Flatten(*ctx.Input("Moment")); - auto u = EigenVector::Flatten(*ctx.Input("InfNorm")); - auto p_out = EigenVector::Flatten(*param_out); - auto m_out = EigenVector::Flatten(*moment_out); - auto u_out = EigenVector::Flatten(*norm_out); + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto inf_norm = framework::EigenVector::Flatten( + *ctx.Input("InfNorm")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + auto beta1_pow = framework::EigenVector::Flatten( + *ctx.Input("Beta1Pow")); + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto inf_norm_out = + framework::EigenVector::Flatten(*inf_norm_out_tensor); + auto beta1_pow_out = + framework::EigenVector::Flatten(*beta1_pow_out_tensor); auto place = ctx.GetEigenDevice(); - m_out.device(place) = beta_1 * m + (1 - beta_1) * g; - u_out.device(place) = g.abs().cwiseMax((beta_2 * u) + epsilon); - - float lr_t = lr / (1 - std::pow(beta_1, t)); - p_out.device(place) = p - lr_t * (m_out / u_out); + moment_out.device(place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(place) = + grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); + beta1_pow_out.device(place) = beta1_pow * beta1; + auto lr_t = lr / (1 - beta1_pow_out); + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); } }; diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py index 7b2d9d1360e25..a1df3ed9b23ce 100644 --- a/python/paddle/v2/framework/tests/test_adamax_op.py +++ 
b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -12,32 +12,34 @@ def setUp(self): # The infinity norm is positive inf_norm = np.random.random((102, 105)).astype("float32") - time_step = 9 learning_rate = 0.002 beta_1 = 0.9 beta_2 = 0.999 epsilon = 1e-8 + beta_1_pow = beta_1**8 self.inputs = { 'Param': param, 'Grad': grad, 'Moment': moment, 'InfNorm': inf_norm, - 'TimeStep': np.array([time_step]).astype("int32"), - 'LearningRate': np.array([learning_rate]).astype("float32") + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta_1_pow]).astype("float32") } self.attrs = {'beta1': beta_1, 'beta2': beta_2, 'epsilon': epsilon} moment_out = beta_1 * moment + (1 - beta_1) * grad inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) - lr_t = (learning_rate / (1 - beta_1**time_step)) + beta_1_pow_out = beta_1_pow * beta_1 + lr_t = (learning_rate / (1 - beta_1_pow_out)) param_out = param - lr_t * np.divide(moment_out, inf_norm_out) self.outputs = { 'ParamOut': param_out, 'MomentOut': moment_out, - 'InfNormOut': inf_norm_out + 'InfNormOut': inf_norm_out, + 'Beta1PowOut': beta_1_pow_out } def test_check_output(self): @@ -45,7 +47,7 @@ def test_check_output(self): class TestAdamaxOp2(OpTest): - '''Test Adamax operator with default attributes + '''Test Adamax Operator with default attributes ''' def setUp(self): @@ -56,30 +58,34 @@ def setUp(self): # The infinity norm is positive inf_norm = np.random.random((102, 105)).astype("float32") - time_step = 9 learning_rate = 0.002 beta_1 = 0.9 beta_2 = 0.999 epsilon = 1e-8 + beta_1_pow = beta_1**8 self.inputs = { 'Param': param, 'Grad': grad, 'Moment': moment, 'InfNorm': inf_norm, - 'TimeStep': np.array([time_step]).astype("int32"), - 'LearningRate': np.array([learning_rate]).astype("float32") + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta_1_pow]).astype("float32") } + self.attrs = {'beta1': beta_1, 'beta2': beta_2, 'epsilon': epsilon} + moment_out = beta_1 * moment + (1 - beta_1) * grad inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) - lr_t = (learning_rate / (1 - beta_1**time_step)) + beta_1_pow_out = beta_1_pow * beta_1 + lr_t = (learning_rate / (1 - beta_1_pow_out)) param_out = param - lr_t * np.divide(moment_out, inf_norm_out) self.outputs = { 'ParamOut': param_out, 'MomentOut': moment_out, - 'InfNormOut': inf_norm_out + 'InfNormOut': inf_norm_out, + 'Beta1PowOut': beta_1_pow_out } def test_check_output(self): From cdc46c7620917703f622505a8dfc804e0ed34844 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Fri, 6 Oct 2017 12:43:13 -0700 Subject: [PATCH 8/9] Changing Unit Test values and adding comments --- python/paddle/v2/framework/tests/test_adamax_op.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py index a1df3ed9b23ce..17b814d64702b 100644 --- a/python/paddle/v2/framework/tests/test_adamax_op.py +++ b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -5,6 +5,8 @@ class TestAdamaxOp1(OpTest): def setUp(self): + '''Test Adamax Operator with supplied attributes + ''' self.op_type = "adamax" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -13,10 +15,10 @@ def setUp(self): inf_norm = np.random.random((102, 105)).astype("float32") learning_rate = 0.002 - beta_1 = 0.9 - beta_2 = 0.999 - epsilon = 1e-8 - beta_1_pow = 
beta_1**8 + beta_1 = 0.78 + beta_2 = 0.899 + epsilon = 1e-5 + beta_1_pow = beta_1**10 self.inputs = { 'Param': param, @@ -73,8 +75,6 @@ def setUp(self): 'Beta1Pow': np.array([beta_1_pow]).astype("float32") } - self.attrs = {'beta1': beta_1, 'beta2': beta_2, 'epsilon': epsilon} - moment_out = beta_1 * moment + (1 - beta_1) * grad inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) beta_1_pow_out = beta_1_pow * beta_1 From af36e75db1caf58d350e1503919faa066da03196 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Sun, 8 Oct 2017 18:25:09 -0700 Subject: [PATCH 9/9] Changing Unit Test to test multiple steps --- .../v2/framework/tests/test_adamax_op.py | 124 +++++++++++++++--- 1 file changed, 103 insertions(+), 21 deletions(-) diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py index 17b814d64702b..af81075d6ad50 100644 --- a/python/paddle/v2/framework/tests/test_adamax_op.py +++ b/python/paddle/v2/framework/tests/test_adamax_op.py @@ -15,10 +15,10 @@ def setUp(self): inf_norm = np.random.random((102, 105)).astype("float32") learning_rate = 0.002 - beta_1 = 0.78 - beta_2 = 0.899 + beta1 = 0.78 + beta2 = 0.899 epsilon = 1e-5 - beta_1_pow = beta_1**10 + beta1_pow = beta1**10 self.inputs = { 'Param': param, @@ -26,22 +26,19 @@ def setUp(self): 'Moment': moment, 'InfNorm': inf_norm, 'LearningRate': np.array([learning_rate]).astype("float32"), - 'Beta1Pow': np.array([beta_1_pow]).astype("float32") + 'Beta1Pow': np.array([beta1_pow]).astype("float32") } - self.attrs = {'beta1': beta_1, 'beta2': beta_2, 'epsilon': epsilon} + self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon} - moment_out = beta_1 * moment + (1 - beta_1) * grad - inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) - beta_1_pow_out = beta_1_pow * beta_1 - lr_t = (learning_rate / (1 - beta_1_pow_out)) - param_out = param - lr_t * np.divide(moment_out, inf_norm_out) + param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step( + self.inputs, self.attrs) self.outputs = { 'ParamOut': param_out, 'MomentOut': moment_out, 'InfNormOut': inf_norm_out, - 'Beta1PowOut': beta_1_pow_out + 'Beta1PowOut': beta1_pow_out } def test_check_output(self): @@ -61,10 +58,10 @@ def setUp(self): inf_norm = np.random.random((102, 105)).astype("float32") learning_rate = 0.002 - beta_1 = 0.9 - beta_2 = 0.999 + beta1 = 0.9 + beta2 = 0.999 epsilon = 1e-8 - beta_1_pow = beta_1**8 + beta1_pow = beta1**8 self.inputs = { 'Param': param, @@ -72,25 +69,110 @@ def setUp(self): 'Moment': moment, 'InfNorm': inf_norm, 'LearningRate': np.array([learning_rate]).astype("float32"), - 'Beta1Pow': np.array([beta_1_pow]).astype("float32") + 'Beta1Pow': np.array([beta1_pow]).astype("float32") } - moment_out = beta_1 * moment + (1 - beta_1) * grad - inf_norm_out = np.maximum(beta_2 * inf_norm + epsilon, np.abs(grad)) - beta_1_pow_out = beta_1_pow * beta_1 - lr_t = (learning_rate / (1 - beta_1_pow_out)) - param_out = param - lr_t * np.divide(moment_out, inf_norm_out) + attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon} + param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step( + self.inputs, attrs) self.outputs = { 'ParamOut': param_out, 'MomentOut': moment_out, 'InfNormOut': inf_norm_out, - 'Beta1PowOut': beta_1_pow_out + 'Beta1PowOut': beta1_pow_out } def test_check_output(self): self.check_output() +class TestAdamaxOpMultipleSteps(OpTest): + def setUp(self): + '''Test Adamax Operator with supplied attributes + ''' + self.op_type = "adamax" + 
self.num_steps = 10 + + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The infinity norm is positive + inf_norm = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.002 + beta1 = 0.8 + beta2 = 0.99 + epsilon = 1e-5 + beta1_pow = 1 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'InfNorm': inf_norm, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32") + } + + self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon} + + param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step( + self.inputs, self.attrs) + + def test_check_output(self): + for _ in range(self.num_steps): + param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step( + self.inputs, self.attrs) + + self.outputs = { + 'ParamOut': param_out, + 'MomentOut': moment_out, + 'InfNormOut': inf_norm_out, + 'Beta1PowOut': beta1_pow_out + } + + # Verify output for this step + self.check_output() + + # Output of this step becomes input for next step + self.inputs['Param'] = param_out + self.inputs['Moment'] = moment_out + self.inputs['InfNorm'] = inf_norm_out + self.inputs['Beta1Pow'] = beta1_pow_out + + # Randomize gradient for next step + self.inputs['Grad'] = np.random.uniform( + -1, 1, (102, 105)).astype("float32") + + +def adamax_step(inputs, attributes): + ''' + Simulate one step of the adamax optimizer + :param inputs: dict of inputs + :param attributes: dict of attributes + :return tuple: tuple of output param, moment, inf_norm and + beta1 power accumulator + ''' + param = inputs['Param'] + grad = inputs['Grad'] + moment = inputs['Moment'] + inf_norm = inputs['InfNorm'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + + beta1 = attributes['beta1'] + beta2 = attributes['beta2'] + epsilon = attributes['epsilon'] + + moment_out = beta1 * moment + (1 - beta1) * grad + inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad)) + beta1_pow_out = beta1_pow * beta1 + lr_t = (lr / (1 - beta1_pow_out)) + param_out = param - lr_t * np.divide(moment_out, inf_norm_out) + + return param_out, moment_out, inf_norm_out, beta1_pow_out + + if __name__ == "__main__": unittest.main()
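
Note (not part of the patch series): patches 3 through 9 above move the bias-correction bookkeeping from a time_step attribute to a Beta1Pow input/output pair. The sketch below is a standalone NumPy check, with illustrative helper names, that applies both formulations taken from the diffs above — the paper-style lr / (1 - beta1^t) correction and the running beta1_pow accumulator — for a few steps and confirms they produce the same parameters, assuming the accumulator starts at beta1^0 = 1 as in TestAdamaxOpMultipleSteps. This mirrors how that test chains Beta1PowOut back in as Beta1Pow on each step.

import numpy as np


def adamax_step_with_time_step(param, grad, moment, inf_norm, lr, t,
                               beta1=0.9, beta2=0.999, epsilon=1e-8):
    """One Adamax update using the explicit time-step bias correction
    from the earlier patches: lr / (1 - beta1**t)."""
    moment_out = beta1 * moment + (1 - beta1) * grad
    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
    lr_t = lr / (1 - beta1**t)
    param_out = param - lr_t * moment_out / inf_norm_out
    return param_out, moment_out, inf_norm_out


def adamax_step_with_accumulator(param, grad, moment, inf_norm, lr, beta1_pow,
                                 beta1=0.9, beta2=0.999, epsilon=1e-8):
    """One Adamax update using the Beta1Pow accumulator from the final
    patches: beta1_pow_out = beta1_pow * beta1."""
    moment_out = beta1 * moment + (1 - beta1) * grad
    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
    beta1_pow_out = beta1_pow * beta1
    lr_t = lr / (1 - beta1_pow_out)
    param_out = param - lr_t * moment_out / inf_norm_out
    return param_out, moment_out, inf_norm_out, beta1_pow_out


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    shape = (4, 5)
    lr = 0.002
    param_a = rng.uniform(-1, 1, shape)
    moment_a = np.zeros(shape)
    inf_norm_a = np.zeros(shape)
    beta1_pow = 1.0  # beta1**0; becomes beta1**t after t updates
    param_b = param_a.copy()
    moment_b = moment_a.copy()
    inf_norm_b = inf_norm_a.copy()

    for t in range(1, 6):
        grad = rng.uniform(-1, 1, shape)
        param_a, moment_a, inf_norm_a, beta1_pow = adamax_step_with_accumulator(
            param_a, grad, moment_a, inf_norm_a, lr, beta1_pow)
        param_b, moment_b, inf_norm_b = adamax_step_with_time_step(
            param_b, grad, moment_b, inf_norm_b, lr, t)
        # The two formulations should agree up to floating-point rounding.
        np.testing.assert_allclose(param_a, param_b, rtol=1e-6, atol=1e-12)
    print("accumulator and time-step formulations agree for 5 steps")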