From 0247210c35ebedb73db0d0c0d9605b1e896619b7 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 24 Feb 2022 20:15:05 +0800 Subject: [PATCH 01/15] add some datasets for finetune. --- paddlenlp/datasets/__init__.py | 2 + paddlenlp/datasets/chnsenticorp_v2.py | 83 +++++++++++++++++++++++++ paddlenlp/datasets/drcd_cn.py | 88 +++++++++++++++++++++++++++ paddlenlp/datasets/dureader_nlp.py | 83 +++++++++++++++++++++++++ paddlenlp/datasets/lcqmc_v2.py | 81 ++++++++++++++++++++++++ paddlenlp/datasets/nlpcc_dbqa.py | 88 +++++++++++++++++++++++++++ 6 files changed, 425 insertions(+) create mode 100644 paddlenlp/datasets/chnsenticorp_v2.py create mode 100644 paddlenlp/datasets/drcd_cn.py create mode 100644 paddlenlp/datasets/dureader_nlp.py create mode 100644 paddlenlp/datasets/lcqmc_v2.py create mode 100644 paddlenlp/datasets/nlpcc_dbqa.py diff --git a/paddlenlp/datasets/__init__.py b/paddlenlp/datasets/__init__.py index bf3511104d11..ef03ee1abbd3 100644 --- a/paddlenlp/datasets/__init__.py +++ b/paddlenlp/datasets/__init__.py @@ -16,6 +16,7 @@ from .chnsenticorp import * from .cmrc2018 import * from .drcd import * +from .drcd_cn import * from .dureader_robust import * from .glue import * from .lcqmc import * @@ -34,6 +35,7 @@ from .seabsa16 import * from .cote import * from .clue import * +from .nlpcc_dbqa import * from .nlpcc14_sc import * from .nlpcc13_evsam05_thu import * from .nlpcc13_evsam05_hit import * diff --git a/paddlenlp/datasets/chnsenticorp_v2.py b/paddlenlp/datasets/chnsenticorp_v2.py new file mode 100644 index 000000000000..5e6056a200fa --- /dev/null +++ b/paddlenlp/datasets/chnsenticorp_v2.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url +from paddlenlp.utils.env import DATA_HOME +from . 
import DatasetBuilder + +__all__ = ['ChnSentiCorpV2'] + + +class ChnSentiCorpV2(DatasetBuilder): + """ + ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for + opinion mining) + + """ + + URL = "https://paddlenlp.bj.bcebos.com/datasets/data-chnsenticorp.tar.gz" + MD5 = "e336e76d7be4ecd5479083d5b8f771e4" + META_INFO = collections.namedtuple('META_INFO', ('file', 'md5')) + SPLITS = { + 'train': META_INFO( + os.path.join('chnsenticorp', 'train', 'part.0'), + '3fac2659547f1ddf90d223b8ed31f22f'), + 'dev': META_INFO( + os.path.join('chnsenticorp', 'dev', 'part.0'), + 'a3a853bfb3af4a592fc4df24b56c88a7'), + 'test': META_INFO( + os.path.join('chnsenticorp', 'test', 'part.0'), + '6bfc8f35f523d2fdf12648d9d02778ff'), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and + not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, 'r', encoding='utf-8') as f: + head = None + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + if split == 'train': + text, label = data + yield {"text": text, "label": label} + elif split == 'dev': + text, label = data + yield {"text": text, "label": label} + elif split == 'test': + text, label = data + yield {"text": text, "label": label} + + def get_labels(self): + """ + Return labels of the ChnSentiCorp object. + """ + return ["0", "1"] diff --git a/paddlenlp/datasets/drcd_cn.py b/paddlenlp/datasets/drcd_cn.py new file mode 100644 index 000000000000..42d78c24f7f5 --- /dev/null +++ b/paddlenlp/datasets/drcd_cn.py @@ -0,0 +1,88 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url +from paddlenlp.utils.env import DATA_HOME +from . import DatasetBuilder + +__all__ = ['DRCD_CN'] + + +class DRCD_CN(DatasetBuilder): + ''' + Delta Reading Comprehension Dataset is an open domain traditional Chinese + machine reading comprehension (MRC) dataset. The dataset contains 10,014 + paragraphs from 2,108 Wikipedia articles and 30,000+ questions generated + by annotators. + + This dataset translate origin Traditional Chinese to Simplified Chinese. 
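+    Each example contains 'id', 'title', 'context', 'question', 'answers'
+    and 'answer_starts' fields.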
+ ''' + + URL = "https://bj.bcebos.com/paddlenlp/datasets/drcd_cn.tar.gz" + MD5 = "8ceed5076c4f59d7a3666b13851e41fa" + META_INFO = collections.namedtuple('META_INFO', ('file', 'md5')) + SPLITS = { + 'train': META_INFO( + os.path.join('drcd_cn', 'train.json'), + '5a51ee5a106e16965c85fce364d316d7'), + 'dev': META_INFO( + os.path.join('drcd_cn', 'dev.json'), + 'f352b17cddeed69877ff94d4321817ce'), + 'test': META_INFO( + os.path.join('drcd_cn', 'test.json'), + 'e674a667033c4e8c9ae6d05d95073d02') + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and + not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [ + answer["answer_start"] + for answer in qa.get("answers", []) + ] + answers = [ + answer["text"].strip() + for answer in qa.get("answers", []) + ] + + yield { + 'id': qas_id, + 'title': title, + 'context': context, + 'question': question, + 'answers': answers, + 'answer_starts': answer_starts + } diff --git a/paddlenlp/datasets/dureader_nlp.py b/paddlenlp/datasets/dureader_nlp.py new file mode 100644 index 000000000000..d552504a1953 --- /dev/null +++ b/paddlenlp/datasets/dureader_nlp.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url +from paddlenlp.utils.env import DATA_HOME +from . import DatasetBuilder + +__all__ = ['DuReaderNLP'] + + +class DuReaderNLP(DatasetBuilder): + ''' + The machine reading comprehension dataset (i.e. DuReader) is designed + to measure the performance of a reading comprehension model. + + This is for internal dataset. You should nerver use it. 
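+    The download URL points to an internal host, so the data is not
+    publicly accessible.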
+ ''' + + URL = 'https://internal/datasets/dureader_nlp.tar.gz' + MD5 = '7372b42aadde59904c291341b73e30a1' + META_INFO = collections.namedtuple('META_INFO', ('file', 'md5')) + SPLITS = { + 'train': META_INFO( + os.path.join('dureader', 'train.json'), + 'd81648dccca54b48fd9cddecf28815b0'), + 'dev': META_INFO( + os.path.join('dureader', 'dev.json'), + 'd941140d8d5362d9031897ba2004af64'), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and + not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [ + answer["answer_start"] + for answer in qa.get("answers", []) + ] + answers = [ + answer["text"].strip() + for answer in qa.get("answers", []) + ] + + yield { + 'id': qas_id, + 'title': title, + 'context': context, + 'question': question, + 'answers': answers, + 'answer_starts': answer_starts + } diff --git a/paddlenlp/datasets/lcqmc_v2.py b/paddlenlp/datasets/lcqmc_v2.py new file mode 100644 index 000000000000..4ab60c4b0227 --- /dev/null +++ b/paddlenlp/datasets/lcqmc_v2.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url +from paddlenlp.utils.env import DATA_HOME +from . 
import DatasetBuilder + +__all__ = ['LCQMC_V2'] + + +class LCQMC_V2(DatasetBuilder): + """ + LCQMC:A Large-scale Chinese Question Matching Corpus + More information please refer to `https://www.aclweb.org/anthology/C18-1166/` + + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/lcqmc_v2.tar.gz" + MD5 = "e44825d8e6d5117bc04caf3982cf934f" + META_INFO = collections.namedtuple('META_INFO', ('file', 'md5')) + SPLITS = { + 'train': META_INFO( + os.path.join('lcqmc', 'train.tsv'), + '2193c022439b038ac12c0ae918b211a1'), + 'dev': META_INFO( + os.path.join('lcqmc', 'dev.tsv'), + 'c5dcba253cb4105d914964fd8b3c0e94'), + 'test': META_INFO( + os.path.join('lcqmc', 'test.tsv'), + '8f4b71e15e67696cc9e112a459ec42bd'), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and + not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename): + """Reads data.""" + with open(filename, 'r', encoding='utf-8') as f: + head = True + for line in f: + data = line.strip().split("\t") + if head: + head = False + else: + if len(data) == 3: + query, title, label = data + yield {"query": query, "title": title, "label": label} + elif len(data) == 2: + query, title = data + yield {"query": query, "title": title, "label": ''} + else: + continue + + def get_labels(self): + """ + Return labels of the LCQMC object. + """ + return ["0", "1"] diff --git a/paddlenlp/datasets/nlpcc_dbqa.py b/paddlenlp/datasets/nlpcc_dbqa.py new file mode 100644 index 000000000000..0befef2ebe49 --- /dev/null +++ b/paddlenlp/datasets/nlpcc_dbqa.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url +from paddlenlp.utils.env import DATA_HOME +from . import DatasetBuilder + +__all__ = ['NLPCC_DBQA'] + + +class NLPCC_DBQA(DatasetBuilder): + """ + NLPCC2016 DBQA dataset. + + Document-based QA (or DBQA) task + When predicting answers to each question, a DBQA system built by each + participating team IS LIMITED TO select sentences as answersfrom the + question’s given document. 
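+    Each example provides a question ('text_a'), a candidate sentence
+    ('text_b') and a '0'/'1' label marking whether the sentence answers
+    the question.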
+ + For more imformation: http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/nlpcc-dbqa.zip" + MD5 = "a5f69c2462136ef4d1707e4e2551a57b" + META_INFO = collections.namedtuple('META_INFO', ('file', 'md5')) + SPLITS = { + 'train': META_INFO( + os.path.join('nlpcc-dbqa', 'nlpcc-dbqa', 'train.tsv'), + '4f84fefce1a8f52c8d9248d1ff5ab9bd'), + 'dev': META_INFO( + os.path.join('nlpcc-dbqa', 'nlpcc-dbqa', 'dev.tsv'), + '3831beb0d42c29615d06343538538f53'), + 'test': META_INFO( + os.path.join('nlpcc-dbqa', 'nlpcc-dbqa', 'test.tsv'), + 'e224351353b1f6a15837008b5d0da703'), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and + not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, 'r', encoding='utf-8') as f: + head = None + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + qid, text_a, text_b, label = data + yield { + "qid": qid, + "text_a": text_a, + "text_b": text_b, + "label": label + } + + def get_labels(self): + """ + Return labels of XNLI dataset. + + Note: + Contradictory and contradiction are the same label + """ + return ["0", "1"] From 654d45ac9cf2fb3ddeafacc3042414ee3efa9d9b Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 11 Mar 2022 20:49:53 +0800 Subject: [PATCH 02/15] support fine tune for all tastks. --- .../ernie-1.0/finetune/config.yml | 67 ++++ .../ernie-1.0/finetune/finetune.py | 282 ++++++++++++++ .../ernie-1.0/finetune/question_answering.py | 261 +++++++++++++ .../finetune/sequence_classification.py | 359 ++++++++++++++++++ .../finetune/token_classification.py | 0 5 files changed, 969 insertions(+) create mode 100644 examples/language_model/ernie-1.0/finetune/config.yml create mode 100644 examples/language_model/ernie-1.0/finetune/finetune.py create mode 100644 examples/language_model/ernie-1.0/finetune/question_answering.py create mode 100644 examples/language_model/ernie-1.0/finetune/sequence_classification.py create mode 100644 examples/language_model/ernie-1.0/finetune/token_classification.py diff --git a/examples/language_model/ernie-1.0/finetune/config.yml b/examples/language_model/ernie-1.0/finetune/config.yml new file mode 100644 index 000000000000..fcbd7d433af5 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/config.yml @@ -0,0 +1,67 @@ +# Default Args for all dataset +# You can overwrite the configs in each dataset. +DefaultArgs: + learning_rate: 0.00005 + num_train_epochs: 3 + batch_size: 64 + max_seq_length: 128 + weight_decay: 0.0 + logging_steps: 10 + valid_steps: 100 + minimum_valid_times: 20 # If under valid_steps, the valid time is less then 20, the config of valid_steps will be changed. 
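+  # That is: if the configured valid_steps would give fewer than minimum_valid_times
+  # evaluations over the whole run, valid_steps is reduced automatically by the trainer.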
+ max_steps: -1 + warmup_steps: 0 + metric: "Accuracy" + split: "train dev" + +# Datasets which used for sequence classfication +SequenceClassification: + clue afqmc: + num_train_epochs: 4 + clue tnews: + num_train_epochs: 4 + clue iflytek: + num_train_epochs: 8 + clue ocnli: + num_train_epochs: 8 + clue cmnli: + learning_rate: 1e-4, 5e-5, 1e-5 + num_train_epochs: 3 + clue wsc: + num_train_epochs: 50 + clue csl: + num_train_epochs: 10 + max_seq_length: 256 + batch_size: 32 + xnli_cn: + learning_rate: 0.00005 + num_train_epochs: 3 + chnsenticorp_v2: + learning_rate: 0.00001 + num_train_epochs: 5 + +# Datasets which used for token classfication +TokenClassification: + peoples_daily_ner: + num_train_epochs: 5 + msra_ner: + num_train_epochs: 3 + +# Datasets which used for question answersing +QuestionAnswering: + cmrc2018: + num_train_epochs: 1 + batch_size: 12 + max_seq_length: 384 + dureader_nlp: + num_train_epochs: 1 + batch_size: 12 + max_seq_length: 384 + dureader_robust: + num_train_epochs: 1 + batch_size: 12 + max_seq_length: 384 + dlbp: + num_train_epochs: 1 + batch_size: 12 + max_seq_length: 384 \ No newline at end of file diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py new file mode 100644 index 000000000000..705efec52564 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -0,0 +1,282 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
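+
+# Illustrative usage (the dataset name must be a key defined in config.yml and
+# the model a PaddleNLP pretrained model name or a local checkpoint path), e.g.:
+#   python finetune.py --dataset "chnsenticorp_v2" --model_name_or_path ernie-1.0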
+ +import argparse +import logging +import os +import sys +import random +import time +import math +import copy +import yaml +from functools import partial +import distutils.util +import os.path as osp + +import numpy as np +import paddle +from paddle.io import DataLoader +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.metric import Accuracy +from paddlenlp.metrics.squad import squad_evaluate, compute_prediction + +import paddlenlp +from paddlenlp.datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad, Dict +from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer +from paddlenlp.transformers import AutoModelForTokenClassification +from paddlenlp.transformers import AutoModelForQuestionAnswering +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.utils.log import logger + +sys.path.insert(0, os.path.abspath(".")) +from sequence_classification import CLUE_TRAINING +from question_answering import QA_TRAINING + +ALL_TASKS = { + "SequenceClassification": [], + "TokenClassification": [], + "QuestionAnswering": [] +} + +for x in dir(paddlenlp.transformers): + for task in ALL_TASKS.keys(): + if x.endswith(task): + if not x.startswith("AutoModel"): + ALL_TASKS[task].append(x) + +CONFIG = yaml.load( + open(osp.join(osp.abspath("."), "./config.yml"), 'r'), + Loader=yaml.FullLoader) +ARGS = CONFIG["DefaultArgs"] +ALL_DATASETS = {} + +for task_type in ALL_TASKS.keys(): + task = CONFIG[task_type] + for data_name in task.keys(): + new_args = task[data_name] + new_args = {} if new_args is None else new_args + final_args = copy.deepcopy(ARGS) + final_args.update(new_args) + final_args["model"] = "AutoModelFor{}".format(task_type) + ALL_DATASETS[data_name] = final_args + + +def parse_args(): + parser = argparse.ArgumentParser() + # Required parameters + + parser.add_argument( + "--dataset", + default=None, + type=str, + required=True, + help="The name of the dataset to train selected in the list: " + + ", ".join(ALL_DATASETS.keys()), ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + + " https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html . " + + " Such as ernie-1.0, bert-base-uncased") + + group = parser.add_argument_group(title='Common training configs.') + group.add_argument( + "--max_seq_length", + default=None, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + group.add_argument( + "--learning_rate", + default=None, + type=float, + help="The initial learning rate for Adam.") + group.add_argument( + "--batch_size", + default=None, + type=int, + help="Batch size per GPU/CPU for training.", ) + group.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + + group.add_argument( + "--num_train_epochs", + default=None, + type=int, + help="Total number of training epochs to perform.", ) + group.add_argument( + "--logging_steps", + type=int, + default=10, + help="Log every X updates steps.") + group.add_argument( + "--valid_steps", + type=int, + default=200, + help="Save checkpoint every X updates steps.") + group.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + group.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps. If > 0: Override warmup_proportion" + ) + group.add_argument( + "--warmup_proportion", + default=0.1, + type=float, + help="Linear warmup proportion over total steps.") + + group = parser.add_argument_group(title='Additional training configs.') + group.add_argument( + "--use_amp", + type=distutils.util.strtobool, + default=False, + help="Enable mixed precision training.") + group.add_argument( + "--scale_loss", + type=float, + default=2**15, + help="The value of scale_loss for fp16.") + group.add_argument( + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.") + group.add_argument( + "--max_grad_norm", + default=1.0, + type=float, + help="The max value of grad norm.") + + group.add_argument( + "--seed", default=42, type=int, help="random seed for initialization") + group.add_argument( + "--device", + default="gpu", + choices=["cpu", "gpu"], + help="The device to select to train the model, is must be cpu/gpu.") + + group = parser.add_argument_group(title='Additional configs for QA task.') + group.add_argument( + "--doc_stride", + type=int, + default=128, + help="When splitting up a long document into chunks, how much stride to take between chunks." + ) + group.add_argument( + "--n_best_size", + type=int, + default=20, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file." + ) + group.add_argument( + "--max_query_length", type=int, default=64, help="Max query length.") + group.add_argument( + "--max_answer_length", type=int, default=30, help="Max answer length.") + group.add_argument( + "--do_lower_case", + action='store_false', + help="Whether to lower case the input text. Should be True for uncased models and False for cased models." + ) + + args = parser.parse_args() + return args + + +def set_seed(args): + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. 
By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) + + +def do_train(args): + paddle.set_device(args.device) + rank = paddle.distributed.get_rank() + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + + set_seed(args) + args.dataset = args.dataset.strip() + + if args.dataset not in ALL_DATASETS: + raise ValueError("Not found {}".format(args.dataset)) + + config = ALL_DATASETS[args.dataset] + for arg in vars(args): + if getattr(args, arg) is None: + if arg in config.keys(): + setattr(args, arg, config[arg]) + + dataset_config = args.dataset.split(" ") + all_ds = load_dataset( + dataset_config[0], + None if len(dataset_config) <= 1 else dataset_config[1], + # lazy=False + ) + + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + + args.label_list = getattr(all_ds['train'], "label_list", None) + + num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ + 'train'].label_list) + + model = getattr(paddlenlp.transformers, config["model"]).from_pretrained( + args.model_name_or_path, num_classes=num_classes) + + if paddle.distributed.get_world_size() > 1: + model = paddle.DataParallel(model) + + if 'clue' in args.dataset: + trainer = CLUE_TRAINING(all_ds["train"], all_ds["dev"], model, + tokenizer, args) + elif "Answering" in config["model"]: + trainer = QA_TRAINING(all_ds["train"], all_ds["dev"], model, tokenizer, + args) + + trainer.train() + trainer.eval() + + +def print_arguments(args): + """print arguments""" + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit)) + for arg in vars(args): + logger.info('{:20}:{}'.format(arg, getattr(args, arg))) + + +if __name__ == "__main__": + args = parse_args() + # print_arguments(args) + do_train(args) diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py new file mode 100644 index 000000000000..c2255b831747 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -0,0 +1,261 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
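+
+# Extractive QA (MRC) fine-tuning utilities: a start/end-position cross-entropy
+# loss, SQuAD-style train/validation feature preparation with doc_stride
+# chunking, and evaluation via squad_evaluate.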
+ +import time +import json +from functools import partial + +import numpy as np +import paddle +import paddlenlp as ppnlp +from paddlenlp.data import Pad, Stack, Tuple, Dict +from paddlenlp.metrics.squad import squad_evaluate, compute_prediction + +from sequence_classification import BaseTrainer +from paddlenlp.utils.log import logger + + +class CrossEntropyLossForSQuAD(paddle.nn.Layer): + def __init__(self): + super(CrossEntropyLossForSQuAD, self).__init__() + + def forward(self, y, label): + start_logits, end_logits = y + start_position, end_position = label + start_position = paddle.unsqueeze(start_position, axis=-1) + end_position = paddle.unsqueeze(end_position, axis=-1) + start_loss = paddle.nn.functional.cross_entropy( + input=start_logits, label=start_position) + end_loss = paddle.nn.functional.cross_entropy( + input=end_logits, label=end_position) + loss = (start_loss + end_loss) / 2 + return loss + + +@paddle.no_grad() +def evaluate(model, data_loader, args): + model.eval() + + all_start_logits = [] + all_end_logits = [] + tic_eval = time.time() + + for batch in data_loader: + input_ids, token_type_ids = batch + start_logits_tensor, end_logits_tensor = model(input_ids, + token_type_ids) + + for idx in range(start_logits_tensor.shape[0]): + if len(all_start_logits) % 1000 == 0 and len(all_start_logits): + print("Processing example: %d" % len(all_start_logits)) + print('time per 1000:', time.time() - tic_eval) + tic_eval = time.time() + + all_start_logits.append(start_logits_tensor.numpy()[idx]) + all_end_logits.append(end_logits_tensor.numpy()[idx]) + + all_predictions, _, _ = compute_prediction( + data_loader.dataset.data, data_loader.dataset.new_data, + (all_start_logits, all_end_logits), False, args.n_best_size, + args.max_answer_length) + + # Can also write all_nbest_json and scores_diff_json files if needed + with open('prediction.json', "w", encoding='utf-8') as writer: + writer.write( + json.dumps( + all_predictions, ensure_ascii=False, indent=4) + "\n") + + squad_evaluate( + examples=data_loader.dataset.data, + preds=all_predictions, + is_whitespace_splited=False) + + model.train() + + +def prepare_train_features(examples, tokenizer, args): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is + # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. + contexts = [examples[i]['context'] for i in range(len(examples))] + questions = [examples[i]['question'] for i in range(len(examples))] + + print(len(examples)) + + tokenized_examples = tokenizer( + questions, + contexts, + stride=args.doc_stride, + max_seq_len=args.max_seq_length) + + # Let's label those examples! + for i, tokenized_example in enumerate(tokenized_examples): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_example["input_ids"] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. 
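+        # Each offset is a (char_start, char_end) span into the original text;
+        # token_type_ids (used as sequence_ids below) marks which tokens come
+        # from the context.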
+ offsets = tokenized_example['offset_mapping'] + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_example['token_type_ids'] + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = tokenized_example['overflow_to_sample'] + answers = examples[sample_index]['answers'] + answer_starts = examples[sample_index]['answer_starts'] + + # Start/end character index of the answer in the text. + start_char = answer_starts[0] + end_char = start_char + len(answers[0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != 1: + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != 1: + token_end_index -= 1 + # Minus one more to reach actual text + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and + offsets[token_end_index][1] >= end_char): + tokenized_examples[i]["start_positions"] = cls_index + tokenized_examples[i]["end_positions"] = cls_index + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[ + token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples[i]["start_positions"] = token_start_index - 1 + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples[i]["end_positions"] = token_end_index + 1 + + return tokenized_examples + + +def prepare_validation_features(examples, tokenizer, args): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is + # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. + contexts = [examples[i]['context'] for i in range(len(examples))] + questions = [examples[i]['question'] for i in range(len(examples))] + + tokenized_examples = tokenizer( + questions, + contexts, + stride=args.doc_stride, + max_seq_len=args.max_seq_length) + + # For validation, there is no need to compute start and end positions + for i, tokenized_example in enumerate(tokenized_examples): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_example['token_type_ids'] + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = tokenized_example['overflow_to_sample'] + tokenized_examples[i]["example_id"] = examples[sample_index]['id'] + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
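+        # token_type_ids is 1 for context tokens, so offsets of question and
+        # special tokens are replaced with None.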
+ tokenized_examples[i]["offset_mapping"] = [ + (o if sequence_ids[k] == 1 else None) + for k, o in enumerate(tokenized_example["offset_mapping"]) + ] + + return tokenized_examples + + +class QA_TRAINING(BaseTrainer): + def __init__(self, train_ds, dev_ds, model, tokenizer, args): + super().__init__() + self.rank = paddle.distributed.get_rank() + self.train_ds = train_ds + self.dev_ds = dev_ds + self.model = model + self.tokenizer = tokenizer + self.args = args + + self.dataloader_inner() + self.prepare_train_config() + self.print_config() + + def eval(self): + if self.rank == 0: + evaluate(self.model, self.dev_dl, self.args) + + def dataloader_inner(self): + train_batchify_fn = lambda samples, fn=Dict({ + "input_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_id), + "token_type_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id), + "start_positions": Stack(dtype="int64"), + "end_positions": Stack(dtype="int64") + }): fn(samples) + + dev_batchify_fn = lambda samples, fn=Dict({ + "input_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_id), + "token_type_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id) + }): fn(samples) + + trans_train_fn = partial( + prepare_train_features, tokenizer=self.tokenizer, args=self.args) + trans_dev_fn = partial( + prepare_validation_features, + tokenizer=self.tokenizer, + args=self.args) + + self.train_dl = self.create_dataloader( + self.train_ds, "train", self.args.batch_size, train_batchify_fn, + trans_train_fn, True) + self.dev_dl = self.create_dataloader( + self.dev_ds, "dev", self.args.batch_size, dev_batchify_fn, + trans_dev_fn, True) + + def train(self): + criterion = CrossEntropyLossForSQuAD() + + global_step = 0 + tic_train = time.time() + for epoch in range(self.args.num_train_epochs): + for step, batch in enumerate(self.train_dl): + global_step += 1 + input_ids, token_type_ids, start_positions, end_positions = batch + logits = self.model( + input_ids=input_ids, token_type_ids=token_type_ids) + loss = criterion(logits, (start_positions, end_positions)) + + if global_step % self.args.logging_steps == 0: + logger.info( + "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" + % (global_step, epoch + 1, step + 1, loss, + self.args.logging_steps / (time.time() - tic_train))) + tic_train = time.time() + loss.backward() + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() + + if global_step == self.args.num_training_steps: + break diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py new file mode 100644 index 000000000000..84a918a95e15 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -0,0 +1,359 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
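+
+# Sequence-classification fine-tuning utilities: a BaseTrainer that builds the
+# dataloaders, LinearDecayWithWarmup schedule and AdamW optimizer, plus
+# CLUE-style example conversion and a CLUE_TRAINING loop with optional AMP.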
+ +import paddle +import paddle +from paddle.io import DataLoader +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.metric import Accuracy +import numpy as np + +from paddlenlp.data import Stack, Tuple, Pad, Dict + +import argparse +import os +import sys +import random +import time +import math +import copy +import yaml +from functools import partial + +import numpy as np +import paddle +from paddle.io import DataLoader +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.metric import Accuracy +from paddlenlp.metrics.squad import squad_evaluate, compute_prediction + +import paddlenlp +from paddlenlp.datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad, Dict +from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer +from paddlenlp.transformers import AutoModelForTokenClassification +from paddlenlp.transformers import AutoModelForQuestionAnswering +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.utils.log import logger + + +class BaseTrainer(object): + def create_dataloader(self, + dataset, + mode='train', + batch_size=16, + batchify_fn=None, + trans_fn=None, + batched=False): + if trans_fn: + dataset = dataset.map(trans_fn, batched=batched) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + def prepare_train_config(self): + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(self.train_dl)) + + else: + self.args.num_training_steps = len( + self.train_dl) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < 20: + exp_step = self.args.num_training_steps / 20 + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + self.lr_scheduler = LinearDecayWithWarmup( + self.args.learning_rate, self.args.num_training_steps, warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. 
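+        # The substring check runs on the attribute names from
+        # named_parameters(), while the collected p.name values are the
+        # framework-level parameter names consumed by apply_decay_param_fun.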
+ decay_params = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + self.optimizer = paddle.optimizer.AdamW( + learning_rate=self.lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=self.args.adam_epsilon, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + + def print_config(self): + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", + paddle.version.commit)) + for arg in vars(self.args): + logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) + + +def clue_trans_fn(examples, tokenizer, args): + return convert_clue( + examples, + tokenizer=tokenizer, + label_list=args.label_list, + max_seq_length=args.max_seq_length) + + +def clue_batchify_fn(tokenizer, args): + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment + Stack(dtype="int64" if args.label_list else "float32") # label + ): fn(samples) + + return batchify_fn + + +def convert_clue(example, + label_list, + tokenizer=None, + is_test=False, + max_seq_length=512, + **kwargs): + """convert a glue example into necessary features""" + if not is_test: + # `label_list == None` is for regression task + label_dtype = "int64" if label_list else "float32" + # Get the label + example['label'] = np.array(example["label"], dtype="int64") + label = example['label'] + # Convert raw text to feature + if 'keyword' in example: # CSL + sentence1 = " ".join(example['keyword']) + example = { + 'sentence1': sentence1, + 'sentence2': example['abst'], + 'label': example['label'] + } + elif 'target' in example: # wsc + text, query, pronoun, query_idx, pronoun_idx = example['text'], example[ + 'target']['span1_text'], example['target']['span2_text'], example[ + 'target']['span1_index'], example['target']['span2_index'] + text_list = list(text) + assert text[pronoun_idx:(pronoun_idx + len(pronoun) + )] == pronoun, "pronoun: {}".format(pronoun) + assert text[query_idx:(query_idx + len(query) + )] == query, "query: {}".format(query) + if pronoun_idx > query_idx: + text_list.insert(query_idx, "_") + text_list.insert(query_idx + len(query) + 1, "_") + text_list.insert(pronoun_idx + 2, "[") + text_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") + else: + text_list.insert(pronoun_idx, "[") + text_list.insert(pronoun_idx + len(pronoun) + 1, "]") + text_list.insert(query_idx + 2, "_") + text_list.insert(query_idx + len(query) + 2 + 1, "_") + text = "".join(text_list) + example['sentence'] = text + + if tokenizer is None: + return example + if 'sentence' in example: + example = tokenizer(example['sentence'], max_seq_len=max_seq_length) + elif 'sentence1' in example: + example = tokenizer( + example['sentence1'], + text_pair=example['sentence2'], + max_seq_len=max_seq_length) + if not is_test: + return example['input_ids'], example['token_type_ids'], label + else: + return example['input_ids'], example['token_type_ids'] + + +@paddle.no_grad() +def evaluate(model, criterion, metric, data_loader, mode="dev"): + """ + Given a dataset, it evals model and computes the metric. + + Args: + model(obj:`paddle.nn.Layer`): A model to classify texts. + data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches. + criterion(obj:`paddle.nn.Layer`): It can compute the loss. 
+ metric(obj:`paddle.metric.Metric`): The evaluation metric. + """ + model.eval() + metric.reset() + losses = [] + for batch in data_loader: + input_ids, token_type_ids, labels = batch + logits = model(input_ids, token_type_ids) + loss = criterion(logits, labels) + losses.append(loss.numpy()) + correct = metric.compute(logits, labels) + metric.update(correct) + accu = metric.accumulate() + logger.info("%s: eval loss: %.5f, accuracy: %.5f" % + (mode, np.mean(losses), accu)) + metric.reset() + model.train() + return accu + + +def create_dataloader(dataset, + mode='train', + batch_size=16, + batched=False, + batchify_fn=None, + trans_fn=None): + if trans_fn: + dataset = dataset.map(trans_fn, batched=False) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + +class CLUE_TRAINING(BaseTrainer): + def __init__(self, train_ds, dev_ds, model, tokenizer, args): + super().__init__() + self.rank = paddle.distributed.get_rank() + self.train_ds = train_ds + self.dev_ds = dev_ds + self.model = model + self.tokenizer = tokenizer + self.args = args + + self.dataloader_inner() + self.prepare_train_config() + self.print_config() + + def dataloader_inner(self): + trans_fn = partial( + clue_trans_fn, tokenizer=self.tokenizer, args=self.args) + batchify_fn = clue_batchify_fn(self.tokenizer, self.args) + + self.train_dl = self.create_dataloader( + self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) + self.dev_dl = self.create_dataloader( + self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + + def eval(self): + pass + + def train(self): + num_classes = 1 if self.train_ds.label_list == None else len( + self.train_ds.label_list) + + loss_fct = paddle.nn.loss.CrossEntropyLoss( + ) if self.train_ds.label_list else paddle.nn.loss.MSELoss() + + metric = Accuracy() + + if self.args.use_amp: + scaler = paddle.amp.GradScaler( + init_loss_scaling=self.args.scale_loss) + + best_dev_acc = 0.0 + corr_test_acc = -1.0 + global_step = 0 + tic_train = time.time() + + for epoch in range(self.args.num_train_epochs): + for step, batch in enumerate(self.train_dl): + global_step += 1 + input_ids, segment_ids, labels = batch + with paddle.amp.auto_cast( + bool(self.args.use_amp), + custom_white_list=["layer_norm", "softmax", "gelu"], ): + logits = self.model(input_ids, segment_ids) + loss = loss_fct(logits, labels) + + probs = F.softmax(logits, axis=1) + correct = metric.compute(probs, labels) + metric.update(correct) + acc = metric.accumulate() + + if self.args.use_amp: + scaler.scale(loss).backward() + scaler.minimize(self.optimizer, loss) + else: + loss.backward() + self.optimizer.step() + + self.lr_scheduler.step() + self.optimizer.clear_grad() + + if global_step % self.args.logging_steps == 0: + logger.info( + "global step %d/%d, epoch: %d, batch: %d, acc: %.5f, loss: %f, lr: %.10f, speed: %.4f step/s" + % (global_step, self.args.num_training_steps, epoch, + step, metric.accumulate(), loss, + self.optimizer.get_lr(), + self.args.logging_steps / (time.time() - tic_train))) + metric.reset() + tic_train = time.time() + if global_step % self.args.valid_steps == 0 or global_step == self.args.num_training_steps: + tic_eval = time.time() + 
metric.reset() + if self.dev_dl is not None: + dev_acc = evaluate(self.model, loss_fct, metric, + self.dev_dl, "dev") + else: + dev_acc = -1.0 + metric.reset() + test_acc = -1 + metric.reset() + + logger.info("eval done total : %s s" % + (time.time() - tic_eval)) + if dev_acc > best_dev_acc: + best_dev_acc = dev_acc + corr_test_acc = test_acc + + if global_step >= self.args.num_training_steps: + logger.info("best_dev_acc: {:.6f}".format(best_dev_acc)) + logger.info("corr_test_acc: {:.6f}".format(corr_test_acc)) + return + + logger.info("best_dev_acc: {:.6f}".format(best_dev_acc)) + logger.info("corr_test_acc: {:.6f}".format(corr_test_acc)) diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py new file mode 100644 index 000000000000..e69de29bb2d1 From e4f2f023cdc73da24ff5ddaa89a518c6e9e0015a Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Tue, 15 Mar 2022 19:26:35 +0800 Subject: [PATCH 03/15] add trainer prototype. --- .../ernie-1.0/finetune/config.yml | 25 +- .../ernie-1.0/finetune/finetune.py | 51 ++- .../ernie-1.0/finetune/question_answering.py | 7 +- .../finetune/sequence_classification.py | 211 ++++------ .../finetune/token_classification.py | 170 ++++++++ .../ernie-1.0/finetune/trainer_base.py | 372 ++++++++++++++++++ 6 files changed, 668 insertions(+), 168 deletions(-) create mode 100644 examples/language_model/ernie-1.0/finetune/trainer_base.py diff --git a/examples/language_model/ernie-1.0/finetune/config.yml b/examples/language_model/ernie-1.0/finetune/config.yml index fcbd7d433af5..298b430964df 100644 --- a/examples/language_model/ernie-1.0/finetune/config.yml +++ b/examples/language_model/ernie-1.0/finetune/config.yml @@ -5,10 +5,10 @@ DefaultArgs: num_train_epochs: 3 batch_size: 64 max_seq_length: 128 - weight_decay: 0.0 + weight_decay: 0.01 logging_steps: 10 - valid_steps: 100 - minimum_valid_times: 20 # If under valid_steps, the valid time is less then 20, the config of valid_steps will be changed. + valid_steps: 200 + minimum_valid_times: 20 max_steps: -1 warmup_steps: 0 metric: "Accuracy" @@ -34,25 +34,30 @@ SequenceClassification: max_seq_length: 256 batch_size: 32 xnli_cn: - learning_rate: 0.00005 + learning_rate: 0.0001 num_train_epochs: 3 + batch_size: 256 chnsenticorp_v2: - learning_rate: 0.00001 - num_train_epochs: 5 + learning_rate: 0.00005 + batch_size: 16 + num_train_epochs: 8 # Datasets which used for token classfication TokenClassification: peoples_daily_ner: - num_train_epochs: 5 + learning_rate: 0.00005 + num_train_epochs: 8 + batch_size: 16 msra_ner: num_train_epochs: 3 # Datasets which used for question answersing QuestionAnswering: cmrc2018: - num_train_epochs: 1 - batch_size: 12 - max_seq_length: 384 + learning_rate: 0.00005 + num_train_epochs: 5 + batch_size: 32 + max_seq_length: 512 dureader_nlp: num_train_epochs: 1 batch_size: 12 diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index 705efec52564..0a6db96097da 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -13,12 +13,10 @@ # limitations under the License. 
import argparse -import logging import os import sys import random import time -import math import copy import yaml from functools import partial @@ -27,24 +25,21 @@ import numpy as np import paddle -from paddle.io import DataLoader import paddle.nn as nn import paddle.nn.functional as F -from paddle.metric import Accuracy -from paddlenlp.metrics.squad import squad_evaluate, compute_prediction import paddlenlp from paddlenlp.datasets import load_dataset -from paddlenlp.data import Stack, Tuple, Pad, Dict from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer from paddlenlp.transformers import AutoModelForTokenClassification from paddlenlp.transformers import AutoModelForQuestionAnswering -from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.transformers import AutoTokenizer from paddlenlp.utils.log import logger sys.path.insert(0, os.path.abspath(".")) -from sequence_classification import CLUE_TRAINING -from question_answering import QA_TRAINING +from sequence_classification import ClueTrainer, SeqTrainer +from question_answering import MrcTrainer +from token_classification import NerTrainer ALL_TASKS = { "SequenceClassification": [], @@ -77,7 +72,6 @@ def parse_args(): parser = argparse.ArgumentParser() - # Required parameters parser.add_argument( "--dataset", @@ -115,7 +109,7 @@ def parse_args(): help="Batch size per GPU/CPU for training.", ) group.add_argument( "--weight_decay", - default=0.0, + default=None, type=float, help="Weight decay if we apply some.") @@ -134,6 +128,12 @@ def parse_args(): type=int, default=200, help="Save checkpoint every X updates steps.") + group.add_argument( + "--minimum_valid_times", + type=int, + default=None, + help="If under valid_steps, the valid time is less then minimum_valid_times, the config of override valid_steps." 
+ ) group.add_argument( "--max_steps", default=-1, @@ -257,12 +257,29 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - if 'clue' in args.dataset: - trainer = CLUE_TRAINING(all_ds["train"], all_ds["dev"], model, - tokenizer, args) - elif "Answering" in config["model"]: - trainer = QA_TRAINING(all_ds["train"], all_ds["dev"], model, tokenizer, - args) + if "SequenceClassification" in config["model"]: + if 'clue' in args.dataset: + trainer = ClueTrainer(all_ds["train"], all_ds["dev"], model, + tokenizer, args) + else: + trainer = SeqTrainer( + all_ds["train"], + all_ds["dev"], + model, + tokenizer, + args, + test_ds=all_ds["test"]) + elif "QuestionAnswering" in config["model"]: + trainer = MrcTrainer(all_ds["train"], all_ds["dev"], model, tokenizer, + args) + elif 'TokenClassification' in config["model"]: + trainer = NerTrainer( + all_ds["train"], + all_ds["dev"], + model, + tokenizer, + args, + test_ds=all_ds["test"]) trainer.train() trainer.eval() diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py index c2255b831747..03be17723111 100644 --- a/examples/language_model/ernie-1.0/finetune/question_answering.py +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -22,7 +22,7 @@ from paddlenlp.data import Pad, Stack, Tuple, Dict from paddlenlp.metrics.squad import squad_evaluate, compute_prediction -from sequence_classification import BaseTrainer +from trainer_base import TrainerBase from paddlenlp.utils.log import logger @@ -188,7 +188,7 @@ def prepare_validation_features(examples, tokenizer, args): return tokenized_examples -class QA_TRAINING(BaseTrainer): +class MrcTrainer(TrainerBase): def __init__(self, train_ds, dev_ds, model, tokenizer, args): super().__init__() self.rank = paddle.distributed.get_rank() @@ -257,5 +257,8 @@ def train(self): self.lr_scheduler.step() self.optimizer.clear_grad() + if global_step % self.args.valid_steps == 0: + self.eval() + if global_step == self.args.num_training_steps: break diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index 84a918a95e15..324399299eee 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -12,133 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import paddle -from paddle.io import DataLoader -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.metric import Accuracy -import numpy as np - -from paddlenlp.data import Stack, Tuple, Pad, Dict - -import argparse import os -import sys -import random import time -import math -import copy -import yaml from functools import partial -import numpy as np import paddle -from paddle.io import DataLoader import paddle.nn as nn import paddle.nn.functional as F from paddle.metric import Accuracy -from paddlenlp.metrics.squad import squad_evaluate, compute_prediction +import numpy as np import paddlenlp -from paddlenlp.datasets import load_dataset from paddlenlp.data import Stack, Tuple, Pad, Dict -from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer -from paddlenlp.transformers import AutoModelForTokenClassification -from paddlenlp.transformers import AutoModelForQuestionAnswering -from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger +from trainer_base import TrainerBase -class BaseTrainer(object): - def create_dataloader(self, - dataset, - mode='train', - batch_size=16, - batchify_fn=None, - trans_fn=None, - batched=False): - if trans_fn: - dataset = dataset.map(trans_fn, batched=batched) - - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) - - def prepare_train_config(self): - if self.args.max_steps > 0: - self.args.num_training_steps = self.args.max_steps - self.args.num_train_epochs = math.ceil( - self.args.num_training_steps / len(self.train_dl)) - else: - self.args.num_training_steps = len( - self.train_dl) * self.args.num_train_epochs - self.args.num_train_epochs = self.args.num_train_epochs - - if self.args.num_training_steps // self.args.valid_steps < 20: - exp_step = self.args.num_training_steps / 20 - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Set eval step to %d" % exp_step) - self.args.valid_steps = exp_step - - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion - - self.lr_scheduler = LinearDecayWithWarmup( - self.args.learning_rate, self.args.num_training_steps, warmup) - - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. 
- decay_params = [ - p.name for n, p in self.model.named_parameters() - if not any(nd in n for nd in ["bias", "norm"]) - ] - - self.optimizer = paddle.optimizer.AdamW( - learning_rate=self.lr_scheduler, - beta1=0.9, - beta2=0.999, - epsilon=self.args.adam_epsilon, - parameters=self.model.parameters(), - weight_decay=self.args.weight_decay, - apply_decay_param_fun=lambda x: x in decay_params, - grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) - - def print_config(self): - logger.info('{:^40}'.format("Configuration Arguments")) - logger.info('{:20}:{}'.format("paddle commit id", - paddle.version.commit)) - for arg in vars(self.args): - logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) +def convert_example(example, tokenizer, max_seq_length=512, is_test=False): + if "text_b" in example.keys(): + text = example["text_a"] + text_pair = example["text_b"] + else: + text = example["text"] + text_pair = None -def clue_trans_fn(examples, tokenizer, args): - return convert_clue( - examples, - tokenizer=tokenizer, - label_list=args.label_list, - max_seq_length=args.max_seq_length) + encoded_inputs = tokenizer( + text=text, text_pair=text_pair, max_seq_len=max_seq_length) + input_ids = encoded_inputs["input_ids"] + token_type_ids = encoded_inputs["token_type_ids"] + if is_test: + return input_ids, token_type_ids + label = np.array([example["label"]], dtype="int64") + return input_ids, token_type_ids, label -def clue_batchify_fn(tokenizer, args): - batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=tokenizer.pad_token_id), # input - Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment - Stack(dtype="int64" if args.label_list else "float32") # label - ): fn(samples) - return batchify_fn +def seq_trans_fn(example, tokenizer, args): + return convert_example( + example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) def convert_clue(example, @@ -199,6 +112,24 @@ def convert_clue(example, return example['input_ids'], example['token_type_ids'] +def clue_trans_fn(examples, tokenizer, args): + return convert_clue( + examples, + tokenizer=tokenizer, + label_list=args.label_list, + max_seq_length=args.max_seq_length) + + +def clue_batchify_fn(tokenizer, args): + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment + Stack(dtype="int64" if args.label_list else "float32") # label + ): fn(samples) + + return batchify_fn + + @paddle.no_grad() def evaluate(model, criterion, metric, data_loader, mode="dev"): """ @@ -228,37 +159,16 @@ def evaluate(model, criterion, metric, data_loader, mode="dev"): return accu -def create_dataloader(dataset, - mode='train', - batch_size=16, - batched=False, - batchify_fn=None, - trans_fn=None): - if trans_fn: - dataset = dataset.map(trans_fn, batched=False) - - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) - - -class CLUE_TRAINING(BaseTrainer): - def __init__(self, train_ds, dev_ds, model, tokenizer, args): +class ClueTrainer(TrainerBase): + def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, + **kwargs): super().__init__() self.rank = 
paddle.distributed.get_rank() self.train_ds = train_ds self.dev_ds = dev_ds + if "test_ds" in kwargs.keys(): + self.test_ds = kwargs["test_ds"] + self.model = model self.tokenizer = tokenizer self.args = args @@ -277,6 +187,8 @@ def dataloader_inner(self): self.dev_dl = self.create_dataloader( self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + self.test_dl = None + def eval(self): pass @@ -341,7 +253,12 @@ def train(self): else: dev_acc = -1.0 metric.reset() - test_acc = -1 + + if self.test_dl is not None: + test_acc = evaluate(self.model, loss_fct, metric, + self.test_dl, "test") + else: + test_acc = -1.0 metric.reset() logger.info("eval done total : %s s" % @@ -350,10 +267,26 @@ def train(self): best_dev_acc = dev_acc corr_test_acc = test_acc + logger.warning( + "best_dev_acc: {:.6f}, corr_test_acc: {:.6f}".format( + best_dev_acc, corr_test_acc)) + if global_step >= self.args.num_training_steps: - logger.info("best_dev_acc: {:.6f}".format(best_dev_acc)) - logger.info("corr_test_acc: {:.6f}".format(corr_test_acc)) return - logger.info("best_dev_acc: {:.6f}".format(best_dev_acc)) - logger.info("corr_test_acc: {:.6f}".format(corr_test_acc)) + logger.warning("best_dev_acc: {:.6f}, corr_test_acc: {:.6f}".format( + best_dev_acc, corr_test_acc)) + + +class SeqTrainer(ClueTrainer): + def dataloader_inner(self): + trans_fn = partial( + seq_trans_fn, tokenizer=self.tokenizer, args=self.args) + batchify_fn = clue_batchify_fn(self.tokenizer, self.args) + + self.train_dl = self.create_dataloader( + self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) + self.dev_dl = self.create_dataloader( + self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + self.test_dl = self.create_dataloader( + self.test_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py index e69de29bb2d1..bb20a1764026 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
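For reference, the `Tuple(Pad, Pad, Stack)` collate function defined in sequence_classification.py above batches `(input_ids, token_type_ids, label)` tuples as in the following minimal, self-contained sketch (the literal pad value 0 stands in for `tokenizer.pad_token_id` / `pad_token_type_id`):

# Sketch only: how clue_batchify_fn collates a list of samples.
from paddlenlp.data import Pad, Stack, Tuple

samples = [
    ([1, 5, 7, 2], [0, 0, 0, 0], [1]),  # (input_ids, token_type_ids, label)
    ([1, 9, 2],    [0, 0, 0],    [0]),
]
batchify_fn = Tuple(
    Pad(axis=0, pad_val=0),   # pad input_ids to the longest sequence in the batch
    Pad(axis=0, pad_val=0),   # pad token_type_ids the same way
    Stack(dtype="int64"),     # stack labels into a [batch_size, 1] array
)
input_ids, token_type_ids, labels = batchify_fn(samples)
# input_ids.shape == (2, 4), token_type_ids.shape == (2, 4), labels.shape == (2, 1)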
+ +import argparse +import os +import random +import time +import math +from functools import partial + +import numpy as np +import paddle + +import paddlenlp as ppnlp +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.metrics import ChunkEvaluator +from paddlenlp.datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad, Dict +from paddlenlp.utils.log import logger + +from trainer_base import TrainerBase + + +@paddle.no_grad() +def evaluate(model, loss_fct, metric, data_loader, label_num, mode="valid"): + model.eval() + metric.reset() + avg_loss, precision, recall, f1_score = 0, 0, 0, 0 + for batch in data_loader: + input_ids, token_type_ids, length, labels = batch + logits = model(input_ids, token_type_ids) + loss = loss_fct(logits, labels) + avg_loss = paddle.mean(loss) + preds = logits.argmax(axis=2) + num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( + length, preds, labels) + metric.update(num_infer_chunks.numpy(), + num_label_chunks.numpy(), num_correct_chunks.numpy()) + precision, recall, f1_score = metric.accumulate() + logger.info("%s: eval loss: %f, precision: %f, recall: %f, f1: %f" % + (mode, avg_loss, precision, recall, f1_score)) + model.train() + + return f1_score + + +def tokenize_and_align_labels(example, tokenizer, no_entity_id, + max_seq_len=512): + labels = example['labels'] + example = example['tokens'] + tokenized_input = tokenizer( + example, + return_length=True, + is_split_into_words=True, + max_seq_len=max_seq_len) + + # -2 for [CLS] and [SEP] + if len(tokenized_input['input_ids']) - 2 < len(labels): + labels = labels[:len(tokenized_input['input_ids']) - 2] + tokenized_input['labels'] = [no_entity_id] + labels + [no_entity_id] + tokenized_input['labels'] += [no_entity_id] * ( + len(tokenized_input['input_ids']) - len(tokenized_input['labels'])) + return tokenized_input + + +class NerTrainer(TrainerBase): + def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, + **kwargs): + super().__init__() + self.rank = paddle.distributed.get_rank() + self.train_ds = train_ds + self.dev_ds = dev_ds + if "test_ds" in kwargs.keys(): + self.test_ds = kwargs["test_ds"] + self.model = model + self.tokenizer = tokenizer + self.args = args + self.dataloader_inner() + self.prepare_train_config() + self.print_config() + + def dataloader_inner(self): + label_list = self.train_ds.label_list + label_num = len(label_list) + no_entity_id = label_num - 1 + + trans_fn = partial( + tokenize_and_align_labels, + tokenizer=self.tokenizer, + no_entity_id=no_entity_id, + max_seq_len=self.args.max_seq_length) + + ignore_label = -100 + + batchify_fn = lambda samples, fn=Dict({ + 'input_ids': Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype='int32'), # input + 'token_type_ids': Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype='int32'), # segment + 'seq_len': Stack(dtype='int64'), # seq_len + 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label + }): fn(samples) + + self.train_dl = self.create_dataloader( + self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) + self.dev_dl = self.create_dataloader( + self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + self.test_dl = self.create_dataloader( + self.test_ds, "test", self.args.batch_size, batchify_fn, trans_fn) + + def train(self): + ignore_label = -100 + label_num = len(self.train_ds.label_list) + + loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) + metric = 
ChunkEvaluator(label_list=self.args.label_list) + + global_step = 0 + tic_train = time.time() + best_dev_f1 = -1 + corr_test_f1 = -1 + + for epoch in range(self.args.num_train_epochs): + for step, batch in enumerate(self.train_dl): + global_step += 1 + input_ids, token_type_ids, _, labels = batch + logits = self.model(input_ids, token_type_ids) + loss = loss_fct(logits, labels) + avg_loss = paddle.mean(loss) + + if global_step % self.args.logging_steps == 0: + logger.info( + "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" + % (global_step, self.args.num_training_steps, epoch, + step, avg_loss, + self.args.logging_steps / (time.time() - tic_train))) + tic_train = time.time() + + avg_loss.backward() + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() + + if global_step % self.args.valid_steps == 0 or global_step == self.args.num_training_steps: + if paddle.distributed.get_rank() == 0: + dev_f1 = evaluate(self.model, loss_fct, metric, + self.dev_dl, label_num, "valid") + test_f1 = evaluate(self.model, loss_fct, metric, + self.test_dl, label_num, "test") + if dev_f1 > best_dev_f1: + best_dev_f1 = dev_f1 + corr_test_f1 = test_f1 + logger.warning( + "Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % + (best_dev_f1, corr_test_f1)) + + if global_step >= self.args.num_training_steps: + logger.warning( + "Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % + (best_dev_f1, corr_test_f1)) + return diff --git a/examples/language_model/ernie-1.0/finetune/trainer_base.py b/examples/language_model/ernie-1.0/finetune/trainer_base.py new file mode 100644 index 000000000000..6b1790063aa4 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/trainer_base.py @@ -0,0 +1,372 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
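The label alignment performed by `tokenize_and_align_labels` above reduces to the following simplified, tokenizer-free sketch (`align_labels` is an illustrative name; `no_entity_id` is `label_num - 1`, as set in `NerTrainer.dataloader_inner`):

# Sketch only: word-level labels -> token-level labels for [CLS] ... [SEP].
def align_labels(labels, num_input_ids, no_entity_id):
    # -2 accounts for the [CLS] and [SEP] tokens added by the tokenizer.
    labels = labels[:num_input_ids - 2]
    aligned = [no_entity_id] + labels + [no_entity_id]
    # Right-pad so the labels line up one-to-one with input_ids.
    aligned += [no_entity_id] * (num_input_ids - len(aligned))
    return aligned

# 3 word-level labels, tokenized into 5 ids ([CLS] w1 w2 w3 [SEP]):
assert align_labels([2, 0, 1], num_input_ids=5, no_entity_id=6) == [6, 2, 0, 1, 6]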
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.utils.log import logger +from paddle.io import DataLoader + + +class TrainerBase(object): + """ + """ + + def create_dataloader(self, + dataset, + mode='train', + batch_size=16, + batchify_fn=None, + trans_fn=None, + batched=False): + """ + """ + if trans_fn: + dataset = dataset.map(trans_fn, batched=batched) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + def train(self, *args, **kwargs): + """ + """ + pass + + def eval(self, *args, **kwargs): + """ + """ + pass + + def prepare_train_config(self): + """ + """ + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(self.train_dl)) + + else: + self.args.num_training_steps = len( + self.train_dl) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: + exp_step = self.args.num_training_steps / self.args.minimum_valid_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + self.lr_scheduler = LinearDecayWithWarmup( + self.args.learning_rate, self.args.num_training_steps, warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + self.optimizer = paddle.optimizer.AdamW( + learning_rate=self.lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=self.args.adam_epsilon, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + + def print_config(self): + """ + """ + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", + paddle.version.commit)) + for arg in vars(self.args): + logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) + + +class Trainer: + """ + """ + + def __init__( + self, + model: Union[PreTrainedModel, nn.Layer]=None, + args: TrainingArguments=None, + data_collator: Optional[DataCollator]=None, + train_dataset: Optional[Dataset]=None, + eval_dataset: Optional[Dataset]=None, + tokenizer: Optional[PreTrainedTokenizerBase]=None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]]=None, + optimizers: Tuple[paddle.optim.Optimizer, paddle.optim.lr_scheduler. + LambdaLR]=(None, None), ): + if args is None: + output_dir = "tmp_trainer" + logger.info( + f"No `TrainingArguments` passed, using `output_dir={output_dir}`." 
+ ) + args = TrainingArguments(output_dir=output_dir) + self.args = args + # Seed must be set before instantiating the model when using model + set_seed(self.args.seed) + if model is None: + raise RuntimeError( + "`Trainer` requires either a `model` or `model_init` argument") + + default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding( + tokenizer) + self.data_collator = data_collator if data_collator is not None else default_collator + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.tokenizer = tokenizer + + self.model_wrapped = model + self.model = model + + self.compute_metrics = compute_metrics + self.optimizer, self.lr_scheduler = optimizers + + if args.max_steps > 0: + logger.info( + "max_steps is given, it will override any value given in num_train_epochs" + ) + + if train_dataset is not None and not isinstance( + train_dataset, collections.abc.Sized) and args.max_steps <= 0: + raise ValueError( + "train_dataset does not implement __len__, max_steps has to be specified" + ) + + if args.fp16: + logger.info(f"Using half precision backend") + + def train( + self, + resume_from_checkpoint: Optional[Union[str, bool]]=None, + trial: Union["optuna.Trial", Dict[str, Any]]=None, + ignore_keys_for_eval: Optional[List[str]]=None, + **kwargs, ): + train_dataloader = self.get_train_dataloader() + model = self._wrap_model(self.model_wrapped) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + for epoch in range(epochs_trained, num_train_epochs): + step = -1 + for step, inputs in enumerate(epoch_iterator): + tr_loss_step = self.training_step(model, inputs) + self.scaler.step(self.optimizer) + self.scaler.update() + self.optimizer.step() + + self.lr_scheduler.step() + model.zero_grad() + + def training_step( + self, model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + model.train() + inputs = self._prepare_inputs(inputs) + + with self.autocast_smart_context_manager(): + loss = self.compute_loss(model, inputs) + + loss.backward() + + return loss.detach() + + def get_train_dataloader(self): + pass + + def _get_eval_sampler(self, eval_dataset: Dataset): + pass + + def get_eval_dataloader(self, + eval_dataset: Optional[Dataset]=None) -> DataLoader: + pass + + def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + pass + + def create_optimizer_and_scheduler(self, num_training_steps: int): + pass + + def create_optimizer(self): + pass + + @staticmethod + def get_optimizer_cls_and_kwargs( + args: TrainingArguments) -> Tuple[Any, Any]: + pass + + def create_scheduler(self, + num_training_steps: int, + optimizer: paddle.optim.Optimizer=None): + pass + + def _wrap_model(self, model, training=True): + pass + + def _prepare_input( + self, data: Union[paddle.Tensor, Any]) -> Union[paddle.Tensor, Any]: + pass + + def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]] + ) -> Dict[str, Union[paddle.Tensor, Any]]: + pass + + def autocast_smart_context_manager(self): + pass + + def training_step( + self, model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + pass + + def save_model(self, + output_dir: Optional[str]=None, + _internal_call: bool=False): + pass + + def _save(self, output_dir: Optional[str]=None, state_dict=None): + pass + + def _load_optimizer_and_scheduler(self, checkpoint): + pass + + def evaluate( + self, + eval_dataset: Optional[Dataset]=None, + ignore_keys: Optional[List[str]]=None, + metric_key_prefix: str="eval", ) -> 
Dict[str, float]: + pass + + def predict(self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]]=None, + metric_key_prefix: str="test") -> PredictionOutput: + pass + + def prediction_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]]=None, ) -> Tuple[Optional[ + paddle.Tensor], Optional[paddle.Tensor], Optional[ + paddle.Tensor]]: + pass + + def create_dataloader(self, + dataset, + mode='train', + batch_size=16, + batchify_fn=None, + trans_fn=None, + batched=False): + """ + """ + if trans_fn: + dataset = dataset.map(trans_fn, batched=batched) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + def train(self, *args, **kwargs): + """ + """ + pass + + def eval(self, *args, **kwargs): + """ + """ + pass + + def prepare_train_config(self): + """ + """ + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(self.train_dl)) + + else: + self.args.num_training_steps = len( + self.train_dl) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: + exp_step = self.args.num_training_steps / self.args.minimum_valid_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + self.lr_scheduler = LinearDecayWithWarmup( + self.args.learning_rate, self.args.num_training_steps, warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + self.optimizer = paddle.optimizer.AdamW( + learning_rate=self.lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=self.args.adam_epsilon, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + + def print_config(self): + """ + """ + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", + paddle.version.commit)) + for arg in vars(self.args): + logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) From 68dea62b5dae82e6ea15f6499e639bea3d18a3df Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 18 Mar 2022 19:46:02 +0800 Subject: [PATCH 04/15] init verison for paddlenlp trainer. 
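
For context, the Trainer / TrainingArguments pair fleshed out in this patch is meant to be driven roughly as in the minimal sketch below. The keyword names follow the skeleton from the previous patch and the dataclass fields added here; `model`, `batchify_fn`, `train_ds`, `dev_ds`, `tokenizer` and `compute_metrics` are placeholders, and the concrete constructor signature may differ in the final code.

# Sketch only: intended high-level usage of the new Trainer API.
args = TrainingArguments(
    output_dir="tmp_trainer",
    max_steps=1000,
    learning_rate=5e-5,
    logging_steps=100)
trainer = Trainer(
    model=model,                      # a paddlenlp pretrained model
    args=args,
    data_collator=batchify_fn,        # collate fn producing dict batches
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)  # EvalPrediction -> dict of metrics
trainer.train()
metrics = trainer.evaluate()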
--- .../ernie-1.0/finetune/finetune.py | 1 - .../finetune/sequence_classification.py | 87 +- .../ernie-1.0/finetune/trainer_args.py | 997 +++++++++++++ .../ernie-1.0/finetune/trainer_base.py | 1289 +++++++++++++++-- .../ernie-1.0/finetune/trainer_callback.py | 660 +++++++++ .../ernie-1.0/finetune/trainer_utils.py | 202 +++ paddlenlp/transformers/model_utils.py | 6 + 7 files changed, 3125 insertions(+), 117 deletions(-) create mode 100644 examples/language_model/ernie-1.0/finetune/trainer_args.py create mode 100644 examples/language_model/ernie-1.0/finetune/trainer_callback.py create mode 100644 examples/language_model/ernie-1.0/finetune/trainer_utils.py diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index 0a6db96097da..70ae65b32964 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -280,7 +280,6 @@ def do_train(args): tokenizer, args, test_ds=all_ds["test"]) - trainer.train() trainer.eval() diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index 324399299eee..cdbf887cc1e0 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -23,10 +23,10 @@ import numpy as np import paddlenlp -from paddlenlp.data import Stack, Tuple, Pad, Dict +from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.utils.log import logger -from trainer_base import TrainerBase +from trainer_base import TrainerBase, Trainer def convert_example(example, tokenizer, max_seq_length=512, is_test=False): @@ -46,7 +46,12 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False): if is_test: return input_ids, token_type_ids label = np.array([example["label"]], dtype="int64") - return input_ids, token_type_ids, label + # return input_ids, token_type_ids, label + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "labels": label + } def seq_trans_fn(example, tokenizer, args): @@ -130,6 +135,39 @@ def clue_batchify_fn(tokenizer, args): return batchify_fn +class Dict(object): + def __init__(self, fn): + assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \ + 'Received fn=%s' % (str(fn)) + + self._fn = fn + + for col_name, ele_fn in self._fn.items(): + assert callable( + ele_fn + ), 'Batchify functions must be callable! 
type(fn[%d]) = %s' % ( + col_name, str(type(ele_fn))) + + def __call__(self, data): + + ret = {} + for col_name, ele_fn in self._fn.items(): + result = ele_fn([ele[col_name] for ele in data]) + ret[col_name] = result + + return ret + + +def clue_batchify_fn_dict(tokenizer, args): + batchify_fn = lambda samples, fn=Dict({ + 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input + "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment + "labels": Stack(dtype="int64" if args.label_list else "float32") # label + }): fn(samples) + + return batchify_fn + + @paddle.no_grad() def evaluate(model, criterion, metric, data_loader, mode="dev"): """ @@ -278,7 +316,7 @@ def train(self): best_dev_acc, corr_test_acc)) -class SeqTrainer(ClueTrainer): +class SeqTrainer2(ClueTrainer): def dataloader_inner(self): trans_fn = partial( seq_trans_fn, tokenizer=self.tokenizer, args=self.args) @@ -290,3 +328,44 @@ def dataloader_inner(self): self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) self.test_dl = self.create_dataloader( self.test_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + + +class SeqTrainer(Trainer): + def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, + **kwargs): + + trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=args) + batchify_fn = clue_batchify_fn_dict(tokenizer, args) + + train_ds = train_ds.map(trans_fn) + dev_ds = dev_ds.map(trans_fn) + + loss_fct = paddle.nn.loss.CrossEntropyLoss( + ) if train_ds.label_list else paddle.nn.loss.MSELoss() + + def compute_metrics(p): + preds = p.predictions[0] if isinstance(p.predictions, + tuple) else p.predictions + probs = F.softmax(preds, axis=1) + metric = Accuracy() + metric.reset() + result = metric.compute(preds, p.label_ids) + metric.update(result) + accu = metric.accumulate() + metric.reset() + return {"eval_accuracy": accu} + + # return { + # "accuracy": (preds == p.label_ids).astype(np.float32).mean() + # .item() + # } + + super().__init__( + model, + loss_fct, + args, + batchify_fn, + train_ds, + dev_ds, + tokenizer, + compute_metrics=compute_metrics) diff --git a/examples/language_model/ernie-1.0/finetune/trainer_args.py b/examples/language_model/ernie-1.0/finetune/trainer_args.py new file mode 100644 index 000000000000..e0ad30e6361a --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/trainer_args.py @@ -0,0 +1,997 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import json +import math +import os +import warnings +from dataclasses import asdict, dataclass, field +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Optional + +from utils import logging + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. 
+ """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) + + +class IntervalStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class EvaluationStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +# logger = logging.get_logger(__name__) +log_levels = logging.get_log_levels_dict().copy() +trainer_log_levels = dict(**log_levels, passive=-1) + + +def default_logdir() -> str: + """ + Same default as PyTorch + """ + import socket + from datetime import datetime + + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + return os.path.join("runs", current_time + "_" + socket.gethostname()) + + +@dataclass +class TrainingArguments: + """ + TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop + itself**. + + Using [`HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + output_dir (`str`): + The output directory where the model predictions and checkpoints will be written. + overwrite_output_dir (`bool`, *optional*, defaults to `False`): + If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir` + points to a checkpoint directory. + do_train (`bool`, *optional*, defaults to `False`): + Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used + by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details. + do_eval (`bool`, *optional*): + Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is + different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your + training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details. + do_predict (`bool`, *optional*, defaults to `False`): + Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's + intended to be used by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details. + evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + The evaluation strategy to adopt during training. Possible values are: + + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `eval_steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. + + prediction_loss_only (`bool`, *optional*, defaults to `False`): + When performing evaluation and generating predictions, only returns the loss. + per_device_train_batch_size (`int`, *optional*, defaults to 8): + The batch size per GPU/TPU core/CPU for training. + per_device_eval_batch_size (`int`, *optional*, defaults to 8): + The batch size per GPU/TPU core/CPU for evaluation. + gradient_accumulation_steps (`int`, *optional*, defaults to 1): + Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + + + When using gradient accumulation, one step is counted as one step with backward pass. 
Therefore, logging, + evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples. + + + + eval_accumulation_steps (`int`, *optional*): + Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If + left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but + requires more memory). + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate for [`AdamW`] optimizer. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`] + optimizer. + adam_beta1 (`float`, *optional*, defaults to 0.9): + The beta1 hyperparameter for the [`AdamW`] optimizer. + adam_beta2 (`float`, *optional*, defaults to 0.999): + The beta2 hyperparameter for the [`AdamW`] optimizer. + adam_epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the [`AdamW`] optimizer. + max_grad_norm (`float`, *optional*, defaults to 1.0): + Maximum gradient norm (for gradient clipping). + num_train_epochs(`float`, *optional*, defaults to 3.0): + Total number of training epochs to perform (if not an integer, will perform the decimal part percents of + the last epoch before stopping training). + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + In case of using a finite iterable dataset the training may stop before reaching the set number of steps + when all data is exhausted + lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`): + The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values. + warmup_ratio (`float`, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + warmup_steps (`int`, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`. + log_level (`str`, *optional*, defaults to `passive`): + Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the + application set the level. + log_level_replica (`str`, *optional*, defaults to `passive`): + Logger log level to use on replicas. Same choices as `log_level`" + log_on_each_node (`bool`, *optional*, defaults to `True`): + In multinode distributed training, whether to log using `log_level` once per node, or only on the main + node. + logging_dir (`str`, *optional*): + [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to + *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***. + logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The logging strategy to adopt during training. Possible values are: + + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. + + logging_first_step (`bool`, *optional*, defaults to `False`): + Whether to log and evaluate the first `global_step` or not. + logging_steps (`int`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. 
+ logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): + Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` + or `inf` is filtered and the average loss of the current logging window is taken instead. + + + + `logging_nan_inf_filter` only influences the logging of loss values, it does not change the behavior the + gradient is computed or applied to the model. + + + + save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The checkpoint save strategy to adopt during training. Possible values are: + + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + save_steps (`int`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_total_limit (`int`, *optional*): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + `output_dir`. + save_on_each_node (`bool`, *optional*, defaults to `False`): + When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on + the main one. + + This should not be activated when the different nodes use the same storage as the files will be saved with + the same names for each node. + no_cuda (`bool`, *optional*, defaults to `False`): + Whether to not use CUDA even when it is available or not. + seed (`int`, *optional*, defaults to 42): + Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the + [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. + bf16 (`bool`, *optional*, defaults to `False`): + Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher + NVIDIA architecture. This is an experimental API and it may change. + fp16 (`bool`, *optional*, defaults to `False`): + Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. + fp16_opt_level (`str`, *optional*, defaults to 'O1'): + For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on + the [Apex documentation](https://nvidia.github.io/apex/amp). + fp16_backend (`str`, *optional*, defaults to `"auto"`): + This argument is deprecated. Use `half_precision_backend` instead. + half_precision_backend (`str`, *optional*, defaults to `"auto"`): + The backend to use for mixed precision training. Must be one of `"auto"`, `"amp"` or `"apex"`. `"auto"` + will use AMP or APEX depending on the PyTorch version detected, while the other choices will force the + requested backend. + bf16_full_eval (`bool`, *optional*, defaults to `False`): + Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm + metric values. This is an experimental API and it may change. + fp16_full_eval (`bool`, *optional*, defaults to `False`): + Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm + metric values. + tf32 (`bool`, *optional*): + Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API + and it may change. + local_rank (`int`, *optional*, defaults to -1): + Rank of the process during distributed training. + xpu_backend (`str`, *optional*): + The backend to use for xpu distributed training. 
Must be one of `"mpi"` or `"ccl"`. + tpu_num_cores (`int`, *optional*): + When training on TPU, the number of TPU cores (automatically passed by launcher script). + dataloader_drop_last (`bool`, *optional*, defaults to `False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) + or not. + eval_steps (`int`, *optional*): + Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same + value as `logging_steps` if not set. + dataloader_num_workers (`int`, *optional*, defaults to 0): + Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the + main process. + past_index (`int`, *optional*, defaults to -1): + Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can make use of + the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will + use the corresponding output (usually index 2) as the past state and feed it to the model at the next + training step under the keyword argument `mems`. + run_name (`str`, *optional*): + A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) and + [mlflow](https://www.mlflow.org/) logging. + disable_tqdm (`bool`, *optional*): + Whether or not to disable the tqdm progress bars and table of metrics produced by + [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is + set to warn or lower (default), `False` otherwise. + remove_unused_columns (`bool`, *optional*, defaults to `True`): + If using `datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the + model forward method. + + (Note that this behavior is not implemented for [`TFTrainer`] yet.) + label_names (`List[str]`, *optional*): + The list of keys in your dictionary of inputs that correspond to the labels. + + Will eventually default to `["labels"]` except if the model used is one of the `XxxForQuestionAnswering` in + which case it will default to `["start_positions", "end_positions"]`. + load_best_model_at_end (`bool`, *optional*, defaults to `False`): + Whether or not to load the best model found during training at the end of training. + + + + When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in the case + it is "steps", `save_steps` must be a round multiple of `eval_steps`. + + + + metric_for_best_model (`str`, *optional*): + Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different + models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. Will + default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss). + + If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if + your metric is better when lower. + greater_is_better (`bool`, *optional*): + Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better models + should have a greater metric or not. Will default to: + + - `True` if `metric_for_best_model` is set to a value that isn't `"loss"` or `"eval_loss"`. + - `False` if `metric_for_best_model` is not set, or set to `"loss"` or `"eval_loss"`. 
+ ignore_data_skip (`bool`, *optional*, defaults to `False`): + When resuming training, whether or not to skip the epochs and batches to get the data loading at the same + stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step + can take a long time) but will not yield the same results as the interrupted training would have. + sharded_ddp (`bool`, `str` or list of [`~trainer_utils.ShardedDDPOption`], *optional*, defaults to `False`): + Use Sharded DDP training from [FairScale](https://github.com/facebookresearch/fairscale) (in distributed + training only). This is an experimental feature. + + A list of options along the following: + + - `"simple"`: to use first instance of sharded DDP released by fairscale (`ShardedDDP`) similar to ZeRO-2. + - `"zero_dp_2"`: to use the second instance of sharded DPP released by fairscale (`FullyShardedDDP`) in + Zero-2 mode (with `reshard_after_forward=False`). + - `"zero_dp_3"`: to use the second instance of sharded DPP released by fairscale (`FullyShardedDDP`) in + Zero-3 mode (with `reshard_after_forward=True`). + - `"offload"`: to add ZeRO-offload (only compatible with `"zero_dp_2"` and `"zero_dp_3"`). + + If a string is passed, it will be split on space. If a bool is passed, it will be converted to an empty + list for `False` and `["simple"]` for `True`. + deepspeed (`str` or `dict`, *optional*): + Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may + evolve in the future. The value is either the location of DeepSpeed json config file (e.g., + `ds_config.json`) or an already loaded json file as a `dict`" + label_smoothing_factor (`float`, *optional*, defaults to 0.0): + The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded + labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor + + label_smoothing_factor/num_labels` respectively. + debug (`str` or list of [`~debug_utils.DebugOption`], *optional*, defaults to `""`): + Enable one or more debug features. This is an experimental feature. + + Possible options are: + + - `"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that led to + the event + - `"tpu_metrics_debug"`: print debug metrics on TPU + + The options should be separated by whitespaces. + optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`): + The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor. + adafactor (`bool`, *optional*, defaults to `False`): + This argument is deprecated. Use `--optim adafactor` instead. + group_by_length (`bool`, *optional*, defaults to `False`): + Whether or not to group together samples of roughly the same length in the training dataset (to minimize + padding applied and be more efficient). Only useful if applying dynamic padding. + length_column_name (`str`, *optional*, defaults to `"length"`): + Column name for precomputed lengths. If the column exists, grouping by length will use these values rather + than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an + instance of `Dataset`. + report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): + The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, + `"comet_ml"`, `"mlflow"`, `"tensorboard"` and `"wandb"`. 
Use `"all"` to report to all integrations + installed, `"none"` for no integrations. + ddp_find_unused_parameters (`bool`, *optional*): + When using distributed training, the value of the flag `find_unused_parameters` passed to + `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. + ddp_bucket_cap_mb (`int`, *optional*): + When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. + dataloader_pin_memory (`bool`, *optional*, defaults to `True`): + Whether you want to pin memory in data loaders or not. Will default to `True`. + skip_memory_metrics (`bool`, *optional*, defaults to `True`): + Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows + down the training and evaluation speed. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push the model to the Hub every time the model is saved. If this is activated, + `output_dir` will begin a git directory synced with the the repo (determined by `hub_model_id`) and the + content will be pushed each time a save is triggered (depending on your `save_strategy`). Calling + [`~Trainer.save_model`] will also trigger a push. + + + + If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be + pushed. + + + + resume_from_checkpoint (`str`, *optional*): + The path to a folder with a valid checkpoint for your model. This argument is not directly used by + [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details. + hub_model_id (`str`, *optional*): + The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in + which case the model will be pushed in your namespace. Otherwise it should be the whole repository name, + for instance `"user_name/model"`, which allows you to push to an organization you are a member of with + `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the + name of `output_dir`. + + Will default to to the name of `output_dir`. + hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): + Defines the scope of what is pushed to the Hub and when. Possible values are: + + - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a + draft of a model card when the [`~Trainer.save_model`] method is called. + - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and + a draft of a model card each time there is a model save. The pushes are asynchronous to not block + training, and in case the save are very frequent, a new push is only attempted if the previous one is + finished. A last push is made with the final model at the end of training. + - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder named + last-checkpoint, allowing you to resume training easily with + `trainer.train(resume_from_checkpoint="last-checkpoint")`. + - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the output + folder (so you will get one checkpoint folder per folder in your final repository) + + hub_token (`str`, *optional*): + The token to use to push the model to the Hub. 
Will default to the token in the cache folder obtained with + `huggingface-cli login`. + gradient_checkpointing (`bool`, *optional*, defaults to `False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + """ + + output_dir: str = field( + metadata={ + "help": + "The output directory where the model predictions and checkpoints will be written." + }, ) + overwrite_output_dir: bool = field( + default=False, + metadata={ + "help": + ("Overwrite the content of the output directory. " + "Use this to continue training if output_dir points to a checkpoint directory." + ) + }, ) + + do_train: bool = field( + default=False, metadata={"help": "Whether to run training."}) + do_eval: bool = field( + default=False, + metadata={"help": "Whether to run eval on the dev set."}) + do_predict: bool = field( + default=False, + metadata={"help": "Whether to run predictions on the test set."}) + evaluation_strategy: IntervalStrategy = field( + default="no", + metadata={"help": "The evaluation strategy to use."}, ) + prediction_loss_only: bool = field( + default=False, + metadata={ + "help": + "When performing evaluation and predictions, only returns the loss." + }, ) + + per_device_train_batch_size: int = field( + default=8, + metadata={"help": "Batch size per GPU/TPU core/CPU for training."}) + per_device_eval_batch_size: int = field( + default=8, + metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}) + + per_gpu_train_batch_size: Optional[int] = field( + default=None, + metadata={ + "help": + "Deprecated, the use of `--per_device_train_batch_size` is preferred. " + "Batch size per GPU/TPU core/CPU for training." + }, ) + per_gpu_eval_batch_size: Optional[int] = field( + default=None, + metadata={ + "help": + "Deprecated, the use of `--per_device_eval_batch_size` is preferred. " + "Batch size per GPU/TPU core/CPU for evaluation." + }, ) + + gradient_accumulation_steps: int = field( + default=1, + metadata={ + "help": + "Number of updates steps to accumulate before performing a backward/update pass." + }, ) + eval_accumulation_steps: Optional[int] = field( + default=None, + metadata={ + "help": + "Number of predictions steps to accumulate before moving the tensors to the CPU." + }, ) + + learning_rate: float = field( + default=5e-5, + metadata={"help": "The initial learning rate for AdamW."}) + weight_decay: float = field( + default=0.0, + metadata={"help": "Weight decay for AdamW if we apply some."}) + adam_beta1: float = field( + default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) + adam_beta2: float = field( + default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) + adam_epsilon: float = field( + default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) + max_grad_norm: float = field( + default=1.0, metadata={"help": "Max gradient norm."}) + + num_train_epochs: float = field( + default=3.0, + metadata={"help": "Total number of training epochs to perform."}) + max_steps: int = field( + default=-1, + metadata={ + "help": + "If > 0: set total number of training steps to perform. Override num_train_epochs." + }, ) + lr_scheduler_type: str = field( + default="linear", + metadata={"help": "The scheduler type to use."}, ) + warmup_ratio: float = field( + default=0.0, + metadata={ + "help": "Linear warmup over warmup_ratio fraction of total steps." 
+ }) + warmup_steps: int = field( + default=0, metadata={"help": "Linear warmup over warmup_steps."}) + + log_level: Optional[str] = field( + default="passive", + metadata={ + "help": + "Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults to 'passive'.", + "choices": trainer_log_levels.keys(), + }, ) + log_level_replica: Optional[str] = field( + default="passive", + metadata={ + "help": + "Logger log level to use on replica nodes. Same choices and defaults as ``log_level``", + "choices": trainer_log_levels.keys(), + }, ) + log_on_each_node: bool = field( + default=True, + metadata={ + "help": + "When doing a multinode distributed training, whether to log once per node or just once on the main node." + }, ) + logging_dir: Optional[str] = field( + default=None, metadata={"help": "Tensorboard log dir."}) + logging_strategy: IntervalStrategy = field( + default="steps", + metadata={"help": "The logging strategy to use."}, ) + logging_first_step: bool = field( + default=False, metadata={"help": "Log the first global_step"}) + logging_steps: int = field( + default=500, metadata={"help": "Log every X updates steps."}) + + save_strategy: IntervalStrategy = field( + default="steps", + metadata={"help": "The checkpoint save strategy to use."}, ) + save_steps: int = field( + default=500, + metadata={"help": "Save checkpoint every X updates steps."}) + save_total_limit: Optional[int] = field( + default=None, + metadata={ + "help": + ("Limit the total amount of checkpoints. " + "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" + ) + }, ) + save_on_each_node: bool = field( + default=False, + metadata={ + "help": + "When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one" + }, ) + no_cuda: bool = field( + default=False, + metadata={"help": "Do not use CUDA even when it is available"}) + seed: int = field( + default=42, + metadata={ + "help": "Random seed that will be set at the beginning of training." + }) + + fp16: bool = field( + default=False, + metadata={ + "help": "Whether to use fp16 (mixed) precision instead of 32-bit" + }, ) + fp16_opt_level: str = field( + default="O1", + metadata={ + "help": + ("For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. " + "See details at https://nvidia.github.io/apex/amp.html") + }, ) + local_rank: int = field( + default=-1, metadata={"help": "For distributed training: local_rank"}) + + debug: str = field( + default="", + metadata={ + "help": "Whether or not to enable debug mode. Current options: " + "`underflow_overflow` (Detect underflow and overflow in activations and weights), " + "`tpu_metrics_debug` (print debug metrics on TPU)." + }, ) + + dataloader_drop_last: bool = field( + default=False, + metadata={ + "help": + "Drop the last incomplete batch if it is not divisible by the batch size." + }) + eval_steps: int = field( + default=None, metadata={"help": "Run an evaluation every X steps."}) + dataloader_num_workers: int = field( + default=0, + metadata={ + "help": + "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process." + }, ) + + past_index: int = field( + default=-1, + metadata={ + "help": + "If >=0, uses the corresponding part of the output as the past state for next step." 
+ }, ) + + run_name: Optional[str] = field( + default=None, + metadata={ + "help": + "An optional descriptor for the run. Notably used for wandb logging." + }) + + label_names: Optional[List[str]] = field( + default=None, + metadata={ + "help": + "The list of keys in your dictionary of inputs that correspond to the labels." + }) + + load_best_model_at_end: Optional[bool] = field( + default=False, + metadata={ + "help": + "Whether or not to load the best model found during training at the end of training." + }, ) + metric_for_best_model: Optional[str] = field( + default=None, + metadata={ + "help": "The metric to use to compare two different models." + }) + greater_is_better: Optional[bool] = field( + default=None, + metadata={ + "help": + "Whether the `metric_for_best_model` should be maximized or not." + }) + ignore_data_skip: bool = field( + default=False, + metadata={ + "help": + "When resuming training, whether or not to skip the first epochs and batches to get to the same training data." + }, ) + optim: str = field( + default="adamw", + metadata={"help": "The optimizer to use."}, ) + report_to: Optional[List[str]] = field( + default=None, + metadata={ + "help": + "The list of integrations to report the results and logs to." + }) + + dataloader_pin_memory: bool = field( + default=True, + metadata={"help": "Whether or not to pin memory for DataLoader."}) + skip_memory_metrics: bool = field( + default=True, + metadata={ + "help": + "Whether or not to skip adding of memory profiler reports to metrics." + }) + + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={ + "help": + "The path to a folder with a valid checkpoint for your model." + }, ) + + gradient_checkpointing: bool = field( + default=False, + metadata={ + "help": + "If True, use gradient checkpointing to save memory at the expense of slower backward pass." + }, ) + _n_gpu: int = field(init=False, repr=False, default=-1) + + def __post_init__(self): + # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then). + # This needs to happen before any call to self.device or self.n_gpu. + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != self.local_rank: + self.local_rank = env_local_rank + + # convert to int + self.log_level = trainer_log_levels[self.log_level] + self.log_level_replica = trainer_log_levels[self.log_level_replica] + + # expand paths, if not os.makedirs("~/bar") will make directory + # in the current directory instead of the actual home + # see https://github.com/huggingface/transformers/issues/10628 + if self.output_dir is not None: + self.output_dir = os.path.expanduser(self.output_dir) + if self.logging_dir is None and self.output_dir is not None: + self.logging_dir = os.path.join(self.output_dir, default_logdir()) + if self.logging_dir is not None: + self.logging_dir = os.path.expanduser(self.logging_dir) + + if self.disable_tqdm is None: + self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN + + if isinstance(self.evaluation_strategy, EvaluationStrategy): + warnings.warn( + "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `IntervalStrategy` instead", + FutureWarning, ) + # Go back to the underlying string or we won't be able to instantiate `IntervalStrategy` on it. 
+ self.evaluation_strategy = self.evaluation_strategy.value + + self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) + self.logging_strategy = IntervalStrategy(self.logging_strategy) + self.save_strategy = IntervalStrategy(self.save_strategy) + + self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) + if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO: + self.do_eval = True + + # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero + if self.evaluation_strategy == IntervalStrategy.STEPS and ( + self.eval_steps is None or self.eval_steps == 0): + if self.logging_steps > 0: + logger.info( + f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}" + ) + self.eval_steps = self.logging_steps + else: + raise ValueError( + f"evaluation strategy {self.evaluation_strategy} requires either non-zero --eval_steps or --logging_steps" + ) + + # logging_steps must be non-zero for logging_strategy that is other than 'no' + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: + raise ValueError( + f"logging strategy {self.logging_strategy} requires non-zero --logging_steps" + ) + + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. + if self.load_best_model_at_end: + if self.evaluation_strategy != self.save_strategy: + raise ValueError( + "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " + f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" + ) + if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." + ) + + if self.load_best_model_at_end and self.metric_for_best_model is None: + self.metric_for_best_model = "loss" + if self.greater_is_better is None and self.metric_for_best_model is not None: + self.greater_is_better = self.metric_for_best_model not in [ + "loss", "eval_loss" + ] + if self.run_name is None: + self.run_name = self.output_dir + + self.optim = OptimizerNames(self.optim) + + if self.warmup_ratio < 0 or self.warmup_ratio > 1: + raise ValueError("warmup_ratio must lie in range [0,1]") + elif self.warmup_ratio > 0 and self.warmup_steps > 0: + logger.info( + "Both warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio during training" + ) + + if isinstance(self.debug, str): + self.debug = [DebugOption(s) for s in self.debug.split()] + + def __str__(self): + self_as_dict = asdict(self) + + # Remove deprecated arguments. That code should be removed once + # those deprecated arguments are removed from TrainingArguments. (TODO: v5) + del self_as_dict["per_gpu_train_batch_size"] + del self_as_dict["per_gpu_eval_batch_size"] + + self_as_dict = { + k: f"<{k.upper()}>" if k.endswith("_token") else v + for k, v in self_as_dict.items() + } + + attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] + return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" + + __repr__ = __str__ + + @property + def train_batch_size(self) -> int: + """ + The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training). 
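+
+        For example (a sketch of the intended arithmetic, assuming the deprecated
+        `per_gpu_train_batch_size` is not set): with `per_device_train_batch_size=8`
+        and `n_gpu=2`, the effective training batch size is 8 * max(1, 2) = 16.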
+ """ + if self.per_gpu_train_batch_size: + logger.warning( + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " + "version. Using `--per_device_train_batch_size` is preferred.") + per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size + train_batch_size = per_device_batch_size * max(1, self.n_gpu) + return train_batch_size + + @property + def eval_batch_size(self) -> int: + """ + The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training). + """ + if self.per_gpu_eval_batch_size: + logger.warning( + "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " + "version. Using `--per_device_eval_batch_size` is preferred.") + per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size + eval_batch_size = per_device_batch_size * max(1, self.n_gpu) + return eval_batch_size + + @property + def n_gpu(self): + """ + The number of GPUs used by this process. + + Note: + This will only be greater than one when you have multiple GPUs available but are not using distributed + training. For distributed training, it will always be 1. + """ + # Make sure `self._n_gpu` is properly setup. + # _ = self._setup_devices + return self._n_gpu + + @property + def world_size(self): + """ + The number of processes used in parallel. + """ + if self.local_rank != -1: + return torch.distributed.get_world_size() + return 1 + + @property + def process_index(self): + """ + The index of the current process used. + """ + if self.local_rank != -1: + return torch.distributed.get_rank() + return 0 + + @property + def local_process_index(self): + """ + The index of the local process used. + """ + if self.local_rank != -1: + return self.local_rank + return 0 + + @property + def should_log(self): + """ + Whether or not the current process should produce log. + """ + if self.log_on_each_node: + return self.local_process_index == 0 + else: + return self.process_index == 0 + + @property + def should_save(self): + """ + Whether or not the current process should write to disk, e.g., to save models and checkpoints. + """ + if self.save_on_each_node: + return self.local_process_index == 0 + else: + if is_sagemaker_mp_enabled(): + return smp.rank() == 0 + else: + return self.process_index == 0 + + def get_process_log_level(self): + """ + Returns the log level to be used depending on whether this process is the main process of node 0, main process + of node non-0, or a non-main process. + + For the main process the log level defaults to `logging.INFO` unless overridden by `log_level` argument. + + For the replica processes the log level defaults to `logging.WARNING` unless overridden by `log_level_replica` + argument. + + The choice between the main and replica process settings is made according to the return value of `should_log`. + """ + + log_level_main_node = logging.INFO if self.log_level == -1 else self.log_level + log_level_replica_node = logging.WARNING if self.log_level_replica == -1 else self.log_level_replica + return log_level_main_node if self.should_log else log_level_replica_node + + @contextlib.contextmanager + def main_process_first(self, local=True, desc="work"): + """ + A context manager for torch distributed environment where on needs to do something on the main process, while + blocking replicas, and when it's finished releasing the replicas. 
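+
+        A typical call looks roughly like this (a usage sketch; `trainer` and the
+        preprocessing function are assumed, not defined in this file):
+
+            with trainer.args.main_process_first(desc="dataset map pre-processing"):
+                train_dataset = train_dataset.map(preprocess_fn)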
+ + One such use is for `datasets`'s `map` feature which to be efficient should be run once on the main process, + which upon completion saves a cached version of results and which then automatically gets loaded by the + replicas. + + Args: + local (`bool`, *optional*, defaults to `True`): + if `True` first means process of rank 0 of each node if `False` first means process of rank 0 of node + rank 0 In multi-node environment with a shared filesystem you most likely will want to use + `local=False` so that only the main process of the first node will do the processing. If however, the + filesystem is not shared, then the main process of each node will need to do the processing, which is + the default behavior. + desc (`str`, *optional*, defaults to `"work"`): + a work description to be used in debug logs + + """ + if is_torch_available() and self.world_size > 1: + if local: + is_main_process = self.local_process_index == 0 + main_process_desc = "main local process" + else: + is_main_process = self.process_index == 0 + main_process_desc = "main process" + + try: + if not is_main_process: + # tell all replicas to wait + logger.debug( + f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}" + ) + if is_torch_tpu_available(): + xm.rendezvous(desc) + elif is_sagemaker_dp_enabled(): + sm_dist.barrier() + else: + torch.distributed.barrier() + yield + finally: + if is_main_process: + # the wait is over + logger.debug( + f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas" + ) + if is_torch_tpu_available(): + xm.rendezvous(desc) + elif is_sagemaker_dp_enabled(): + sm_dist.barrier() + else: + torch.distributed.barrier() + else: + yield + + def get_warmup_steps(self, num_training_steps: int): + """ + Get number of steps used for a linear warmup. + """ + warmup_steps = (self.warmup_steps if self.warmup_steps > 0 else + math.ceil(num_training_steps * self.warmup_ratio)) + return warmup_steps + + def to_dict(self): + """ + Serializes this instance while replace `Enum` by their values (for JSON serialization support). It obfuscates + the token values by removing their value. + """ + d = asdict(self) + for k, v in d.items(): + if isinstance(v, Enum): + d[k] = v.value + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum): + d[k] = [x.value for x in v] + if k.endswith("_token"): + d[k] = f"<{k.upper()}>" + return d + + def to_json_string(self): + """ + Serializes this instance to a JSON string. + """ + return json.dumps(self.to_dict(), indent=2) + + def to_sanitized_dict(self) -> Dict[str, Any]: + """ + Sanitized serialization to use with TensorBoard’s hparams + """ + d = self.to_dict() + d = { + ** d, ** { + "train_batch_size": self.train_batch_size, + "eval_batch_size": self.eval_batch_size + } + } + + valid_types = [bool, int, float, str] + if is_torch_available(): + valid_types.append(torch.Tensor) + + return { + k: v if type(v) in valid_types else str(v) + for k, v in d.items() + } diff --git a/examples/language_model/ernie-1.0/finetune/trainer_base.py b/examples/language_model/ernie-1.0/finetune/trainer_base.py index 6b1790063aa4..914481545458 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_base.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_base.py @@ -12,128 +12,258 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import collections +import contextlib +import inspect +import math +import os +import random +import re +import shutil +import sys +import time +import warnings +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union + +from tqdm.auto import tqdm + import paddle import paddle.nn as nn import paddle.nn.functional as F from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger -from paddle.io import DataLoader +from paddle.io import DataLoader, DistributedBatchSampler +import numpy as np -class TrainerBase(object): - """ - """ +from trainer_args import TrainingArguments +# from trainer_callback import TrainerState, TrainerControl - def create_dataloader(self, - dataset, - mode='train', - batch_size=16, - batchify_fn=None, - trans_fn=None, - batched=False): - """ - """ - if trans_fn: - dataset = dataset.map(trans_fn, batched=batched) +from trainer_utils import ( + IntervalStrategy, + EvaluationStrategy, + EvalPrediction, + PredictionOutput, + EvalLoopOutput, + speed_metrics, ) - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) +from trainer_callback import ( + CallbackHandler, + DefaultFlowCallback, + PrinterCallback, + ProgressCallback, + TrainerCallback, + TrainerControl, + TrainerState, ) - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) +DEFAULT_CALLBACKS = [DefaultFlowCallback] - def train(self, *args, **kwargs): - """ - """ - pass +from utils import logging - def eval(self, *args, **kwargs): - """ - """ - pass +from paddlenlp.transformers.model_utils import PretrainedModel, unwrap_model +from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer - def prepare_train_config(self): - """ - """ - if self.args.max_steps > 0: - self.args.num_training_steps = self.args.max_steps - self.args.num_train_epochs = math.ceil( - self.args.num_training_steps / len(self.train_dl)) +# logger = logging.get_logger(__name__) - else: - self.args.num_training_steps = len( - self.train_dl) * self.args.num_train_epochs - self.args.num_train_epochs = self.args.num_train_epochs +from paddle.io import Dataset - if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: - exp_step = self.args.num_training_steps / self.args.minimum_valid_times - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Set eval step to %d" % exp_step) - self.args.valid_steps = exp_step - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion +class DataCollator: + pass - self.lr_scheduler = LinearDecayWithWarmup( - self.args.learning_rate, self.args.num_training_steps, warmup) - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. 
-        decay_params = [
-            p.name for n, p in self.model.named_parameters()
-            if not any(nd in n for nd in ["bias", "norm"])
-        ]
+class DataCollatorWithPadding:
+    def __init__(self, *args, **kwargs):
+        pass
 
-        self.optimizer = paddle.optimizer.AdamW(
-            learning_rate=self.lr_scheduler,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=self.args.adam_epsilon,
-            parameters=self.model.parameters(),
-            weight_decay=self.args.weight_decay,
-            apply_decay_param_fun=lambda x: x in decay_params,
-            grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm))
 
-    def print_config(self):
-        """
-        """
-        logger.info('{:^40}'.format("Configuration Arguments"))
-        logger.info('{:20}:{}'.format("paddle commit id",
-                                      paddle.version.commit))
-        for arg in vars(self.args):
-            logger.info('{:20}:{}'.format(arg, getattr(self.args, arg)))
+def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
+    """Concatenates `tensor1` and `tensor2` on the first axis, applying padding on the second if necessary."""
+    if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
+        return paddle.concat((tensor1, tensor2), axis=0)
+
+    # Figure out the padded shape. In dynamic graph mode `Tensor.shape` is a
+    # Python list, so plain list arithmetic is enough here.
+    new_shape = [
+        tensor1.shape[0] + tensor2.shape[0],
+        max(tensor1.shape[1], tensor2.shape[1])
+    ] + tensor1.shape[2:]
+
+    # Fill the result tensor with the padding value, then copy both inputs in.
+    result = paddle.full(new_shape, padding_index, dtype=tensor1.dtype)
+    result[:tensor1.shape[0], :tensor1.shape[1]] = tensor1
+    result[tensor1.shape[0]:, :tensor2.shape[1]] = tensor2
+    return result
+
+
+def nested_concat(tensors, new_tensors, padding_index=-100):
+    """
+    Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or
+    nested list/tuples of tensors.
+    """
+    assert type(tensors) == type(
+        new_tensors
+    ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(nested_concat(
+            t, n, padding_index=padding_index)
+                             for t, n in zip(tensors, new_tensors))
+    elif isinstance(tensors, paddle.Tensor):
+        return paddle_pad_and_concatenate(
+            tensors, new_tensors, padding_index=padding_index)
+    elif isinstance(tensors, np.ndarray):
+        return numpy_pad_and_concatenate(
+            tensors, new_tensors, padding_index=padding_index)
+    else:
+        raise TypeError(
+            f"Unsupported type for concatenation: got {type(tensors)}")
+
+
+def nested_detach(tensors):
+    "Detach `tensors` (even if it's a nested list/tuple of tensors)."
+    if isinstance(tensors, (list, tuple)):
+        return type(tensors)(nested_detach(t) for t in tensors)
+    return tensors.detach()
+
+
+# Name of the files used for checkpointing
+TRAINING_ARGS_NAME = "training_args.bin"
+TRAINER_STATE_NAME = "trainer_state.json"
+OPTIMIZER_NAME = "optimizer.pdparams"
+SCHEDULER_NAME = "scheduler.pdparams"
+SCALER_NAME = "scaler.pdparams"
+
+PREFIX_CHECKPOINT_DIR = "training"
+
+
+def set_seed(seed):
+    # Use the same data seed (for data shuffle) for all procs to guarantee data
+    # consistency after sharding.
+    random.seed(seed)
+    np.random.seed(seed)
+    # Maybe different op seeds (for dropout) for different procs is better. By:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(seed)
 
 
 class Trainer:
     """
+    Trainer is a simple but feature-complete training and eval loop for PaddlePaddle, optimized for PaddleNLP
+    Transformers models.
+
+    Args:
+        model ([`PretrainedModel`] or `paddle.nn.Module`, *optional*):
+            The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.
+ + + + [`Trainer`] is optimized to work with the [`PretrainedModel`] provided by the library. You can still use + your own models defined as `paddle.nn.Module` as long as they work the same way as the 🤗 Transformers + models. + + + + args ([`TrainingArguments`], *optional*): + The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the + `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided. + data_collator (`DataCollator`, *optional*): + The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will + default to [`default_data_collator`] if no `tokenizer` is provided, an instance of + [`DataCollatorWithPadding`] otherwise. + train_dataset (`paddle.utils.data.Dataset` or `paddle.utils.data.IterableDataset`, *optional*): + The dataset to use for training. If it is an `datasets.Dataset`, columns not accepted by the + `model.forward()` method are automatically removed. + + Note that if it's a `paddle.utils.data.IterableDataset` with some randomization and you are training in a + distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a + `paddle.Generator` for the randomization that must be identical on all processes (and the Trainer will + manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally + sets the seed of the RNGs used. + eval_dataset (`paddle.utils.data.Dataset`, *optional*): + The dataset to use for evaluation. If it is an `datasets.Dataset`, columns not accepted by the + `model.forward()` method are automatically removed. + tokenizer ([`PretrainedTokenizer`], *optional*): + The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the + maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an + interrupted training or reuse the fine-tuned model. + compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): + The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return + a dictionary string to metric values. + optimizers (`Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler]`, *optional*): A tuple + containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model + and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + + Important attributes: + + - **model** -- Always points to the core model. If using a transformers model, it will be a [`PretrainedModel`] + subclass. + - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the + original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`, + the inner model is wrapped in `DeepSpeed` and then again in `paddle.nn.DistributedDataParallel`. If the inner + model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`. + - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from + data parallelism, this means some of the model layers are split on different GPUs). + - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set + to `False` if model parallel or deepspeed is used, or if the default + `TrainingArguments.place_model_on_device` is overridden to return `False` . 
+    - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
+      in `train`)
+
+    """
 
     def __init__(
             self,
-            model: Union[PreTrainedModel, nn.Layer]=None,
+            model: Union[PretrainedModel, nn.Layer]=None,
+            criterion: Optional[nn.Layer]=None,
             args: TrainingArguments=None,
             data_collator: Optional[DataCollator]=None,
             train_dataset: Optional[Dataset]=None,
             eval_dataset: Optional[Dataset]=None,
-            tokenizer: Optional[PreTrainedTokenizerBase]=None,
+            tokenizer: Optional[PretrainedTokenizer]=None,
            compute_metrics: Optional[Callable[[EvalPrediction], Dict]]=None,
-            optimizers: Tuple[paddle.optim.Optimizer, paddle.optim.lr_scheduler.
-                              LambdaLR]=(None, None), ):
+            optimizers: Tuple[paddle.optimizer.Optimizer,
+                              paddle.optimizer.lr.LRScheduler]=(None, None), ):
         if args is None:
             output_dir = "tmp_trainer"
             logger.info(
                 f"No `TrainingArguments` passed, using `output_dir={output_dir}`."
             )
             args = TrainingArguments(output_dir=output_dir)
+        args.world_size = 1
+        args.fp16 = args.use_amp
+        args.do_grad_scaling = args.use_amp
+        self.do_grad_scaling = args.do_grad_scaling
+        args.train_batch_size = args.batch_size
+        args.eval_batch_size = args.batch_size
+
+        args.dataloader_drop_last = True
+        args.dataloader_num_workers = 0
+        args.dataloader_pin_memory = True
+        args.n_gpu = 1
+        args.lr_scheduler_type = "linear"
+        args.adam_beta1 = 0.9
+        args.adam_beta2 = 0.999
+        args.optim = "OptimizerNames.ADAMW"
+        args.past_index = -1
+        args.per_device_train_batch_size = args.batch_size
+        args.per_device_eval_batch_size = args.batch_size
+        args.logging_first_step = True
+        args.logging_strategy = IntervalStrategy.STEPS
+        args.evaluation_strategy = IntervalStrategy.STEPS
+        args.save_strategy = IntervalStrategy.STEPS
+        args.eval_steps = 500
+        args.save_steps = 500
+        args.label_names = None
+        args.prediction_loss_only = False
+        args.output_dir = "./out"
+        args.should_save = True
+        args.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))
+        args.save_total_limit = 3
+        args.metric_for_best_model = "accuracy"
+        args.greater_is_better = True
+
         self.args = args
 
         # Seed must be set before instantiating the model when using model
         set_seed(self.args.seed)
@@ -141,8 +271,12 @@ def __init__(
             raise RuntimeError(
                 "`Trainer` requires either a `model` or `model_init` argument")
 
+        if self.args.should_save:
+            os.makedirs(self.args.output_dir, exist_ok=True)
+
         default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(
             tokenizer)
+
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
@@ -150,10 +284,20 @@ def __init__(
         self.model_wrapped = model
         self.model = model
+        self.criterion = criterion
 
         self.compute_metrics = compute_metrics
         self.optimizer, self.lr_scheduler = optimizers
+        self.state = TrainerState()
+        self.control = TrainerControl()
+        callbacks = DEFAULT_CALLBACKS
+        self.callback_handler = CallbackHandler(callbacks, self.model,
+                                                self.tokenizer, self.optimizer,
+                                                self.lr_scheduler)
+
+        self.add_callback(ProgressCallback)
+
         if args.max_steps > 0:
             logger.info(
                 "max_steps is given, it will override any value given in num_train_epochs"
@@ -166,28 +310,129 @@ def __init__(
             )
 
         if args.fp16:
-            logger.info(f"Using half precision backend")
+            logger.info("Using half precision")
+
+        default_label_names = (["start_positions", "end_positions"] if
+                               "QuestionAnswering" in type(self.model).__name__
+                               else ["labels"])
+        self.label_names = 
default_label_names if self.args.label_names is None else self.args.label_names + + def add_callback(self, callback): + """ + Add a callback to the current list of [`~transformer.TrainerCallback`]. + + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will instantiate a member of that class. + """ + self.callback_handler.add_callback(callback) def train( self, resume_from_checkpoint: Optional[Union[str, bool]]=None, - trial: Union["optuna.Trial", Dict[str, Any]]=None, ignore_keys_for_eval: Optional[List[str]]=None, **kwargs, ): + print("training!!!!") + logger.info("training!!!!") train_dataloader = self.get_train_dataloader() model = self._wrap_model(self.model_wrapped) - self.create_optimizer_and_scheduler(num_training_steps=max_steps) - for epoch in range(epochs_trained, num_train_epochs): + self.state = TrainerState() + + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(train_dataloader)) + + else: + self.args.num_training_steps = len( + train_dataloader) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: + exp_step = self.args.num_training_steps / self.args.minimum_valid_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + args = self.args + + self.create_optimizer_and_scheduler( + num_training_steps=args.num_training_steps) + + num_examples = len(self.train_dataset) + total_train_batch_size = self.args.per_device_train_batch_size * paddle.distributed.get_world_size( + ) + + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Num Epochs = {self.args.num_train_epochs}") + logger.info( + f" Instantaneous batch size per device = {self.args.per_device_train_batch_size}" + ) + logger.info( + f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size}" + ) + logger.info(f" Gradient Accumulation steps = {1}") + logger.info( + f" Total optimization steps = {self.args.num_training_steps}") + + self.state.epoch = 0 + self.state.max_steps = int(self.args.num_training_steps) + self.state.num_train_epochs = int(self.args.num_train_epochs) + self.state.is_local_process_zero = 0 + self.state.is_world_process_zero = 0 + + start_time = time.time() + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + self.training_bar = tqdm(total=self.state.max_steps) + + epoch_iterator = train_dataloader + steps_in_epoch = len(epoch_iterator) + + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + + self.control = self.callback_handler.on_train_begin(args, self.state, + self.control) + + tr_loss = paddle.to_tensor(0.0) + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + + for epoch in range(epochs_trained, args.num_train_epochs): step = -1 + + self.control = self.callback_handler.on_epoch_begin( + args, self.state, self.control) + for step, inputs in enumerate(epoch_iterator): + # print(inputs) + # print("=="*20) tr_loss_step = self.training_step(model, inputs) - self.scaler.step(self.optimizer) - self.scaler.update() - self.optimizer.step() + # self.scaler.step(self.optimizer) + # self.scaler.update() + tr_loss += tr_loss_step + self.training_bar.update(1) + self.optimizer.step() self.lr_scheduler.step() - model.zero_grad() + self.optimizer.clear_grad() + + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1) / steps_in_epoch + + self.control = self.callback_handler.on_step_end( + args, self.state, self.control) + + self._maybe_log_save_evaluate(tr_loss, model, epoch, + ignore_keys_for_eval) def training_step( self, model: nn.Layer, @@ -202,71 +447,723 @@ def training_step( return loss.detach() + def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: + if not isinstance(self.train_dataset, collections.abc.Sized): + return None + + if self.args.world_size <= 1: + # return RandomSampler(self.train_dataset) + return DistributedBatchSampler( + self.train_dataset, + # num_replicas=self.args.world_size, + # rank=self.args.process_index, + batch_size=self.args.batch_size, + shuffle=True, + # seed=self.args.seed, + ) + else: + return DistributedBatchSampler( + self.train_dataset, + # num_replicas=self.args.world_size, + # rank=self.args.process_index, + # seed=self.args.seed, + ) + + def _maybe_log_save_evaluate(self, tr_loss, model, epoch, + ignore_keys_for_eval): + if self.control.should_log: + + logs: Dict[str, float] = {} + + # all_gather + mean() to get average loss over all processes + # tr_loss_scalar = self._nested_gather(tr_loss).mean().item() + tr_loss_scalar = tr_loss.mean().item() + + # reset tr_loss to zero + tr_loss -= tr_loss + + logs["loss"] = round(tr_loss_scalar / ( + self.state.global_step - self._globalstep_last_logged), 4) + logs["learning_rate"] = self._get_learning_rate() + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = self.state.global_step + + self.log(logs) + + metrics = None + if self.control.should_evaluate: + metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) + + if self.control.should_save: + self._save_checkpoint(model, metrics=metrics) + self.control = 
self.callback_handler.on_save(self.args, self.state, + self.control) + + def _get_learning_rate(self): + return self.optimizer.get_lr() + def get_train_dataloader(self): - pass + """ + Returns the training [`~paddle.io.DataLoader`]. + + Will use no sampler if `self.train_dataset` does not implement `__len__`, a random sampler (adapted to + distributed training if necessary) otherwise. + + Subclass and override this method if you want to inject some custom behavior. + """ + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + + train_sampler = self._get_train_sampler() + + return DataLoader( + train_dataset, + # batch_size=self.args.train_batch_size, + batch_sampler=train_sampler, + collate_fn=self.data_collator, + # drop_last=self.args.dataloader_drop_last, + num_workers=self.args.dataloader_num_workers, + # pin_memory=self.args.dataloader_pin_memory, + ) def _get_eval_sampler(self, eval_dataset: Dataset): - pass + if self.args.world_size <= 1: + return DistributedBatchSampler( + eval_dataset, + # num_replicas=self.args.world_size, + # rank=self.args.process_index, + batch_size=self.args.per_device_eval_batch_size, + shuffle=False, + # seed=self.args.seed, + ) + else: + return DistributedBatchSampler( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + shuffle=False) def get_eval_dataloader(self, eval_dataset: Optional[Dataset]=None) -> DataLoader: - pass + """ + Returns the evaluation [`~paddle.io.DataLoader`]. + + Subclass and override this method if you want to inject some custom behavior. + + Args: + eval_dataset (`paddle.io.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not accepted by + the `model.forward()` method are automatically removed. It must implement `__len__`. + """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + eval_sampler = self._get_eval_sampler(eval_dataset) + + return DataLoader( + eval_dataset, + # batch_size=self.args.train_batch_size, + batch_sampler=eval_sampler, + collate_fn=self.data_collator, + # drop_last=self.args.dataloader_drop_last, + num_workers=self.args.dataloader_num_workers, + # pin_memory=self.args.dataloader_pin_memory, + ) def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: - pass + """ + Returns the test [`~paddle.io.DataLoader`]. + + Subclass and override this method if you want to inject some custom behavior. + + Args: + test_dataset (`paddle.io.Dataset`, *optional*): + The test dataset to use. If it is an `datasets.Dataset`, columns not accepted by the `model.forward()` + method are automatically removed. It must implement `__len__`. + """ + + test_sampler = self._get_eval_sampler(test_dataset) + + # We use the same batch_size as for eval. + return DataLoader( + test_dataset, + sampler=test_sampler, + batch_size=self.args.eval_batch_size, + collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, + pin_memory=self.args.dataloader_pin_memory, ) def create_optimizer_and_scheduler(self, num_training_steps: int): - pass + """ + Setup the optimizer and the learning rate scheduler. - def create_optimizer(self): - pass + We provide a reasonable default that works well. 
If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or + `create_scheduler`) in a subclass. + """ + self.create_scheduler(num_training_steps=num_training_steps) + self.create_optimizer(self.lr_scheduler) + + def create_optimizer(self, lr_scheduler=None): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. + """ + if self.optimizer is None: + decay_parameters = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + apply_decay_param_fun = lambda x: x in decay_parameters + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs( + self.args) + + self.optimizer = optimizer_cls( + learning_rate=self.lr_scheduler + if lr_scheduler is None else lr_scheduler, + apply_decay_param_fun=apply_decay_param_fun, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm), + **optimizer_kwargs) + + return self.optimizer @staticmethod def get_optimizer_cls_and_kwargs( args: TrainingArguments) -> Tuple[Any, Any]: - pass + """ + Returns the optimizer class and optimizer parameters based on the training arguments. + + Args: + args (`paddlenlp.training_args.TrainingArguments`): + The training arguments for the training session. + + """ + # optimizer_kwargs = {"lr": args.learning_rate} + optimizer_kwargs = {} + adam_kwargs = { + "beta1": args.adam_beta1, + "beta2": args.adam_beta2, + "epsilon": args.adam_epsilon, + } + if args.optim == "OptimizerNames.ADAMW": + from paddle.optimizer import AdamW + + optimizer_cls = AdamW + optimizer_kwargs.update(adam_kwargs) + else: + raise ValueError( + f"Trainer cannot instantiate unsupported optimizer: {args.optim}" + ) + return optimizer_cls, optimizer_kwargs def create_scheduler(self, num_training_steps: int, - optimizer: paddle.optim.Optimizer=None): - pass + optimizer: paddle.optimizer.Optimizer=None): + """ + Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or + passed as an argument. + + Args: + num_training_steps (int): The number of training steps to do. + """ + + def get_scheduler(lr_scheduler_type, learning_rate, num_warmup_steps, + num_training_steps): + # TODO @ZHUI support others + return LinearDecayWithWarmup(learning_rate, num_training_steps, + num_warmup_steps) + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + if self.lr_scheduler is None: + self.lr_scheduler = get_scheduler( + self.args.lr_scheduler_type, + learning_rate=self.args.learning_rate, + num_warmup_steps=warmup, + num_training_steps=num_training_steps, ) + + return self.lr_scheduler def _wrap_model(self, model, training=True): - pass + # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again + if unwrap_model(model) is not model: + return model + + if self.args.n_gpu > 1: + model = nn.DistributedDataParallel(model) + + # Note: in paddle.distributed mode, there's no point in wrapping the model + # inside a DistributedDataParallel as we'll be under `no_grad` anyways. 
+ if not training: + return model + + return model def _prepare_input( self, data: Union[paddle.Tensor, Any]) -> Union[paddle.Tensor, Any]: - pass + """ + Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. + """ + if isinstance(data, Mapping): + return type(data)( + {k: self._prepare_input(v) + for k, v in data.items()}) + elif isinstance(data, (tuple, list)): + return type(data)(self._prepare_input(v) for v in data) + elif isinstance(data, paddle.Tensor): + kwargs = dict(device=self.args.device) + # update data type for pure fp16 + return data + # return data.to(**kwargs) + return data def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]] ) -> Dict[str, Union[paddle.Tensor, Any]]: - pass + """ + Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and + handling potential state. + """ + inputs = self._prepare_input(inputs) + if self.args.past_index >= 0 and self._past is not None: + inputs["mems"] = self._past + + return inputs def autocast_smart_context_manager(self): - pass + """ + A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired + arguments, depending on the situation. + """ + if self.args.use_amp: + ctx_manager = autocast() + else: + ctx_manager = contextlib.nullcontext() if sys.version_info >= ( + 3, 7) else contextlib.suppress() + + return ctx_manager + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + Subclass and override for custom behavior. + """ + if self.criterion is not None: + labels = inputs.pop("labels") + else: + labels = None + + # print(inputs) + + outputs = model(**inputs) + + # outputs = model(*inputs) + + if self.criterion is not None: + # print(outputs) + loss = self.criterion(outputs, labels) + outputs = (loss, outputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss def training_step( self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: - pass + """ + Perform a training step on a batch of inputs. - def save_model(self, - output_dir: Optional[str]=None, - _internal_call: bool=False): - pass + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to train. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `paddle.Tensor`: The tensor with training loss on this batch. 
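+
+        A subclass override might look like the following sketch (`MyTrainer` is
+        hypothetical and only illustrates the extension point):
+
+            class MyTrainer(Trainer):
+                def training_step(self, model, inputs):
+                    loss = super().training_step(model, inputs)
+                    # custom behavior goes here, e.g. extra logging of the detached loss
+                    return loss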
+ """ + model.train() + inputs = self._prepare_inputs(inputs) + + with self.autocast_smart_context_manager(): + loss = self.compute_loss(model, inputs) + + if self.args.n_gpu > 1: + loss = loss.mean( + ) # mean() to average on multi-gpu parallel training + + # if self.args.gradient_accumulation_steps > 1: + # # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward` + # loss = loss / self.args.gradient_accumulation_steps + + if self.do_grad_scaling: + self.scaler.scale(loss).backward() + else: + loss.backward() + # print(loss) + return loss.detach() + + def save_model(self, output_dir: Optional[str]=None): + """ + Will save the model, so you can reload it using `from_pretrained()`. + + Will only save from the main process. + """ + + if output_dir is None: + output_dir = self.args.output_dir + + if self.args.should_save: + self._save(output_dir) + + def _save_checkpoint(self, model, metrics=None): + # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we + # want to save except FullyShardedDDP. + # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" + + # Save model checkpoint + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self.args.output_dir + + output_dir = os.path.join(run_dir, checkpoint_folder) + + self.save_model(output_dir) + + if self.args.should_save: + # deepspeed.save_checkpoint above saves model/optim/sched + paddle.save(self.optimizer.state_dict(), + os.path.join(output_dir, OPTIMIZER_NAME)) + with warnings.catch_warnings(record=True) as caught_warnings: + paddle.save(self.lr_scheduler.state_dict(), + os.path.join(output_dir, SCHEDULER_NAME)) + if self.do_grad_scaling: + paddle.save(self.scaler.state_dict(), + os.path.join(output_dir, SCALER_NAME)) + + # Determine the new best metric / best model checkpoint + if metrics is not None and self.args.metric_for_best_model is not None: + metric_to_check = self.args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics[metric_to_check] + + operator = np.greater if self.args.greater_is_better else np.less + if (self.state.best_metric is None or + self.state.best_model_checkpoint is None or + operator(metric_value, self.state.best_metric)): + self.state.best_metric = metric_value + self.state.best_model_checkpoint = output_dir + + # Save the Trainer state + if self.args.should_save: + self.state.save_to_json( + os.path.join(output_dir, TRAINER_STATE_NAME)) + + # Save RNG state in non-distributed training + rng_states = { + "python": random.getstate(), + "numpy": np.random.get_state(), + } + + # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may + # not yet exist. + os.makedirs(output_dir, exist_ok=True) + local_rank = self.args.local_rank + + if local_rank == -1: + paddle.save(rng_states, os.path.join(output_dir, "rng_state.pth")) + else: + paddle.save(rng_states, + os.path.join(output_dir, f"rng_state_{local_rank}.pth")) + + # Maybe delete some older checkpoints. 
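+        # Checkpoint rotation below keeps at most `save_total_limit` checkpoints and is
+        # careful not to delete the checkpoint recorded as `best_model_checkpoint`.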
+ if self.args.should_save: + self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + + def _sorted_checkpoints(self, + output_dir=None, + checkpoint_prefix=PREFIX_CHECKPOINT_DIR, + use_mtime=False) -> List[str]: + ordering_and_checkpoint_path = [] + + glob_checkpoints = [ + str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") + ] + + for path in glob_checkpoints: + if use_mtime: + ordering_and_checkpoint_path.append( + (os.path.getmtime(path), path)) + else: + regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path) + if regex_match is not None and regex_match.groups() is not None: + ordering_and_checkpoint_path.append( + (int(regex_match.groups()[0]), path)) + + checkpoints_sorted = sorted(ordering_and_checkpoint_path) + checkpoints_sorted = [ + checkpoint[1] for checkpoint in checkpoints_sorted + ] + # Make sure we don't delete the best model. + if self.state.best_model_checkpoint is not None: + best_model_index = checkpoints_sorted.index( + str(Path(self.state.best_model_checkpoint))) + for i in range(best_model_index, len(checkpoints_sorted) - 2): + checkpoints_sorted[i], checkpoints_sorted[ + i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i] + return checkpoints_sorted + + def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None: + if self.args.save_total_limit is None or self.args.save_total_limit <= 0: + return + + # Check if we should delete older checkpoint(s) + checkpoints_sorted = self._sorted_checkpoints( + use_mtime=use_mtime, output_dir=output_dir) + if len(checkpoints_sorted) <= self.args.save_total_limit: + return + + # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which + # we don't do to allow resuming. + save_total_limit = self.args.save_total_limit + if (self.state.best_model_checkpoint is not None and + self.args.save_total_limit == 1 and + checkpoints_sorted[-1] != self.state.best_model_checkpoint): + save_total_limit = 2 + + number_of_checkpoints_to_delete = max( + 0, len(checkpoints_sorted) - save_total_limit) + checkpoints_to_be_deleted = checkpoints_sorted[: + number_of_checkpoints_to_delete] + for checkpoint in checkpoints_to_be_deleted: + logger.info( + f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit" + ) + shutil.rmtree(checkpoint) def _save(self, output_dir: Optional[str]=None, state_dict=None): - pass + # If we are executing this function, we are the process zero, so we don't check for that. + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, PretrainedModel): + if isinstance(unwrap_model(self.model), PretrainedModel): + if state_dict is None: + state_dict = self.model.state_dict() + unwrap_model(self.model).save_pretrained( + output_dir, state_dict=state_dict) + else: + logger.info( + "Trainer.model is not a `PretrainedModel`, only saving its state dict." 
+ ) + if state_dict is None: + state_dict = self.model.state_dict() + paddle.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + self.model.save_pretrained(output_dir) + if self.tokenizer is not None: + self.tokenizer.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) def _load_optimizer_and_scheduler(self, checkpoint): - pass + """If optimizer and scheduler states exist, load them.""" + if checkpoint is None: + return + + if os.path.isfile(os.path.join( + checkpoint, OPTIMIZER_NAME)) and os.path.isfile( + os.path.join(checkpoint, SCHEDULER_NAME)): + # Load in optimizer and scheduler states + map_location = self.args.device + self.optimizer.load_state_dict( + paddle.load( + os.path.join(checkpoint, OPTIMIZER_NAME), + map_location=map_location)) + with warnings.catch_warnings(record=True) as caught_warnings: + self.lr_scheduler.load_state_dict( + paddle.load(os.path.join(checkpoint, SCHEDULER_NAME))) + reissue_pt_warnings(caught_warnings) + if self.do_grad_scaling and os.path.isfile( + os.path.join(checkpoint, SCALER_NAME)): + self.scaler.load_state_dict( + paddle.load(os.path.join(checkpoint, SCALER_NAME))) + + def log(self, logs: Dict[str, float]) -> None: + """ + Log `logs` on the various objects watching training. + + Subclass and override this method to inject custom behavior. + + Args: + logs (`Dict[str, float]`): + The values to log. + """ + if self.state.epoch is not None: + logs["epoch"] = round(self.state.epoch, 2) + + output = { ** logs, ** {"step": self.state.global_step}} + self.state.log_history.append(output) + self.control = self.callback_handler.on_log(self.args, self.state, + self.control, logs) def evaluate( self, eval_dataset: Optional[Dataset]=None, ignore_keys: Optional[List[str]]=None, metric_key_prefix: str="eval", ) -> Dict[str, float]: - pass + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not + accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. 
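+
+        Example (a minimal usage sketch; `trainer` is an already constructed `Trainer`):
+
+            metrics = trainer.evaluate()
+            print(metrics["eval_loss"])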
+ """ + eval_dataloader = self.get_eval_dataloader(eval_dataset) + start_time = time.time() + + output = self.evaluation_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, ) + + total_batch_size = self.args.eval_batch_size * self.args.world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), )) + + self.log(output.metrics) + + self.control = self.callback_handler.on_evaluate( + self.args, self.state, self.control, output.metrics) + + return output.metrics + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool]=None, + ignore_keys: Optional[List[str]]=None, + metric_key_prefix: str="eval", ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + prediction_loss_only = False + + model = self._wrap_model(self.model, training=False) + + batch_size = dataloader.batch_size + num_samples = self.num_examples(dataloader) + logger.info(f"***** Running {description} *****") + logger.info(f" Num examples = {num_samples}") + logger.info(f" Batch size = {batch_size}") + + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = dataloader.dataset + + if args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + # Will be useful when we have an iterable dataset so don't know its length. + + observed_num_examples = 0 + # Main evaluation loop + losses = [] + for step, inputs in enumerate(dataloader): + # Update the observed num examples + # Prediction step + loss, logits, labels = self.prediction_step( + model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + losses.append(loss.numpy()) + + all_preds = logits if all_preds is None else nested_concat( + all_preds, logits, padding_index=-100) + all_labels = labels if all_labels is None else nested_concat( + all_labels, labels, padding_index=-100) + + model.train() + + # Metrics! + if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = self.compute_metrics( + EvalPrediction( + predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + metrics["eval_loss"] = float(np.mean(losses)) + + print(metrics) + + return EvalLoopOutput( + predictions=all_preds, + label_ids=all_labels, + metrics=metrics, + num_samples=num_samples) def predict(self, test_dataset: Dataset, @@ -282,8 +1179,176 @@ def prediction_step( ignore_keys: Optional[List[str]]=None, ) -> Tuple[Optional[ paddle.Tensor], Optional[paddle.Tensor], Optional[ paddle.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. 
+ inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + has_labels = all(inputs.get(k) is not None for k in self.label_names) + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, + "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. + if has_labels: + labels = nested_detach( + tuple(inputs.get(name) for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with paddle.no_grad(): + if has_labels: + with self.autocast_smart_context_manager(): + loss, outputs = self.compute_loss( + model, inputs, return_outputs=True) + loss = loss.mean().detach() + + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() + if k not in ignore_keys + ["loss"]) + else: + logits = outputs[1:] + else: + loss = None + with self.autocast_smart_context_manager(): + outputs = model(**inputs) + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() + if k not in ignore_keys) + else: + logits = outputs + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + if prediction_loss_only: + return (loss, None, None) + + logits = nested_detach(logits) + if len(logits) == 1: + logits = logits[0] + + return (loss, logits, labels) + + def num_examples(self, dataloader: DataLoader) -> int: + """ + Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. 
+ + Will raise an exception if the underlying dataset does not implement method `__len__` + """ + return len(dataloader.dataset) + + def create_dataloader(self, + dataset, + mode='train', + batch_size=16, + batchify_fn=None, + trans_fn=None, + batched=False): + """ + """ + if trans_fn: + dataset = dataset.map(trans_fn, batched=batched) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + def eval(self, *args, **kwargs): + """ + """ pass + def prepare_train_config(self): + """ + """ + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(self.train_dl)) + + else: + self.args.num_training_steps = len( + self.train_dl) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: + exp_step = self.args.num_training_steps / self.args.minimum_valid_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + self.lr_scheduler = LinearDecayWithWarmup( + self.args.learning_rate, self.args.num_training_steps, warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + self.optimizer = paddle.optimizer.AdamW( + learning_rate=self.lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=self.args.adam_epsilon, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + + def print_config(self): + """ + """ + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", + paddle.version.commit)) + for arg in vars(self.args): + logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) + + +class TrainerBase(object): + """ + """ + def create_dataloader(self, dataset, mode='train', diff --git a/examples/language_model/ernie-1.0/finetune/trainer_callback.py b/examples/language_model/ernie-1.0/finetune/trainer_callback.py new file mode 100644 index 000000000000..70f8cfd96eea --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/trainer_callback.py @@ -0,0 +1,660 @@ +# coding=utf-8 +# Copyright 2020-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Callbacks to use with the Trainer class and customize the training loop. +""" +import dataclasses +import json +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import numpy as np +from tqdm.auto import tqdm + +from trainer_utils import IntervalStrategy +from trainer_args import TrainingArguments +from utils import logging + +logger = logging.get_logger(__name__) + + +@dataclass +class TrainerState: + """ + A class containing the [`Trainer`] inner state that will be saved along the model and optimizer when checkpointing + and passed to the [`TrainerCallback`]. + + + + In all this class, one step is to be understood as one update step. When using gradient accumulation, one update + step may require several forward and backward passes: if you use `gradient_accumulation_steps=n`, then one update + step requires going through *n* batches. + + + + Args: + epoch (`float`, *optional*): + Only set during training, will represent the epoch the training is at (the decimal part being the + percentage of the current epoch completed). + global_step (`int`, *optional*, defaults to 0): + During training, represents the number of update steps completed. + max_steps (`int`, *optional*, defaults to 0): + The number of update steps to do during the current training. + total_flos (`float`, *optional*, defaults to 0): + The total number of floating operations done by the model since the beginning of training (stored as floats + to avoid overflow). + log_history (`List[Dict[str, float]]`, *optional*): + The list of logs done since the beginning of training. + best_metric (`float`, *optional*): + When tracking the best model, the value of the best metric encountered so far. + best_model_checkpoint (`str`, *optional*): + When tracking the best model, the value of the name of the checkpoint for the best model encountered so + far. + is_local_process_zero (`bool`, *optional*, defaults to `True`): + Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on + several machines) main process. + is_world_process_zero (`bool`, *optional*, defaults to `True`): + Whether or not this process is the global main process (when training in a distributed fashion on several + machines, this is only going to be `True` for one process). + is_hyper_param_search (`bool`, *optional*, defaults to `False`): + Whether we are in the process of a hyper parameter search using Trainer.hyperparameter_search. This will + impact the way data will be logged in TensorBoard. 
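+
+    Example (a minimal sketch of persisting and restoring the state with the helpers defined
+    below; the field values and the file name are illustrative):
+
+    ```python
+    state = TrainerState(global_step=10, max_steps=100)
+    state.save_to_json("trainer_state.json")
+    restored = TrainerState.load_from_json("trainer_state.json")
+    ```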
+ """ + + epoch: Optional[float] = None + global_step: int = 0 + max_steps: int = 0 + num_train_epochs: int = 0 + total_flos: float = 0 + log_history: List[Dict[str, float]] = None + best_metric: Optional[float] = None + best_model_checkpoint: Optional[str] = None + is_local_process_zero: bool = True + is_world_process_zero: bool = True + is_hyper_param_search: bool = False + trial_name: str = None + trial_params: Dict[str, Union[str, float, int, bool]] = None + + def __post_init__(self): + if self.log_history is None: + self.log_history = [] + + def save_to_json(self, json_path: str): + """Save the content of this instance in JSON format inside `json_path`.""" + json_string = json.dumps( + dataclasses.asdict(self), indent=2, sort_keys=True) + "\n" + with open(json_path, "w", encoding="utf-8") as f: + f.write(json_string) + + @classmethod + def load_from_json(cls, json_path: str): + """Create an instance from the content of `json_path`.""" + with open(json_path, "r", encoding="utf-8") as f: + text = f.read() + return cls(**json.loads(text)) + + +@dataclass +class TrainerControl: + """ + A class that handles the [`Trainer`] control flow. This class is used by the [`TrainerCallback`] to activate some + switches in the training loop. + + Args: + should_training_stop (`bool`, *optional*, defaults to `False`): + Whether or not the training should be interrupted. + + If `True`, this variable will not be set back to `False`. The training will just stop. + should_epoch_stop (`bool`, *optional*, defaults to `False`): + Whether or not the current epoch should be interrupted. + + If `True`, this variable will be set back to `False` at the beginning of the next epoch. + should_save (`bool`, *optional*, defaults to `False`): + Whether or not the model should be saved at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + should_evaluate (`bool`, *optional*, defaults to `False`): + Whether or not the model should be evaluated at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + should_log (`bool`, *optional*, defaults to `False`): + Whether or not the logs should be reported at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + """ + + should_training_stop: bool = False + should_epoch_stop: bool = False + should_save: bool = False + should_evaluate: bool = False + should_log: bool = False + + def _new_training(self): + """Internal method that resets the variable for a new training.""" + self.should_training_stop = False + + def _new_epoch(self): + """Internal method that resets the variable for a new epoch.""" + self.should_epoch_stop = False + + def _new_step(self): + """Internal method that resets the variable for a new step.""" + self.should_save = False + self.should_evaluate = False + self.should_log = False + + +class TrainerCallback: + """ + A class for objects that will inspect the state of the training loop at some events and take some decisions. At + each of those events the following arguments are available: + + Args: + args ([`TrainingArguments`]): + The training arguments used to instantiate the [`Trainer`]. + state ([`TrainerState`]): + The current state of the [`Trainer`]. + control ([`TrainerControl`]): + The object that is returned to the [`Trainer`] and can be used to make some decisions. + model ([`PreTrainedModel`] or `torch.nn.Module`): + The model being trained. 
+ tokenizer ([`PreTrainedTokenizer`]): + The tokenizer used for encoding the data. + optimizer (`torch.optim.Optimizer`): + The optimizer used for the training steps. + lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`): + The scheduler used for setting the learning rate. + train_dataloader (`torch.utils.data.DataLoader`, *optional*): + The current dataloader used for training. + eval_dataloader (`torch.utils.data.DataLoader`, *optional*): + The current dataloader used for training. + metrics (`Dict[str, float]`): + The metrics computed by the last evaluation phase. + + Those are only accessible in the event `on_evaluate`. + logs (`Dict[str, float]`): + The values to log. + + Those are only accessible in the event `on_log`. + + The `control` object is the only one that can be changed by the callback, in which case the event that changes it + should return the modified version. + + The argument `args`, `state` and `control` are positionals for all events, all the others are grouped in `kwargs`. + You can unpack the ones you need in the signature of the event using them. As an example, see the code of the + simple [`~transformer.PrinterCallback`]. + + Example: + + ```python + class PrinterCallback(TrainerCallback): + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + print(logs) + ```""" + + def on_init_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the end of the initialization of the [`Trainer`]. + """ + pass + + def on_train_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the beginning of training. + """ + pass + + def on_train_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the end of training. + """ + pass + + def on_epoch_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the beginning of an epoch. + """ + pass + + def on_epoch_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the end of an epoch. + """ + pass + + def on_step_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + pass + + def on_substep_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the end of an substep during gradient accumulation. + """ + pass + + def on_step_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called at the end of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + pass + + def on_evaluate(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called after an evaluation phase. + """ + pass + + def on_save(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called after a checkpoint save. + """ + pass + + def on_log(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called after logging the last logs. 
+ """ + pass + + def on_prediction_step(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + """ + Event called after a prediction step. + """ + pass + + +class CallbackHandler(TrainerCallback): + """Internal class that just calls the list of callbacks in order.""" + + def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): + self.callbacks = [] + for cb in callbacks: + self.add_callback(cb) + self.model = model + self.tokenizer = tokenizer + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + self.train_dataloader = None + self.eval_dataloader = None + + if not any( + isinstance(cb, DefaultFlowCallback) for cb in self.callbacks): + logger.warning( + "The Trainer will not work properly if you don't have a `DefaultFlowCallback` in its callbacks. You\n" + + + "should add one before training with `trainer.add_callback(DefaultFlowCallback). The current list of" + + "callbacks is\n:" + self.callback_list) + + def add_callback(self, callback): + cb = callback() if isinstance(callback, type) else callback + cb_class = callback if isinstance(callback, + type) else callback.__class__ + if cb_class in [c.__class__ for c in self.callbacks]: + logger.warning( + f"You are adding a {cb_class} to the callbacks of this Trainer, but there is already one. The current" + + "list of callbacks is\n:" + self.callback_list) + self.callbacks.append(cb) + + def pop_callback(self, callback): + if isinstance(callback, type): + for cb in self.callbacks: + if isinstance(cb, callback): + self.callbacks.remove(cb) + return cb + else: + for cb in self.callbacks: + if cb == callback: + self.callbacks.remove(cb) + return cb + + def remove_callback(self, callback): + if isinstance(callback, type): + for cb in self.callbacks: + if isinstance(cb, callback): + self.callbacks.remove(cb) + return + else: + self.callbacks.remove(callback) + + @property + def callback_list(self): + return "\n".join(cb.__class__.__name__ for cb in self.callbacks) + + def on_init_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + return self.call_event("on_init_end", args, state, control) + + def on_train_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + control.should_training_stop = False + return self.call_event("on_train_begin", args, state, control) + + def on_train_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + return self.call_event("on_train_end", args, state, control) + + def on_epoch_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + control.should_epoch_stop = False + return self.call_event("on_epoch_begin", args, state, control) + + def on_epoch_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + return self.call_event("on_epoch_end", args, state, control) + + def on_step_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + control.should_log = False + control.should_evaluate = False + control.should_save = False + return self.call_event("on_step_begin", args, state, control) + + def on_substep_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + return self.call_event("on_substep_end", args, state, control) + + def on_step_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + return self.call_event("on_step_end", args, state, control) + + def 
on_evaluate(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + metrics): + control.should_evaluate = False + return self.call_event( + "on_evaluate", args, state, control, metrics=metrics) + + def on_save(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + control.should_save = False + return self.call_event("on_save", args, state, control) + + def on_log(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs): + control.should_log = False + return self.call_event("on_log", args, state, control, logs=logs) + + def on_prediction_step(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl): + return self.call_event("on_prediction_step", args, state, control) + + def call_event(self, event, args, state, control, **kwargs): + for callback in self.callbacks: + result = getattr(callback, event)( + args, + state, + control, + model=self.model, + tokenizer=self.tokenizer, + optimizer=self.optimizer, + lr_scheduler=self.lr_scheduler, + train_dataloader=self.train_dataloader, + eval_dataloader=self.eval_dataloader, + **kwargs, ) + # A Callback can skip the return of `control` if it doesn't change it. + if result is not None: + control = result + return control + + +class DefaultFlowCallback(TrainerCallback): + """ + A [`TrainerCallback`] that handles the default flow of the training loop for logs, evaluation and checkpoints. + """ + + def on_step_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + # Log + if state.global_step == 1 and args.logging_first_step: + control.should_log = True + if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % args.logging_steps == 0: + control.should_log = True + + # Evaluate + if args.evaluation_strategy == IntervalStrategy.STEPS and state.global_step % args.eval_steps == 0: + control.should_evaluate = True + + # Save + if (args.save_strategy == IntervalStrategy.STEPS and + args.save_steps > 0 and + state.global_step % args.save_steps == 0): + control.should_save = True + + # End training + if state.global_step >= state.max_steps: + control.should_training_stop = True + + return control + + def on_epoch_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs): + # Log + if args.logging_strategy == IntervalStrategy.EPOCH: + control.should_log = True + + # Evaluate + if args.evaluation_strategy == IntervalStrategy.EPOCH: + control.should_evaluate = True + + # Save + if args.save_strategy == IntervalStrategy.EPOCH: + control.should_save = True + + return control + + +class ProgressCallback(TrainerCallback): + """ + A [`TrainerCallback`] that displays the progress of training or evaluation. 
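+
+    Example (an illustrative sketch; assumes a `Trainer` instance named `trainer` that exposes
+    `add_callback`, and that `tqdm` is installed):
+
+    ```python
+    trainer.add_callback(ProgressCallback())
+    ```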
+ """ + + def __init__(self): + self.training_bar = None + self.prediction_bar = None + + def on_train_begin(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar = tqdm(total=state.max_steps) + self.current_step = 0 + + def on_step_end(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar.update(state.global_step - self.current_step) + self.current_step = state.global_step + + def on_prediction_step(self, + args, + state, + control, + eval_dataloader=None, + **kwargs): + if state.is_local_process_zero and has_length(eval_dataloader.dataset): + if self.prediction_bar is None: + self.prediction_bar = tqdm( + total=len(eval_dataloader), leave=self.training_bar is None) + self.prediction_bar.update(1) + + def on_evaluate(self, args, state, control, **kwargs): + if state.is_local_process_zero: + if self.prediction_bar is not None: + self.prediction_bar.close() + self.prediction_bar = None + + def on_log(self, args, state, control, logs=None, **kwargs): + if state.is_local_process_zero and self.training_bar is not None: + _ = logs.pop("total_flos", None) + self.training_bar.write(str(logs)) + + def on_train_end(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar.close() + self.training_bar = None + + +class PrinterCallback(TrainerCallback): + """ + A bare [`TrainerCallback`] that just prints the logs. + """ + + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + print(logs) + + +class EarlyStoppingCallback(TrainerCallback): + """ + A [`TrainerCallback`] that handles early stopping. + + Args: + early_stopping_patience (`int`): + Use with `metric_for_best_model` to stop training when the specified metric worsens for + `early_stopping_patience` evaluation calls. + early_stopping_threshold(`float`, *optional*): + Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how much the + specified metric must improve to satisfy early stopping conditions. ` + + This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric + in [`TrainerState`]. + """ + + def __init__(self, + early_stopping_patience: int=1, + early_stopping_threshold: Optional[float]=0.0): + self.early_stopping_patience = early_stopping_patience + self.early_stopping_threshold = early_stopping_threshold + # early_stopping_patience_counter denotes the number of times validation metrics failed to improve. 
+ self.early_stopping_patience_counter = 0 + + def check_metric_value(self, args, state, control, metric_value): + # best_metric is set by code for load_best_model + operator = np.greater if args.greater_is_better else np.less + if state.best_metric is None or ( + operator(metric_value, state.best_metric) and + abs(metric_value - state.best_metric) > + self.early_stopping_threshold): + self.early_stopping_patience_counter = 0 + else: + self.early_stopping_patience_counter += 1 + + def on_train_begin(self, args, state, control, **kwargs): + assert args.load_best_model_at_end, "EarlyStoppingCallback requires load_best_model_at_end = True" + assert ( + args.metric_for_best_model is not None + ), "EarlyStoppingCallback requires metric_for_best_model is defined" + assert ( + args.evaluation_strategy != IntervalStrategy.NO + ), "EarlyStoppingCallback requires IntervalStrategy of steps or epoch" + + def on_evaluate(self, args, state, control, metrics, **kwargs): + metric_to_check = args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics.get(metric_to_check) + + if metric_value is None: + logger.warning( + f"early stopping required metric_for_best_model, but did not find {metric_to_check} so early stopping is disabled" + ) + return + + self.check_metric_value(args, state, control, metric_value) + if self.early_stopping_patience_counter >= self.early_stopping_patience: + control.should_training_stop = True diff --git a/examples/language_model/ernie-1.0/finetune/trainer_utils.py b/examples/language_model/ernie-1.0/finetune/trainer_utils.py new file mode 100644 index 000000000000..66da32be4794 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/trainer_utils.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2020-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for the Trainer and TFTrainer class. Should be independent from PyTorch and TensorFlow. +""" + +import copy +import functools +import gc +import inspect +import os +import random +import re +import threading +import time +from enum import Enum +from typing import Any, Dict, NamedTuple, Optional, Tuple, Union + +import numpy as np + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) + + +class EvalPrediction(NamedTuple): + """ + Evaluation output (always contains labels), to be used to compute metrics. + + Parameters: + predictions (`np.ndarray`): Predictions of the model. + label_ids (`np.ndarray`): Targets to be matched. 
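+
+    Example (a minimal sketch that builds the tuple from dummy NumPy arrays and computes an
+    accuracy; the values are illustrative):
+
+    ```python
+    import numpy as np
+
+    preds = np.array([[0.1, 0.9], [0.8, 0.2]])
+    labels = np.array([1, 0])
+    eval_pred = EvalPrediction(predictions=preds, label_ids=labels)
+    accuracy = float((eval_pred.predictions.argmax(axis=-1) == eval_pred.label_ids).mean())
+    ```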
+ """ + + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Union[np.ndarray, Tuple[np.ndarray]] + + +class EvalLoopOutput(NamedTuple): + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]] + metrics: Optional[Dict[str, float]] + num_samples: Optional[int] + + +class PredictionOutput(NamedTuple): + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]] + metrics: Optional[Dict[str, float]] + + +class TrainOutput(NamedTuple): + global_step: int + training_loss: float + metrics: Dict[str, float] + + +PREFIX_CHECKPOINT_DIR = "checkpoint" +_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$") + + +def get_last_checkpoint(folder): + content = os.listdir(folder) + checkpoints = [ + path for path in content + if _re_checkpoint.search(path) is not None and os.path.isdir( + os.path.join(folder, path)) + ] + if len(checkpoints) == 0: + return + return os.path.join( + folder, + max(checkpoints, + key=lambda x: int(_re_checkpoint.search(x).groups()[0]))) + + +class IntervalStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class EvaluationStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class BestRun(NamedTuple): + """ + The best run found by an hyperparameter search (see [`~Trainer.hyperparameter_search`]). + + Parameters: + run_id (`str`): + The id of the best run (if models were saved, the corresponding checkpoint will be in the folder ending + with run-{run_id}). + objective (`float`): + The objective that was obtained for this run. + hyperparameters (`Dict[str, Any]`): + The hyperparameters picked to get this run. + """ + + run_id: str + objective: float + hyperparameters: Dict[str, Any] + + +def default_compute_objective(metrics: Dict[str, float]) -> float: + """ + The default objective to maximize/minimize when doing an hyperparameter search. It is the evaluation loss if no + metrics are provided to the [`Trainer`], the sum of all metrics otherwise. + + Args: + metrics (`Dict[str, float]`): The metrics returned by the evaluate method. + + Return: + `float`: The objective to minimize or maximize + """ + metrics = copy.deepcopy(metrics) + loss = metrics.pop("eval_loss", None) + _ = metrics.pop("epoch", None) + # Remove speed metrics + speed_metrics = [ + m for m in metrics.keys() + if m.endswith("_runtime") or m.endswith("_per_second") + ] + for sm in speed_metrics: + _ = metrics.pop(sm, None) + return loss if len(metrics) == 0 else sum(metrics.values()) + + +def is_main_process(local_rank): + """ + Whether or not the current process is the local process, based on `xm.get_ordinal()` (for TPUs) first, then on + `local_rank`. + """ + + return local_rank in [-1, 0] + + +def total_processes_number(local_rank): + """ + Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs. + """ + if local_rank != -1: + import paddle + + return paddle.distributed.get_world_size() + return 1 + + +def speed_metrics(split, start_time, num_samples=None, num_steps=None): + """ + Measure and return speed performance metrics. + + This function requires a time snapshot `start_time` before the operation to be measured starts and this function + should be run immediately after the operation to be measured has completed. + + Args: + + - split: name to prefix metric (like train, eval, test...) 
+ - start_time: operation start time + - num_samples: number of samples processed + """ + runtime = time.time() - start_time + result = {f"{split}_runtime": round(runtime, 4)} + if num_samples is not None: + samples_per_second = num_samples / runtime + result[f"{split}_samples_per_second"] = round(samples_per_second, 3) + if num_steps is not None: + steps_per_second = num_steps / runtime + result[f"{split}_steps_per_second"] = round(steps_per_second, 3) + return result + + +class SchedulerType(ExplicitEnum): + LINEAR = "linear" + COSINE = "cosine" + COSINE_WITH_RESTARTS = "cosine_with_restarts" + POLYNOMIAL = "polynomial" + CONSTANT = "constant" + CONSTANT_WITH_WARMUP = "constant_with_warmup" diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py index b3cb7f9707bc..689a6f45a4f3 100644 --- a/paddlenlp/transformers/model_utils.py +++ b/paddlenlp/transformers/model_utils.py @@ -37,6 +37,12 @@ ] +def unwrap_model(model, *args, **kwargs): + raw_model = model._layers if isinstance(model, + paddle.DataParallel) else model + return raw_model + + def register_base_model(cls): """ A decorator for `PretrainedModel` class. It first retrieves the parent class From 2edc6e3cb0f03ed464c39cfb2609083c0021f83a Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Mon, 21 Mar 2022 12:51:21 +0800 Subject: [PATCH 05/15] refine trainer. --- .../ernie-1.0/finetune/config.yml | 4 +- .../ernie-1.0/finetune/finetune.py | 13 +- .../ernie-1.0/finetune/question_answering.py | 2 +- .../finetune/sequence_classification.py | 8 +- .../finetune/token_classification.py | 2 +- .../ernie-1.0/finetune/trainer_args.py | 220 ++++-------------- .../ernie-1.0/finetune/trainer_base.py | 143 +++++------- .../ernie-1.0/finetune/trainer_callback.py | 10 +- .../ernie-1.0/finetune/trainer_utils.py | 13 +- 9 files changed, 134 insertions(+), 281 deletions(-) diff --git a/examples/language_model/ernie-1.0/finetune/config.yml b/examples/language_model/ernie-1.0/finetune/config.yml index 298b430964df..fd7555ee78dc 100644 --- a/examples/language_model/ernie-1.0/finetune/config.yml +++ b/examples/language_model/ernie-1.0/finetune/config.yml @@ -7,8 +7,8 @@ DefaultArgs: max_seq_length: 128 weight_decay: 0.01 logging_steps: 10 - valid_steps: 200 - minimum_valid_times: 20 + eval_steps: 200 + minimum_eval_times: 20 max_steps: -1 warmup_steps: 0 metric: "Accuracy" diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index 70ae65b32964..b2da87dafaf0 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -124,15 +124,15 @@ def parse_args(): default=10, help="Log every X updates steps.") group.add_argument( - "--valid_steps", + "--eval_steps", type=int, default=200, help="Save checkpoint every X updates steps.") group.add_argument( - "--minimum_valid_times", + "--minimum_eval_times", type=int, default=None, - help="If under valid_steps, the valid time is less then minimum_valid_times, the config of override valid_steps." + help="If under eval_steps, the valid time is less then minimum_eval_times, the config of override eval_steps." ) group.add_argument( "--max_steps", @@ -144,17 +144,16 @@ def parse_args(): "--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps. If > 0: Override warmup_proportion" - ) + help="Linear warmup over warmup_steps. 
If > 0: Override warmup_ratio") group.add_argument( - "--warmup_proportion", + "--warmup_ratio", default=0.1, type=float, help="Linear warmup proportion over total steps.") group = parser.add_argument_group(title='Additional training configs.') group.add_argument( - "--use_amp", + "--fp16", type=distutils.util.strtobool, default=False, help="Enable mixed precision training.") diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py index 03be17723111..258601d48b45 100644 --- a/examples/language_model/ernie-1.0/finetune/question_answering.py +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -257,7 +257,7 @@ def train(self): self.lr_scheduler.step() self.optimizer.clear_grad() - if global_step % self.args.valid_steps == 0: + if global_step % self.args.eval_steps == 0: self.eval() if global_step == self.args.num_training_steps: diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index cdbf887cc1e0..bd2fde51a9e1 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -239,7 +239,7 @@ def train(self): metric = Accuracy() - if self.args.use_amp: + if self.args.fp16: scaler = paddle.amp.GradScaler( init_loss_scaling=self.args.scale_loss) @@ -253,7 +253,7 @@ def train(self): global_step += 1 input_ids, segment_ids, labels = batch with paddle.amp.auto_cast( - bool(self.args.use_amp), + bool(self.args.fp16), custom_white_list=["layer_norm", "softmax", "gelu"], ): logits = self.model(input_ids, segment_ids) loss = loss_fct(logits, labels) @@ -263,7 +263,7 @@ def train(self): metric.update(correct) acc = metric.accumulate() - if self.args.use_amp: + if self.args.fp16: scaler.scale(loss).backward() scaler.minimize(self.optimizer, loss) else: @@ -282,7 +282,7 @@ def train(self): self.args.logging_steps / (time.time() - tic_train))) metric.reset() tic_train = time.time() - if global_step % self.args.valid_steps == 0 or global_step == self.args.num_training_steps: + if global_step % self.args.eval_steps == 0 or global_step == self.args.num_training_steps: tic_eval = time.time() metric.reset() if self.dev_dl is not None: diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py index bb20a1764026..851112290f9c 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -150,7 +150,7 @@ def train(self): self.lr_scheduler.step() self.optimizer.clear_grad() - if global_step % self.args.valid_steps == 0 or global_step == self.args.num_training_steps: + if global_step % self.args.eval_steps == 0 or global_step == self.args.num_training_steps: if paddle.distributed.get_rank() == 0: dev_f1 = evaluate(self.model, loss_fct, metric, self.dev_dl, label_num, "valid") diff --git a/examples/language_model/ernie-1.0/finetune/trainer_args.py b/examples/language_model/ernie-1.0/finetune/trainer_args.py index e0ad30e6361a..d8bf99d25f75 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_args.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_args.py @@ -23,31 +23,14 @@ from typing import Any, Dict, List, Optional from utils import logging +from trainer_utils import (SchedulerType, IntervalStrategy, 
EvaluationStrategy, + OptimizerNames) +logger = logging.get_logger(__name__) +log_levels = logging.get_log_levels_dict().copy() +trainer_log_levels = dict(**log_levels, passive=-1) -class ExplicitEnum(Enum): - """ - Enum with more explicit error message for missing values. - """ - - @classmethod - def _missing_(cls, value): - raise ValueError( - f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" - ) - - -class IntervalStrategy(ExplicitEnum): - NO = "no" - STEPS = "steps" - EPOCH = "epoch" - - -class EvaluationStrategy(ExplicitEnum): - NO = "no" - STEPS = "steps" - EPOCH = "epoch" - +import paddle # logger = logging.get_logger(__name__) log_levels = logging.get_log_levels_dict().copy() @@ -56,7 +39,7 @@ class EvaluationStrategy(ExplicitEnum): def default_logdir() -> str: """ - Same default as PyTorch + Same default """ import socket from datetime import datetime @@ -104,9 +87,9 @@ class TrainingArguments: prediction_loss_only (`bool`, *optional*, defaults to `False`): When performing evaluation and generating predictions, only returns the loss. per_device_train_batch_size (`int`, *optional*, defaults to 8): - The batch size per GPU/TPU core/CPU for training. + The batch size per GPU core/CPU for training. per_device_eval_batch_size (`int`, *optional*, defaults to 8): - The batch size per GPU/TPU core/CPU for evaluation. + The batch size per GPU core/CPU for evaluation. gradient_accumulation_steps (`int`, *optional*, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass. @@ -119,7 +102,7 @@ class TrainingArguments: eval_accumulation_steps (`int`, *optional*): Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If - left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but + left unset, the whole predictions are accumulated on GPU before being moved to the CPU (faster but requires more memory). learning_rate (`float`, *optional*, defaults to 5e-5): The initial learning rate for [`AdamW`] optimizer. @@ -211,27 +194,10 @@ class TrainingArguments: fp16_opt_level (`str`, *optional*, defaults to 'O1'): For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on the [Apex documentation](https://nvidia.github.io/apex/amp). - fp16_backend (`str`, *optional*, defaults to `"auto"`): - This argument is deprecated. Use `half_precision_backend` instead. - half_precision_backend (`str`, *optional*, defaults to `"auto"`): - The backend to use for mixed precision training. Must be one of `"auto"`, `"amp"` or `"apex"`. `"auto"` - will use AMP or APEX depending on the PyTorch version detected, while the other choices will force the - requested backend. - bf16_full_eval (`bool`, *optional*, defaults to `False`): - Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm - metric values. This is an experimental API and it may change. - fp16_full_eval (`bool`, *optional*, defaults to `False`): - Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm - metric values. - tf32 (`bool`, *optional*): - Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API - and it may change. local_rank (`int`, *optional*, defaults to -1): Rank of the process during distributed training. 
xpu_backend (`str`, *optional*): The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"`. - tpu_num_cores (`int`, *optional*): - When training on TPU, the number of TPU cores (automatically passed by launcher script). dataloader_drop_last (`bool`, *optional*, defaults to `False`): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. @@ -290,25 +256,6 @@ class TrainingArguments: When resuming training, whether or not to skip the epochs and batches to get the data loading at the same stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step can take a long time) but will not yield the same results as the interrupted training would have. - sharded_ddp (`bool`, `str` or list of [`~trainer_utils.ShardedDDPOption`], *optional*, defaults to `False`): - Use Sharded DDP training from [FairScale](https://github.com/facebookresearch/fairscale) (in distributed - training only). This is an experimental feature. - - A list of options along the following: - - - `"simple"`: to use first instance of sharded DDP released by fairscale (`ShardedDDP`) similar to ZeRO-2. - - `"zero_dp_2"`: to use the second instance of sharded DPP released by fairscale (`FullyShardedDDP`) in - Zero-2 mode (with `reshard_after_forward=False`). - - `"zero_dp_3"`: to use the second instance of sharded DPP released by fairscale (`FullyShardedDDP`) in - Zero-3 mode (with `reshard_after_forward=True`). - - `"offload"`: to add ZeRO-offload (only compatible with `"zero_dp_2"` and `"zero_dp_3"`). - - If a string is passed, it will be split on space. If a bool is passed, it will be converted to an empty - list for `False` and `["simple"]` for `True`. - deepspeed (`str` or `dict`, *optional*): - Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may - evolve in the future. The value is either the location of DeepSpeed json config file (e.g., - `ds_config.json`) or an already loaded json file as a `dict`" label_smoothing_factor (`float`, *optional*, defaults to 0.0): The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor + @@ -323,13 +270,8 @@ class TrainingArguments: - `"tpu_metrics_debug"`: print debug metrics on TPU The options should be separated by whitespaces. - optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_hf"`): - The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor. - adafactor (`bool`, *optional*, defaults to `False`): - This argument is deprecated. Use `--optim adafactor` instead. - group_by_length (`bool`, *optional*, defaults to `False`): - Whether or not to group together samples of roughly the same length in the training dataset (to minimize - padding applied and be more efficient). Only useful if applying dynamic padding. + optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw"`): + The optimizer to use: adamw, or adafactor. length_column_name (`str`, *optional*, defaults to `"length"`): Column name for precomputed lengths. If the column exists, grouping by length will use these values rather than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an @@ -338,61 +280,15 @@ class TrainingArguments: The list of integrations to report the results and logs to. 
Supported platforms are `"azure_ml"`, `"comet_ml"`, `"mlflow"`, `"tensorboard"` and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` for no integrations. - ddp_find_unused_parameters (`bool`, *optional*): - When using distributed training, the value of the flag `find_unused_parameters` passed to - `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. - ddp_bucket_cap_mb (`int`, *optional*): - When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. dataloader_pin_memory (`bool`, *optional*, defaults to `True`): Whether you want to pin memory in data loaders or not. Will default to `True`. skip_memory_metrics (`bool`, *optional*, defaults to `True`): Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows down the training and evaluation speed. - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push the model to the Hub every time the model is saved. If this is activated, - `output_dir` will begin a git directory synced with the the repo (determined by `hub_model_id`) and the - content will be pushed each time a save is triggered (depending on your `save_strategy`). Calling - [`~Trainer.save_model`] will also trigger a push. - - - - If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be - pushed. - - - resume_from_checkpoint (`str`, *optional*): The path to a folder with a valid checkpoint for your model. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details. - hub_model_id (`str`, *optional*): - The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in - which case the model will be pushed in your namespace. Otherwise it should be the whole repository name, - for instance `"user_name/model"`, which allows you to push to an organization you are a member of with - `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the - name of `output_dir`. - - Will default to to the name of `output_dir`. - hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): - Defines the scope of what is pushed to the Hub and when. Possible values are: - - - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a - draft of a model card when the [`~Trainer.save_model`] method is called. - - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and - a draft of a model card each time there is a model save. The pushes are asynchronous to not block - training, and in case the save are very frequent, a new push is only attempted if the previous one is - finished. A last push is made with the final model at the end of training. - - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder named - last-checkpoint, allowing you to resume training easily with - `trainer.train(resume_from_checkpoint="last-checkpoint")`. 
- - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the output - folder (so you will get one checkpoint folder per folder in your final repository) - - hub_token (`str`, *optional*): - The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with - `huggingface-cli login`. - gradient_checkpointing (`bool`, *optional*, defaults to `False`): - If True, use gradient checkpointing to save memory at the expense of slower backward pass. """ output_dir: str = field( @@ -418,7 +314,7 @@ class TrainingArguments: default=False, metadata={"help": "Whether to run predictions on the test set."}) evaluation_strategy: IntervalStrategy = field( - default="no", + default="steps", metadata={"help": "The evaluation strategy to use."}, ) prediction_loss_only: bool = field( default=False, @@ -429,25 +325,10 @@ class TrainingArguments: per_device_train_batch_size: int = field( default=8, - metadata={"help": "Batch size per GPU/TPU core/CPU for training."}) + metadata={"help": "Batch size per GPU core/CPU for training."}) per_device_eval_batch_size: int = field( default=8, - metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}) - - per_gpu_train_batch_size: Optional[int] = field( - default=None, - metadata={ - "help": - "Deprecated, the use of `--per_device_train_batch_size` is preferred. " - "Batch size per GPU/TPU core/CPU for training." - }, ) - per_gpu_eval_batch_size: Optional[int] = field( - default=None, - metadata={ - "help": - "Deprecated, the use of `--per_device_eval_batch_size` is preferred. " - "Batch size per GPU/TPU core/CPU for evaluation." - }, ) + metadata={"help": "Batch size per GPU core/CPU for evaluation."}) gradient_accumulation_steps: int = field( default=1, @@ -586,7 +467,7 @@ class TrainingArguments: "Drop the last incomplete batch if it is not divisible by the batch size." }) eval_steps: int = field( - default=None, metadata={"help": "Run an evaluation every X steps."}) + default=200, metadata={"help": "Run an evaluation every X steps."}) dataloader_num_workers: int = field( default=0, metadata={ @@ -608,6 +489,10 @@ class TrainingArguments: "An optional descriptor for the run. Notably used for wandb logging." }) + disable_tqdm: Optional[bool] = field( + default=None, + metadata={"help": "Whether or not to disable the tqdm progress bars."}) + label_names: Optional[List[str]] = field( default=None, metadata={ @@ -674,9 +559,9 @@ class TrainingArguments: _n_gpu: int = field(init=False, repr=False, default=-1) def __post_init__(self): - # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then). + # Handle --use_env option in paddle.distributed.launch (local_rank not passed as an arg then). # This needs to happen before any call to self.device or self.n_gpu. - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) if env_local_rank != -1 and env_local_rank != self.local_rank: self.local_rank = env_local_rank @@ -767,12 +652,6 @@ def __post_init__(self): def __str__(self): self_as_dict = asdict(self) - - # Remove deprecated arguments. That code should be removed once - # those deprecated arguments are removed from TrainingArguments. 
(TODO: v5) - del self_as_dict["per_gpu_train_batch_size"] - del self_as_dict["per_gpu_eval_batch_size"] - self_as_dict = { k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items() @@ -786,27 +665,17 @@ def __str__(self): @property def train_batch_size(self) -> int: """ - The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training). + The actual batch size for training. """ - if self.per_gpu_train_batch_size: - logger.warning( - "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " - "version. Using `--per_device_train_batch_size` is preferred.") - per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size - train_batch_size = per_device_batch_size * max(1, self.n_gpu) + train_batch_size = self.per_device_train_batch_size * max(1, self.n_gpu) return train_batch_size @property def eval_batch_size(self) -> int: """ - The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training). + The actual batch size for evaluation. """ - if self.per_gpu_eval_batch_size: - logger.warning( - "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " - "version. Using `--per_device_eval_batch_size` is preferred.") - per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size - eval_batch_size = per_device_batch_size * max(1, self.n_gpu) + eval_batch_size = self.per_device_eval_batch_size * max(1, self.n_gpu) return eval_batch_size @property @@ -822,13 +691,20 @@ def n_gpu(self): # _ = self._setup_devices return self._n_gpu + @property + def device(self) -> "paddle.device": + """ + The device used by this process. + """ + return paddle.device.get_device() + @property def world_size(self): """ The number of processes used in parallel. """ if self.local_rank != -1: - return torch.distributed.get_world_size() + return paddle.distributed.get_world_size() return 1 @property @@ -837,7 +713,7 @@ def process_index(self): The index of the current process used. """ if self.local_rank != -1: - return torch.distributed.get_rank() + return paddle.distributed.get_rank() return 0 @property @@ -867,10 +743,7 @@ def should_save(self): if self.save_on_each_node: return self.local_process_index == 0 else: - if is_sagemaker_mp_enabled(): - return smp.rank() == 0 - else: - return self.process_index == 0 + return self.process_index == 0 def get_process_log_level(self): """ @@ -892,7 +765,7 @@ def get_process_log_level(self): @contextlib.contextmanager def main_process_first(self, local=True, desc="work"): """ - A context manager for torch distributed environment where on needs to do something on the main process, while + A context manager for paddle distributed environment where on needs to do something on the main process, while blocking replicas, and when it's finished releasing the replicas. 
One such use is for `datasets`'s `map` feature which to be efficient should be run once on the main process, @@ -910,7 +783,7 @@ def main_process_first(self, local=True, desc="work"): a work description to be used in debug logs """ - if is_torch_available() and self.world_size > 1: + if self.world_size > 1: if local: is_main_process = self.local_process_index == 0 main_process_desc = "main local process" @@ -924,12 +797,7 @@ def main_process_first(self, local=True, desc="work"): logger.debug( f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}" ) - if is_torch_tpu_available(): - xm.rendezvous(desc) - elif is_sagemaker_dp_enabled(): - sm_dist.barrier() - else: - torch.distributed.barrier() + paddle.distributed.barrier() yield finally: if is_main_process: @@ -937,12 +805,7 @@ def main_process_first(self, local=True, desc="work"): logger.debug( f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas" ) - if is_torch_tpu_available(): - xm.rendezvous(desc) - elif is_sagemaker_dp_enabled(): - sm_dist.barrier() - else: - torch.distributed.barrier() + paddle.distributed.barrier() else: yield @@ -988,8 +851,7 @@ def to_sanitized_dict(self) -> Dict[str, Any]: } valid_types = [bool, int, float, str] - if is_torch_available(): - valid_types.append(torch.Tensor) + valid_types.append(paddle.Tensor) return { k: v if type(v) in valid_types else str(v) diff --git a/examples/language_model/ernie-1.0/finetune/trainer_base.py b/examples/language_model/ernie-1.0/finetune/trainer_base.py index 914481545458..497224aa544a 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_base.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_base.py @@ -39,16 +39,12 @@ import numpy as np -from trainer_args import TrainingArguments +from trainer_args import (TrainingArguments, ) # from trainer_callback import TrainerState, TrainerControl -from trainer_utils import ( - IntervalStrategy, - EvaluationStrategy, - EvalPrediction, - PredictionOutput, - EvalLoopOutput, - speed_metrics, ) +from trainer_utils import (IntervalStrategy, EvaluationStrategy, EvalPrediction, + PredictionOutput, EvalLoopOutput, speed_metrics, + OptimizerNames) from trainer_callback import ( CallbackHandler, @@ -198,9 +194,9 @@ class Trainer: - **model** -- Always points to the core model. If using a transformers model, it will be a [`PretrainedModel`] subclass. - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the - original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`, - the inner model is wrapped in `DeepSpeed` and then again in `paddle.nn.DistributedDataParallel`. If the inner - model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`. + original model. This is the model that should be used for the forward pass. For example, the inner model is + wrapped in `paddle.nn.DataParallel`. If model hasn't been wrapped, then `self.model_wrapped` is the same + as `self.model`. - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from data parallelism, this means some of the model layers are split on different GPUs). - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set @@ -231,40 +227,26 @@ def __init__( f"No `TrainingArguments` passed, using `output_dir={output_dir}`." 
) args = TrainingArguments(output_dir=output_dir) - args.world_size = 1 - args.fp16 = args.use_amp - args.do_grad_scaling = args.use_amp - self.do_grad_scaling = args.do_grad_scaling - args.train_batch_size = args.batch_size - args.eval_batch_size = args.batch_size - - args.dataloader_drop_last = True - args.dataloader_num_workers = 0 - args.dataloader_pin_memory = True - args.n_gpu = 1 - args.lr_scheduler_type = "linear" - args.adam_beta1 = 0.9 - args.adam_beta2 = 0.999 - args.optim = "OptimizerNames.ADAMW" - args.past_index = -1 + + output_dir = "tmp_trainer" + new_args = TrainingArguments(output_dir=output_dir) + args.per_device_train_batch_size = args.batch_size args.per_device_eval_batch_size = args.batch_size - args.logging_first_step = True - args.logging_strategy = IntervalStrategy.STEPS - args.evaluation_strategy = IntervalStrategy.STEPS - args.save_strategy = IntervalStrategy.STEPS - args.eval_steps = 500 - args.save_steps = 500 - args.label_names = None - args.prediction_loss_only = False - args.output_dir = "./out" - args.should_save = True - args.local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) - args.save_total_limit = 3 - args.metric_for_best_model = "accuracy" - args.greater_is_better = True + + for arg in vars(args): + v = getattr(args, arg) + if v is not None: + try: + setattr(new_args, arg, v) + except Exception as e: + print(arg, v) + pass + args = new_args self.args = args + self.do_grad_scaling = args.fp16 + # Seed must be set before instantiating the model when using model set_seed(self.args.seed) if model is None: @@ -316,6 +298,7 @@ def __init__( "QusetionAnswering" in type(self.model).__name__ else ["labels"]) self.label_names = default_label_names if self.args.label_names is None else self.args.label_names + self.print_config() def add_callback(self, callback): """ @@ -338,49 +321,50 @@ def train( train_dataloader = self.get_train_dataloader() model = self._wrap_model(self.model_wrapped) + args = self.args self.state = TrainerState() - if self.args.max_steps > 0: - self.args.num_training_steps = self.args.max_steps - self.args.num_train_epochs = math.ceil( - self.args.num_training_steps / len(train_dataloader)) + total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + + if args.max_steps > 0: + args.num_training_steps = args.max_steps + num_train_epochs = math.ceil(args.num_training_steps / + len(train_dataloader)) + num_train_samples = args.max_steps * total_train_batch_size else: - self.args.num_training_steps = len( - train_dataloader) * self.args.num_train_epochs - self.args.num_train_epochs = self.args.num_train_epochs + args.num_training_steps = len( + train_dataloader) * args.num_train_epochs + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = len(self.train_dataset) * args.num_train_epochs - if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: - exp_step = self.args.num_training_steps / self.args.minimum_valid_times + if args.num_training_steps // args.eval_steps < args.minimum_eval_times: + exp_step = args.num_training_steps / args.minimum_eval_times exp_step = max(int(exp_step - exp_step % 10), 10) logger.info("Set eval step to %d" % exp_step) - self.args.valid_steps = exp_step - - args = self.args + args.eval_steps = exp_step self.create_optimizer_and_scheduler( num_training_steps=args.num_training_steps) num_examples = len(self.train_dataset) - total_train_batch_size = self.args.per_device_train_batch_size * 
paddle.distributed.get_world_size( - ) logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {self.args.num_train_epochs}") + logger.info(f" Num Epochs = {num_train_epochs}") logger.info( - f" Instantaneous batch size per device = {self.args.per_device_train_batch_size}" + f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}" ) logger.info(f" Gradient Accumulation steps = {1}") - logger.info( - f" Total optimization steps = {self.args.num_training_steps}") + logger.info(f" Total optimization steps = {args.num_training_steps}") + logger.info(f" Total num train samples = {num_train_samples}") self.state.epoch = 0 - self.state.max_steps = int(self.args.num_training_steps) - self.state.num_train_epochs = int(self.args.num_train_epochs) + self.state.max_steps = int(args.num_training_steps) + self.state.num_train_epochs = num_train_epochs self.state.is_local_process_zero = 0 self.state.is_world_process_zero = 0 @@ -406,7 +390,7 @@ def train( self._total_loss_scalar = 0.0 self._globalstep_last_logged = self.state.global_step - for epoch in range(epochs_trained, args.num_train_epochs): + for epoch in range(epochs_trained, num_train_epochs): step = -1 self.control = self.callback_handler.on_epoch_begin( @@ -457,7 +441,7 @@ def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: self.train_dataset, # num_replicas=self.args.world_size, # rank=self.args.process_index, - batch_size=self.args.batch_size, + batch_size=self.args.train_batch_size, shuffle=True, # seed=self.args.seed, ) @@ -493,6 +477,7 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, metrics = None if self.control.should_evaluate: + logger.info("evaluating!!!!!") metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) if self.control.should_save: @@ -535,14 +520,14 @@ def _get_eval_sampler(self, eval_dataset: Dataset): eval_dataset, # num_replicas=self.args.world_size, # rank=self.args.process_index, - batch_size=self.args.per_device_eval_batch_size, + batch_size=self.args.eval_batch_size, shuffle=False, # seed=self.args.seed, ) else: return DistributedBatchSampler( eval_dataset, - batch_size=self.args.per_device_eval_batch_size, + batch_size=self.args.eval_batch_size, shuffle=False) def get_eval_dataloader(self, @@ -653,7 +638,7 @@ def get_optimizer_cls_and_kwargs( "beta2": args.adam_beta2, "epsilon": args.adam_epsilon, } - if args.optim == "OptimizerNames.ADAMW": + if args.optim == OptimizerNames.ADAMW: from paddle.optimizer import AdamW optimizer_cls = AdamW @@ -681,7 +666,7 @@ def get_scheduler(lr_scheduler_type, learning_rate, num_warmup_steps, return LinearDecayWithWarmup(learning_rate, num_training_steps, num_warmup_steps) - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio if self.lr_scheduler is None: self.lr_scheduler = get_scheduler( @@ -742,7 +727,7 @@ def autocast_smart_context_manager(self): A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired arguments, depending on the situation. 
""" - if self.args.use_amp: + if self.args.fp16: ctx_manager = autocast() else: ctx_manager = contextlib.nullcontext() if sys.version_info >= ( @@ -835,8 +820,6 @@ def save_model(self, output_dir: Optional[str]=None): self._save(output_dir) def _save_checkpoint(self, model, metrics=None): - # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we - # want to save except FullyShardedDDP. # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" # Save model checkpoint @@ -1307,13 +1290,13 @@ def prepare_train_config(self): self.train_dl) * self.args.num_train_epochs self.args.num_train_epochs = self.args.num_train_epochs - if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: - exp_step = self.args.num_training_steps / self.args.minimum_valid_times + if self.args.num_training_steps // self.args.eval_steps < self.args.minimum_eval_times: + exp_step = self.args.num_training_steps / self.args.minimum_eval_times exp_step = max(int(exp_step - exp_step % 10), 10) logger.info("Set eval step to %d" % exp_step) - self.args.valid_steps = exp_step + self.args.eval_steps = exp_step - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio self.lr_scheduler = LinearDecayWithWarmup( self.args.learning_rate, self.args.num_training_steps, warmup) @@ -1339,10 +1322,10 @@ def print_config(self): """ """ logger.info('{:^40}'.format("Configuration Arguments")) - logger.info('{:20}:{}'.format("paddle commit id", + logger.info('{:30}:{}'.format("paddle commit id", paddle.version.commit)) for arg in vars(self.args): - logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) + logger.info('{:30}:{}'.format(arg, getattr(self.args, arg))) class TrainerBase(object): @@ -1399,13 +1382,13 @@ def prepare_train_config(self): self.train_dl) * self.args.num_train_epochs self.args.num_train_epochs = self.args.num_train_epochs - if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: - exp_step = self.args.num_training_steps / self.args.minimum_valid_times + if self.args.num_training_steps // self.args.eval_steps < self.args.minimum_eval_times: + exp_step = self.args.num_training_steps / self.args.minimum_eval_times exp_step = max(int(exp_step - exp_step % 10), 10) logger.info("Set eval step to %d" % exp_step) - self.args.valid_steps = exp_step + self.args.eval_steps = exp_step - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio self.lr_scheduler = LinearDecayWithWarmup( self.args.learning_rate, self.args.num_training_steps, warmup) diff --git a/examples/language_model/ernie-1.0/finetune/trainer_callback.py b/examples/language_model/ernie-1.0/finetune/trainer_callback.py index 70f8cfd96eea..7336da8fd321 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_callback.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_callback.py @@ -168,17 +168,17 @@ class TrainerCallback: The current state of the [`Trainer`]. control ([`TrainerControl`]): The object that is returned to the [`Trainer`] and can be used to make some decisions. - model ([`PreTrainedModel`] or `torch.nn.Module`): + model ([`PreTrainedModel`] or `paddle.nn.Layer`): The model being trained. 
tokenizer ([`PreTrainedTokenizer`]): The tokenizer used for encoding the data. - optimizer (`torch.optim.Optimizer`): + optimizer (`paddle.optimizer.Optimizer`): The optimizer used for the training steps. - lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`): + lr_scheduler (`paddle.optimizer.lr.LRScheduler`): The scheduler used for setting the learning rate. - train_dataloader (`torch.utils.data.DataLoader`, *optional*): + train_dataloader (`paddle.io.DataLoader`, *optional*): The current dataloader used for training. - eval_dataloader (`torch.utils.data.DataLoader`, *optional*): + eval_dataloader (`paddle.io.DataLoader`, *optional*): The current dataloader used for training. metrics (`Dict[str, float]`): The metrics computed by the last evaluation phase. diff --git a/examples/language_model/ernie-1.0/finetune/trainer_utils.py b/examples/language_model/ernie-1.0/finetune/trainer_utils.py index 66da32be4794..cfa4392ba489 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_utils.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_utils.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Utilities for the Trainer and TFTrainer class. Should be independent from PyTorch and TensorFlow. +Utilities for the Trainer class. """ import copy @@ -106,6 +106,15 @@ class EvaluationStrategy(ExplicitEnum): EPOCH = "epoch" +class OptimizerNames(ExplicitEnum): + """ + Stores the acceptable string identifiers for optimizers. + """ + + ADAMW = "adamw" + ADAFACTOR = "adafactor" + + class BestRun(NamedTuple): """ The best run found by an hyperparameter search (see [`~Trainer.hyperparameter_search`]). @@ -160,7 +169,7 @@ def is_main_process(local_rank): def total_processes_number(local_rank): """ - Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs. + Return the number of processes launched in parallel. Works with `paddle.distributed` and TPUs. """ if local_rank != -1: import paddle From ca08daa215b0466efe1946852c1002c231b977b2 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Tue, 22 Mar 2022 18:17:19 +0800 Subject: [PATCH 06/15] update for some details. 
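
This commit reworks the example training loop (gradient accumulation, a TrainOutput return
value, callback hooks) and adds the metric helpers log_metrics, save_metrics and save_state.
A minimal sketch of the resulting fine-tuning flow follows; names are taken from the diff
below, and the Trainer construction is assumed to happen as in finetune.py:

    # Sketch only: `trainer` is assumed to be a Trainer built as in finetune.py.
    train_result = trainer.train(resume_from_checkpoint=None)
    metrics = train_result.metrics           # TrainOutput carries global_step, train loss and metrics
    trainer.save_model()                     # saves the model weights (and the tokenizer)
    trainer.log_metrics("train", metrics)    # pretty-prints metrics on the rank-0 process only
    trainer.save_metrics("train", metrics)   # writes train_results.json and updates all_results.json
    trainer.save_state()                     # writes trainer_state.json

With gradient accumulation enabled, the loop below only calls optimizer.step() once every
gradient_accumulation_steps micro-batches and divides each micro-batch loss by that factor.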
--- .../ernie-1.0/finetune/finetune.py | 19 +- .../finetune/sequence_classification.py | 7 +- .../finetune/token_classification.py | 1 + .../ernie-1.0/finetune/trainer_args.py | 1 + .../ernie-1.0/finetune/trainer_base.py | 281 +++++++++++------- .../ernie-1.0/finetune/trainer_callback.py | 2 +- .../ernie-1.0/finetune/trainer_utils.py | 106 ++++++- 7 files changed, 298 insertions(+), 119 deletions(-) diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index b2da87dafaf0..d8554a46ee8d 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -279,8 +279,23 @@ def do_train(args): tokenizer, args, test_ds=all_ds["test"]) - trainer.train() - trainer.eval() + + train_result = trainer.train(resume_from_checkpoint=None) + metrics = train_result.metrics + + # max_train_samples = ( + # data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + # ) + # metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.save_model() # Saves the tokenizer too for easy upload + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # trainer.train() + # trainer.eval() def print_arguments(args): diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index bd2fde51a9e1..f598eac948a9 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -353,12 +353,7 @@ def compute_metrics(p): metric.update(result) accu = metric.accumulate() metric.reset() - return {"eval_accuracy": accu} - - # return { - # "accuracy": (preds == p.label_ids).astype(np.float32).mean() - # .item() - # } + return {"accuracy": accu} super().__init__( model, diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py index 851112290f9c..c6ecb2776c12 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -1,3 +1,4 @@ +# Copyright 2020-present the HuggingFace Inc. team. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/language_model/ernie-1.0/finetune/trainer_args.py b/examples/language_model/ernie-1.0/finetune/trainer_args.py index d8bf99d25f75..fca87a350a50 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_args.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_args.py @@ -1,3 +1,4 @@ +# Copyright 2020-present the HuggingFace Inc. team. # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/language_model/ernie-1.0/finetune/trainer_base.py b/examples/language_model/ernie-1.0/finetune/trainer_base.py index 497224aa544a..2cdaf4977e6d 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_base.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_base.py @@ -1,3 +1,4 @@ +# Copyright 2020-present the HuggingFace Inc. team. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -42,9 +43,15 @@ from trainer_args import (TrainingArguments, ) # from trainer_callback import TrainerState, TrainerControl -from trainer_utils import (IntervalStrategy, EvaluationStrategy, EvalPrediction, - PredictionOutput, EvalLoopOutput, speed_metrics, - OptimizerNames) +from trainer_utils import ( + IntervalStrategy, + EvaluationStrategy, + TrainOutput, + EvalPrediction, + PredictionOutput, + EvalLoopOutput, + speed_metrics, + OptimizerNames, ) from trainer_callback import ( CallbackHandler, @@ -206,6 +213,7 @@ class Trainer: in `train`) """ + from trainer_utils import log_metrics, metrics_format, save_metrics, save_state def __init__( self, @@ -273,6 +281,7 @@ def __init__( self.state = TrainerState() self.control = TrainerControl() + callbacks = DEFAULT_CALLBACKS self.callback_handler = CallbackHandler(callbacks, self.model, self.tokenizer, self.optimizer, @@ -298,26 +307,51 @@ def __init__( "QusetionAnswering" in type(self.model).__name__ else ["labels"]) self.label_names = default_label_names if self.args.label_names is None else self.args.label_names + + self.control = self.callback_handler.on_init_end(self.args, self.state, + self.control) self.print_config() def add_callback(self, callback): """ - Add a callback to the current list of [`~transformer.TrainerCallback`]. + Add a callback to the current list of [`~TrainerCallback`]. Args: - callback (`type` or [`~transformer.TrainerCallback`]): - A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + callback (`type` or [`~TrainerCallback`]): + A [`~TrainerCallback`] class or an instance of a [`~TrainerCallback`]. In the first case, will instantiate a member of that class. """ self.callback_handler.add_callback(callback) + def pop_callback(self, callback): + """ + Remove a callback from the current list of [`~transformer.TrainerCallback`] and returns it. + If the callback is not found, returns `None` (and no error is raised). + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will pop the first member of that class found in the list of callbacks. + Returns: + [`~transformer.TrainerCallback`]: The callback removed, if found. + """ + return self.callback_handler.pop_callback(callback) + + def remove_callback(self, callback): + """ + Remove a callback from the current list of [`~transformer.TrainerCallback`]. + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will remove the first member of that class found in the list of callbacks. 
+ """ + self.callback_handler.remove_callback(callback) + def train( self, resume_from_checkpoint: Optional[Union[str, bool]]=None, ignore_keys_for_eval: Optional[List[str]]=None, **kwargs, ): - print("training!!!!") - logger.info("training!!!!") + train_dataloader = self.get_train_dataloader() model = self._wrap_model(self.model_wrapped) @@ -338,11 +372,13 @@ def train( num_train_epochs = math.ceil(args.num_train_epochs) num_train_samples = len(self.train_dataset) * args.num_train_epochs - if args.num_training_steps // args.eval_steps < args.minimum_eval_times: - exp_step = args.num_training_steps / args.minimum_eval_times - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Set eval step to %d" % exp_step) - args.eval_steps = exp_step + if args.minimum_eval_times is not None and args.minimum_eval_times > 0: + if args.num_training_steps // args.eval_steps < args.minimum_eval_times: + exp_step = args.num_training_steps / args.minimum_eval_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Reset eval step by minimum_eval_times to %d" % + exp_step) + args.eval_steps = exp_step self.create_optimizer_and_scheduler( num_training_steps=args.num_training_steps) @@ -365,16 +401,14 @@ def train( self.state.epoch = 0 self.state.max_steps = int(args.num_training_steps) self.state.num_train_epochs = num_train_epochs - self.state.is_local_process_zero = 0 - self.state.is_world_process_zero = 0 + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() start_time = time.time() epochs_trained = 0 steps_trained_in_current_epoch = 0 steps_trained_progress_bar = None - self.training_bar = tqdm(total=self.state.max_steps) - epoch_iterator = train_dataloader steps_in_epoch = len(epoch_iterator) @@ -397,26 +431,111 @@ def train( args, self.state, self.control) for step, inputs in enumerate(epoch_iterator): - # print(inputs) - # print("=="*20) - tr_loss_step = self.training_step(model, inputs) + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin( + args, self.state, self.control) + + if (((step + 1) % args.gradient_accumulation_steps != 0) and + args.local_rank != -1 and + args._no_sync_in_gradient_accumulation): + # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. 
+ with model.no_sync(): + tr_loss_step = self.training_step(model, inputs) + else: + tr_loss_step = self.training_step(model, inputs) + # self.scaler.step(self.optimizer) # self.scaler.update() tr_loss += tr_loss_step - self.training_bar.update(1) - self.optimizer.step() - self.lr_scheduler.step() - self.optimizer.clear_grad() + if (step + 1) % args.gradient_accumulation_steps == 0 or ( + # last step in epoch but step is always smaller than gradient_accumulation_steps + steps_in_epoch <= args.gradient_accumulation_steps and + (step + 1) == steps_in_epoch): + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() - self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1) / steps_in_epoch - self.control = self.callback_handler.on_step_end( - args, self.state, self.control) + self.control = self.callback_handler.on_step_end( + args, self.state, self.control) + + self._maybe_log_save_evaluate(tr_loss, model, epoch, + ignore_keys_for_eval) + else: + self.control = self.callback_handler.on_substep_end( + args, self.state, self.control) + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + + if step < 0: + logger.warning( + f"There seems to be not a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True - self._maybe_log_save_evaluate(tr_loss, model, epoch, - ignore_keys_for_eval) + self.control = self.callback_handler.on_epoch_end(args, self.state, + self.control) + self._maybe_log_save_evaluate(tr_loss, model, epoch, + ignore_keys_for_eval) + + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info( + "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n" + ) + if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: + if args.local_rank != -1: + dist.barrier() + + logger.info( + f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + ) + + best_model_path = os.path.join(self.state.best_model_checkpoint, + WEIGHTS_NAME) + if os.path.exists(best_model_path): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = torch.load(best_model_path, map_location="cpu") + # If the model is on the GPU, it still works! + self._load_state_dict_in_model(state_dict) + else: + logger.warning( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." 
+ ) + + self._total_loss_scalar += tr_loss.item() + train_loss = self._total_loss_scalar / self.state.global_step + + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps) + + metrics["total_flos"] = self.state.total_flos + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self.log(metrics) + + self.control = self.callback_handler.on_train_end(args, self.state, + self.control) + + return TrainOutput(self.state.global_step, train_loss, metrics) def training_step( self, model: nn.Layer, @@ -464,7 +583,7 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, tr_loss_scalar = tr_loss.mean().item() # reset tr_loss to zero - tr_loss -= tr_loss + tr_loss.subtract_(tr_loss) logs["loss"] = round(tr_loss_scalar / ( self.state.global_step - self._globalstep_last_logged), 4) @@ -477,7 +596,6 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, metrics = None if self.control.should_evaluate: - logger.info("evaluating!!!!!") metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) if self.control.should_save: @@ -740,21 +858,18 @@ def compute_loss(self, model, inputs, return_outputs=False): How the loss is computed by Trainer. By default, all models return the loss in the first element. Subclass and override for custom behavior. """ - if self.criterion is not None: + if self.criterion is not None and "labels" in inputs: labels = inputs.pop("labels") else: labels = None - # print(inputs) - outputs = model(**inputs) - # outputs = model(*inputs) - if self.criterion is not None: # print(outputs) loss = self.criterion(outputs, labels) outputs = (loss, outputs) + # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.args.past_index >= 0: @@ -795,15 +910,14 @@ def training_step( loss = loss.mean( ) # mean() to average on multi-gpu parallel training - # if self.args.gradient_accumulation_steps > 1: - # # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward` - # loss = loss / self.args.gradient_accumulation_steps + if self.args.gradient_accumulation_steps > 1: + loss = loss / self.args.gradient_accumulation_steps if self.do_grad_scaling: self.scaler.scale(loss).backward() else: loss.backward() - # print(loss) + return loss.detach() def save_model(self, output_dir: Optional[str]=None): @@ -832,7 +946,6 @@ def _save_checkpoint(self, model, metrics=None): self.save_model(output_dir) if self.args.should_save: - # deepspeed.save_checkpoint above saves model/optim/sched paddle.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) with warnings.catch_warnings(record=True) as caught_warnings: @@ -867,6 +980,8 @@ def _save_checkpoint(self, model, metrics=None): "numpy": np.random.get_state(), } + # TODO: ZHUI save paddle, cudnn seed. + # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may # not yet exist. 
os.makedirs(output_dir, exist_ok=True) @@ -1138,9 +1253,13 @@ def evaluation_loop( else: metrics = {} - metrics["eval_loss"] = float(np.mean(losses)) + if losses is not None: + metrics[f"{metric_key_prefix}_loss"] = float(np.mean(losses)) - print(metrics) + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) return EvalLoopOutput( predictions=all_preds, @@ -1245,78 +1364,24 @@ def num_examples(self, dataloader: DataLoader) -> int: """ return len(dataloader.dataset) - def create_dataloader(self, - dataset, - mode='train', - batch_size=16, - batchify_fn=None, - trans_fn=None, - batched=False): + def is_local_process_zero(self) -> bool: """ + Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several + machines) main process. """ - if trans_fn: - dataset = dataset.map(trans_fn, batched=batched) - - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) + return self.args.local_process_index == 0 - def eval(self, *args, **kwargs): + def is_world_process_zero(self) -> bool: """ + Whether or not this process is the global main process (when training in a distributed fashion on several + machines, this is only going to be `True` for one process). """ - pass + return self.args.process_index == 0 - def prepare_train_config(self): + def eval(self, *args, **kwargs): """ """ - if self.args.max_steps > 0: - self.args.num_training_steps = self.args.max_steps - self.args.num_train_epochs = math.ceil( - self.args.num_training_steps / len(self.train_dl)) - - else: - self.args.num_training_steps = len( - self.train_dl) * self.args.num_train_epochs - self.args.num_train_epochs = self.args.num_train_epochs - - if self.args.num_training_steps // self.args.eval_steps < self.args.minimum_eval_times: - exp_step = self.args.num_training_steps / self.args.minimum_eval_times - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Set eval step to %d" % exp_step) - self.args.eval_steps = exp_step - - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio - - self.lr_scheduler = LinearDecayWithWarmup( - self.args.learning_rate, self.args.num_training_steps, warmup) - - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. 
- decay_params = [ - p.name for n, p in self.model.named_parameters() - if not any(nd in n for nd in ["bias", "norm"]) - ] - - self.optimizer = paddle.optimizer.AdamW( - learning_rate=self.lr_scheduler, - beta1=0.9, - beta2=0.999, - epsilon=self.args.adam_epsilon, - parameters=self.model.parameters(), - weight_decay=self.args.weight_decay, - apply_decay_param_fun=lambda x: x in decay_params, - grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + pass def print_config(self): """ diff --git a/examples/language_model/ernie-1.0/finetune/trainer_callback.py b/examples/language_model/ernie-1.0/finetune/trainer_callback.py index 7336da8fd321..95689ff46400 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_callback.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_callback.py @@ -1,5 +1,5 @@ -# coding=utf-8 # Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/language_model/ernie-1.0/finetune/trainer_utils.py b/examples/language_model/ernie-1.0/finetune/trainer_utils.py index cfa4392ba489..978d44e49469 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_utils.py +++ b/examples/language_model/ernie-1.0/finetune/trainer_utils.py @@ -1,5 +1,5 @@ -# coding=utf-8 # Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,9 @@ """ Utilities for the Trainer class. """ - +import datetime +import json +import math import copy import functools import gc @@ -209,3 +211,103 @@ class SchedulerType(ExplicitEnum): POLYNOMIAL = "polynomial" CONSTANT = "constant" CONSTANT_WITH_WARMUP = "constant_with_warmup" + + +def _secs2timedelta(secs): + """ + convert seconds to hh:mm:ss.msec, msecs rounded to 2 decimals + """ + + msec = int(abs(secs - int(secs)) * 100) + return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}" + + +def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]: + """ + Reformat Trainer metrics values to a human-readable format + Args: + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predict + Returns: + metrics (`Dict[str, float]`): The reformatted metrics + """ + + metrics_copy = metrics.copy() + for k, v in metrics_copy.items(): + if "_mem_" in k: + metrics_copy[k] = f"{ v >> 20 }MB" + elif "_runtime" in k: + metrics_copy[k] = _secs2timedelta(v) + elif k == "total_flos": + metrics_copy[k] = f"{ int(v) >> 30 }GF" + elif type(metrics_copy[k]) == float: + metrics_copy[k] = round(v, 4) + + return metrics_copy + + +def log_metrics(self, split, metrics): + """ + Log metrics in a specially formatted way + Under distributed environment this is done only for a process with rank 0. 
+    Args:
+        split (`str`):
+            Mode/split name: one of `train`, `eval`, `test`
+        metrics (`Dict[str, float]`):
+            The metrics returned from train/evaluate/predict
+    """
+    if not self.is_world_process_zero():
+        return
+
+    print(f"***** {split} metrics *****")
+    metrics_formatted = self.metrics_format(metrics)
+    k_width = max(len(str(x)) for x in metrics_formatted.keys())
+    v_width = max(len(str(x)) for x in metrics_formatted.values())
+    for key in sorted(metrics_formatted.keys()):
+        print(f"  {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}")
+
+
+def save_metrics(self, split, metrics, combined=True):
+    """
+    Save metrics into a json file for that split, e.g. `train_results.json`.
+    Under distributed environment this is done only for a process with rank 0.
+    Args:
+        split (`str`):
+            Mode/split name: one of `train`, `eval`, `test`, `all`
+        metrics (`Dict[str, float]`):
+            The metrics returned from train/evaluate/predict
+        combined (`bool`, *optional*, defaults to `True`):
+            Creates combined metrics by updating `all_results.json` with metrics of this call
+    To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw
+    unformatted numbers are saved in the current method.
+    """
+    if not self.is_world_process_zero():
+        return
+
+    path = os.path.join(self.args.output_dir, f"{split}_results.json")
+    with open(path, "w") as f:
+        json.dump(metrics, f, indent=4, sort_keys=True)
+
+    if combined:
+        path = os.path.join(self.args.output_dir, "all_results.json")
+        if os.path.exists(path):
+            with open(path, "r") as f:
+                all_metrics = json.load(f)
+        else:
+            all_metrics = {}
+
+        all_metrics.update(metrics)
+        with open(path, "w") as f:
+            json.dump(all_metrics, f, indent=4, sort_keys=True)
+
+
+def save_state(self):
+    """
+    Saves the Trainer state, since Trainer.save_model saves only the model and tokenizer.
+    Under distributed environment this is done only for a process with rank 0.
+    """
+    if not self.is_world_process_zero():
+        return
+
+    path = os.path.join(self.args.output_dir, "trainer_state.json")
+    self.state.save_to_json(path)

From fdadab9680557636aeb6990d5ef0cfc729a6383c Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Thu, 24 Mar 2022 15:35:43 +0800
Subject: [PATCH 07/15] support multi-cards training evaluation.
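
This commit promotes the Trainer into the paddlenlp.trainer package and adds PdArgumentParser,
which maps dataclass fields to command-line flags. A minimal sketch of the new entry-point
wiring, assuming the ModelArguments and DataTrainingArguments dataclasses defined in
finetune.py below (the command line shown is illustrative only):

    # Sketch only: ModelArguments / DataTrainingArguments are the dataclasses from finetune.py.
    from paddlenlp.trainer import PdArgumentParser, TrainingArguments

    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # e.g. python finetune.py --model_name_or_path <model> --dataset <dataset> --output_dir ./tmp

parse_args_into_dataclasses raises an error on unrecognized flags unless
return_remaining_strings=True is passed, so stale command-line arguments surface immediately.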
--- .../ernie-1.0/finetune/finetune.py | 150 ++++++-- .../ernie-1.0/finetune/question_answering.py | 2 +- .../finetune/sequence_classification.py | 18 +- .../finetune/token_classification.py | 2 +- paddlenlp/__init__.py | 1 + paddlenlp/trainer/__init__.py | 17 + paddlenlp/trainer/argparser.py | 268 ++++++++++++++ .../trainer}/trainer_args.py | 30 +- .../trainer}/trainer_base.py | 263 +++++++------ .../trainer}/trainer_callback.py | 6 +- .../trainer}/trainer_utils.py | 11 + paddlenlp/trainer/utils/logging.py | 344 ++++++++++++++++++ 12 files changed, 948 insertions(+), 164 deletions(-) create mode 100644 paddlenlp/trainer/__init__.py create mode 100644 paddlenlp/trainer/argparser.py rename {examples/language_model/ernie-1.0/finetune => paddlenlp/trainer}/trainer_args.py (98%) rename {examples/language_model/ernie-1.0/finetune => paddlenlp/trainer}/trainer_base.py (88%) rename {examples/language_model/ernie-1.0/finetune => paddlenlp/trainer}/trainer_callback.py (99%) rename {examples/language_model/ernie-1.0/finetune => paddlenlp/trainer}/trainer_utils.py (97%) create mode 100644 paddlenlp/trainer/utils/logging.py diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index d8554a46ee8d..b001a3e1f70b 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -22,6 +22,8 @@ from functools import partial import distutils.util import os.path as osp +from dataclasses import dataclass, field +from typing import Optional import numpy as np import paddle @@ -29,11 +31,15 @@ import paddle.nn.functional as F import paddlenlp +from paddlenlp.trainer import ( + PdArgumentParser, + TrainingArguments, ) from paddlenlp.datasets import load_dataset -from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer -from paddlenlp.transformers import AutoModelForTokenClassification -from paddlenlp.transformers import AutoModelForQuestionAnswering -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.transformers import ( + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelForQuestionAnswering, + AutoTokenizer, ) from paddlenlp.utils.log import logger sys.path.insert(0, os.path.abspath(".")) @@ -208,6 +214,84 @@ def parse_args(): return args +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + dataset: str = field( + default=None, + metadata={ + "help": "The name of the dataset to use (via the datasets library)." + }) + + max_seq_length: int = field( + default=128, + metadata={ + "help": + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, ) + + # Additional configs for QA task. + doc_stride: int = field( + default=128, + metadata={ + "help": + "When splitting up a long document into chunks, how much stride to take between chunks." + }, ) + + n_best_size: int = field( + default=20, + metadata={ + "help": + "The total number of n-best predictions to generate in the nbest_predictions.json output file." 
+ }, ) + + max_query_length: int = field( + default=64, + metadata={"help": "Max query length."}, ) + + max_answer_length: int = field( + default=30, + metadata={"help": "Max answer length."}, ) + + do_lower_case: bool = field( + default=False, + metadata={ + "help": + "Whether to lower case the input text. Should be True for uncased models and False for cased models." + }, ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field(metadata={ + "help": + "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" + }) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": + "Pretrained config name or path if not the same as model_name" + }) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": + "Pretrained tokenizer name or path if not the same as model_name" + }) + + def set_seed(args): # Use the same data seed(for data shuffle) for all procs to guarantee data # consistency after sharding. @@ -218,78 +302,85 @@ def set_seed(args): paddle.seed(args.seed) -def do_train(args): - paddle.set_device(args.device) +def do_train(): + parser = PdArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + paddle.set_device(training_args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() - set_seed(args) - args.dataset = args.dataset.strip() + # set_seed(args) + data_args.dataset = data_args.dataset.strip() - if args.dataset not in ALL_DATASETS: - raise ValueError("Not found {}".format(args.dataset)) + if data_args.dataset not in ALL_DATASETS: + raise ValueError("Not found {}".format(data_args.dataset)) - config = ALL_DATASETS[args.dataset] - for arg in vars(args): - if getattr(args, arg) is None: + config = ALL_DATASETS[data_args.dataset] + for args in (model_args, data_args, training_args): + for arg in vars(args): + # if getattr(args, arg) is None: if arg in config.keys(): setattr(args, arg, config[arg]) - dataset_config = args.dataset.split(" ") + training_args.per_device_train_batch_size = config["batch_size"] + training_args.per_device_eval_batch_size = config["batch_size"] + + dataset_config = data_args.dataset.split(" ") all_ds = load_dataset( dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], # lazy=False ) - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - args.label_list = getattr(all_ds['train'], "label_list", None) + data_args.label_list = getattr(all_ds['train'], "label_list", None) num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ 'train'].label_list) model = getattr(paddlenlp.transformers, config["model"]).from_pretrained( - args.model_name_or_path, num_classes=num_classes) + model_args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if "SequenceClassification" in config["model"]: - if 'clue' in args.dataset: + if 'clue' in data_args.dataset: trainer = ClueTrainer(all_ds["train"], all_ds["dev"], model, - tokenizer, args) + tokenizer, training_args) else: trainer = SeqTrainer( all_ds["train"], all_ds["dev"], model, tokenizer, - args, + data_args, + training_args, 
test_ds=all_ds["test"]) + elif "QuestionAnswering" in config["model"]: trainer = MrcTrainer(all_ds["train"], all_ds["dev"], model, tokenizer, - args) + training_args) elif 'TokenClassification' in config["model"]: trainer = NerTrainer( all_ds["train"], all_ds["dev"], model, tokenizer, - args, + training_args, test_ds=all_ds["test"]) train_result = trainer.train(resume_from_checkpoint=None) metrics = train_result.metrics - # max_train_samples = ( - # data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - # ) - # metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - trainer.save_model() # Saves the tokenizer too for easy upload + # trainer.save_infer_model() -> 部署, onnx, slim, 量化后可否加速 + trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() @@ -307,6 +398,7 @@ def print_arguments(args): if __name__ == "__main__": - args = parse_args() + # args = parse_args() + # print_arguments(args) - do_train(args) + do_train() diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py index 258601d48b45..64ef323fe5a7 100644 --- a/examples/language_model/ernie-1.0/finetune/question_answering.py +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -22,7 +22,7 @@ from paddlenlp.data import Pad, Stack, Tuple, Dict from paddlenlp.metrics.squad import squad_evaluate, compute_prediction -from trainer_base import TrainerBase +from paddlenlp.trainer.trainer_base import TrainerBase from paddlenlp.utils.log import logger diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index f598eac948a9..a7dd0170608c 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -26,7 +26,7 @@ from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.utils.log import logger -from trainer_base import TrainerBase, Trainer +from paddlenlp.trainer.trainer_base import TrainerBase, Trainer def convert_example(example, tokenizer, max_seq_length=512, is_test=False): @@ -331,11 +331,11 @@ def dataloader_inner(self): class SeqTrainer(Trainer): - def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, - **kwargs): + def __init__(self, train_ds, dev_ds, model, tokenizer, data_args, + training_args, *arg, **kwargs): - trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=args) - batchify_fn = clue_batchify_fn_dict(tokenizer, args) + trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args) + batchify_fn = clue_batchify_fn_dict(tokenizer, data_args) train_ds = train_ds.map(trans_fn) dev_ds = dev_ds.map(trans_fn) @@ -346,10 +346,14 @@ def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, def compute_metrics(p): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + + preds = paddle.to_tensor(preds) + label = paddle.to_tensor(p.label_ids) + probs = F.softmax(preds, axis=1) metric = Accuracy() metric.reset() - result = metric.compute(preds, p.label_ids) + result = metric.compute(preds, label) metric.update(result) accu = metric.accumulate() metric.reset() @@ -358,7 +362,7 @@ def compute_metrics(p): super().__init__( model, loss_fct, - args, + training_args, batchify_fn, train_ds, dev_ds, diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py 
b/examples/language_model/ernie-1.0/finetune/token_classification.py index c6ecb2776c12..2e6f6344bbca 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -30,7 +30,7 @@ from paddlenlp.data import Stack, Tuple, Pad, Dict from paddlenlp.utils.log import logger -from trainer_base import TrainerBase +from paddlenlp.trainer.trainer_base import TrainerBase @paddle.no_grad() diff --git a/paddlenlp/__init__.py b/paddlenlp/__init__.py index 4aea54cd30c1..7e2eae95b3b6 100644 --- a/paddlenlp/__init__.py +++ b/paddlenlp/__init__.py @@ -31,6 +31,7 @@ from . import losses from . import experimental from .taskflow import Taskflow +from . import trainer import paddle paddle.disable_signal_handler() diff --git a/paddlenlp/trainer/__init__.py b/paddlenlp/trainer/__init__.py new file mode 100644 index 000000000000..f2a3ca873fc6 --- /dev/null +++ b/paddlenlp/trainer/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you smay not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .argparser import PdArgumentParser +from .trainer_args import TrainingArguments +from .trainer_base import Trainer \ No newline at end of file diff --git a/paddlenlp/trainer/argparser.py b/paddlenlp/trainer/argparser.py new file mode 100644 index 000000000000..d7df0c9406cf --- /dev/null +++ b/paddlenlp/trainer/argparser.py @@ -0,0 +1,268 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# # Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import json +import sys +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError +from copy import copy +from enum import Enum +from inspect import isclass +from pathlib import Path +from typing import Any, Dict, Iterable, NewType, Optional, Tuple, Union, get_type_hints + +DataClass = NewType("DataClass", Any) +DataClassType = NewType("DataClassType", Any) + + +# From https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse +def string_to_bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ArgumentTypeError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." 
+ ) + + +class PdArgumentParser(ArgumentParser): + """ + This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments. + + The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed) + arguments to the parser after initialization and you'll get the output back after parsing as an additional + namespace. Optional: To create sub argument groups use the `_argument_group_name` attribute in the dataclass. + """ + + dataclass_types: Iterable[DataClassType] + + def __init__(self, + dataclass_types: Union[DataClassType, Iterable[DataClassType]], + **kwargs): + """ + Args: + dataclass_types: + Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. + kwargs: + (Optional) Passed to `argparse.ArgumentParser()` in the regular way. + """ + # To make the default appear when using --help + if "formatter_class" not in kwargs: + kwargs["formatter_class"] = ArgumentDefaultsHelpFormatter + super().__init__(**kwargs) + if dataclasses.is_dataclass(dataclass_types): + dataclass_types = [dataclass_types] + self.dataclass_types = list(dataclass_types) + for dtype in self.dataclass_types: + self._add_dataclass_arguments(dtype) + + @staticmethod + def _parse_dataclass_field(parser: ArgumentParser, + field: dataclasses.Field): + field_name = f"--{field.name}" + kwargs = field.metadata.copy() + # field.metadata is not used at all by Data Classes, + # it is provided as a third-party extension mechanism. + if isinstance(field.type, str): + raise RuntimeError( + "Unresolved type detected, which should have been done with the help of " + "`typing.get_type_hints` method by default") + + origin_type = getattr(field.type, "__origin__", field.type) + if origin_type is Union: + if len(field.type.__args__) != 2 or type( + None) not in field.type.__args__: + raise ValueError( + "Only `Union[X, NoneType]` (i.e., `Optional[X]`) is allowed for `Union`" + ) + if bool not in field.type.__args__: + # filter `NoneType` in Union (except for `Union[bool, NoneType]`) + field.type = (field.type.__args__[0] + if isinstance(None, field.type.__args__[1]) else + field.type.__args__[1]) + origin_type = getattr(field.type, "__origin__", field.type) + + # A variable to store kwargs for a boolean field, if needed + # so that we can init a `no_*` complement argument (see below) + bool_kwargs = {} + if isinstance(field.type, type) and issubclass(field.type, Enum): + kwargs["choices"] = [x.value for x in field.type] + kwargs["type"] = type(kwargs["choices"][0]) + if field.default is not dataclasses.MISSING: + kwargs["default"] = field.default + else: + kwargs["required"] = True + elif field.type is bool or field.type is Optional[bool]: + # Copy the currect kwargs to use to instantiate a `no_*` complement argument below. + # We do not initialize it here because the `no_*` alternative must be instantiated after the real argument + bool_kwargs = copy(kwargs) + + # Hack because type=bool in argparse does not behave as we want. + kwargs["type"] = string_to_bool + if field.type is bool or (field.default is not None and + field.default is not dataclasses.MISSING): + # Default value is False if we have no default when of type bool. + default = False if field.default is dataclasses.MISSING else field.default + # This is the value that will get picked if we don't include --field_name in any way + kwargs["default"] = default + # This tells argparse we accept 0 or 1 value after --field_name + kwargs["nargs"] = "?" 
+ # This is the value that will get picked if we do --field_name (without value) + kwargs["const"] = True + elif isclass(origin_type) and issubclass(origin_type, list): + kwargs["type"] = field.type.__args__[0] + kwargs["nargs"] = "+" + if field.default_factory is not dataclasses.MISSING: + kwargs["default"] = field.default_factory() + elif field.default is dataclasses.MISSING: + kwargs["required"] = True + else: + kwargs["type"] = field.type + if field.default is not dataclasses.MISSING: + kwargs["default"] = field.default + elif field.default_factory is not dataclasses.MISSING: + kwargs["default"] = field.default_factory() + else: + kwargs["required"] = True + parser.add_argument(field_name, **kwargs) + + # Add a complement `no_*` argument for a boolean field AFTER the initial field has already been added. + # Order is important for arguments with the same destination! + # We use a copy of earlier kwargs because the original kwargs have changed a lot before reaching down + # here and we do not need those changes/additional keys. + if field.default is True and (field.type is bool or + field.type is Optional[bool]): + bool_kwargs["default"] = False + parser.add_argument( + f"--no_{field.name}", + action="store_false", + dest=field.name, + **bool_kwargs) + + def _add_dataclass_arguments(self, dtype: DataClassType): + if hasattr(dtype, "_argument_group_name"): + parser = self.add_argument_group(dtype._argument_group_name) + else: + parser = self + + try: + type_hints: Dict[str, type] = get_type_hints(dtype) + except NameError: + raise RuntimeError( + f"Type resolution failed for f{dtype}. Try declaring the class in global scope or " + f"removing line of `from __future__ import annotations` which opts in Postponed " + f"Evaluation of Annotations (PEP 563)") + + for field in dataclasses.fields(dtype): + if not field.init: + continue + field.type = type_hints[field.name] + self._parse_dataclass_field(parser, field) + + def parse_args_into_dataclasses( + self, + args=None, + return_remaining_strings=False, + look_for_args_file=True, + args_filename=None) -> Tuple[DataClass, ...]: + """ + Parse command-line args into instances of the specified dataclass types. + + This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: + docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args + + Args: + args: + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + return_remaining_strings: + If true, also return a list of remaining argument strings. + look_for_args_file: + If true, will look for a ".args" file with the same base name as the entry point script for this + process, and will append its potential content to the command line args. + args_filename: + If not None, will uses this file instead of the ".args" file specified in the previous argument. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser + after initialization. + - The potential list of remaining argument strings. 
(same as argparse.ArgumentParser.parse_known_args) + """ + if args_filename or (look_for_args_file and len(sys.argv)): + if args_filename: + args_file = Path(args_filename) + else: + args_file = Path(sys.argv[0]).with_suffix(".args") + + if args_file.exists(): + fargs = args_file.read_text().split() + args = fargs + args if args is not None else fargs + sys.argv[ + 1:] + # in case of duplicate arguments the first one has precedence + # so we append rather than prepend. + namespace, remaining_args = self.parse_known_args(args=args) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in vars(namespace).items() if k in keys} + for k in keys: + delattr(namespace, k) + obj = dtype(**inputs) + outputs.append(obj) + if len(namespace.__dict__) > 0: + # additional namespace. + outputs.append(namespace) + if return_remaining_strings: + return (*outputs, remaining_args) + else: + if remaining_args: + raise ValueError( + f"Some specified arguments are not used by the PdArgumentParser: {remaining_args}" + ) + + return (*outputs, ) + + def parse_json_file(self, json_file: str) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + """ + data = json.loads(Path(json_file).read_text()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in data.items() if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + return (*outputs, ) + + def parse_dict(self, args: dict) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead uses a dict and populating the dataclass + types. + """ + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in args.items() if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + return (*outputs, ) diff --git a/examples/language_model/ernie-1.0/finetune/trainer_args.py b/paddlenlp/trainer/trainer_args.py similarity index 98% rename from examples/language_model/ernie-1.0/finetune/trainer_args.py rename to paddlenlp/trainer/trainer_args.py index fca87a350a50..03a06adb2c55 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_args.py +++ b/paddlenlp/trainer/trainer_args.py @@ -23,9 +23,12 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from utils import logging -from trainer_utils import (SchedulerType, IntervalStrategy, EvaluationStrategy, - OptimizerNames) +from .utils import logging +from .trainer_utils import ( + SchedulerType, + IntervalStrategy, + EvaluationStrategy, + OptimizerNames, ) logger = logging.get_logger(__name__) log_levels = logging.get_log_levels_dict().copy() @@ -33,10 +36,6 @@ import paddle -# logger = logging.get_logger(__name__) -log_levels = logging.get_log_levels_dict().copy() -trainer_log_levels = dict(**log_levels, passive=-1) - def default_logdir() -> str: """ @@ -450,6 +449,17 @@ class TrainingArguments: ("For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. 
" "See details at https://nvidia.github.io/apex/amp.html") }, ) + + scale_loss: float = field( + default=2**15, metadata={"help": "The value of scale_loss for fp16."}) + + minimum_eval_times: int = field( + default=None, + metadata={ + "help": + "If under eval_steps, the valid time is less then minimum_eval_times, the config of override eval_steps." + }) + local_rank: int = field( default=-1, metadata={"help": "For distributed training: local_rank"}) @@ -551,12 +561,6 @@ class TrainingArguments: "The path to a folder with a valid checkpoint for your model." }, ) - gradient_checkpointing: bool = field( - default=False, - metadata={ - "help": - "If True, use gradient checkpointing to save memory at the expense of slower backward pass." - }, ) _n_gpu: int = field(init=False, repr=False, default=-1) def __post_init__(self): diff --git a/examples/language_model/ernie-1.0/finetune/trainer_base.py b/paddlenlp/trainer/trainer_base.py similarity index 88% rename from examples/language_model/ernie-1.0/finetune/trainer_base.py rename to paddlenlp/trainer/trainer_base.py index 2cdaf4977e6d..6aac42ffc685 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -37,13 +37,14 @@ from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger from paddle.io import DataLoader, DistributedBatchSampler +import paddle.amp.auto_cast as autocast import numpy as np -from trainer_args import (TrainingArguments, ) +from .trainer_args import (TrainingArguments, ) # from trainer_callback import TrainerState, TrainerControl -from trainer_utils import ( +from .trainer_utils import ( IntervalStrategy, EvaluationStrategy, TrainOutput, @@ -53,7 +54,7 @@ speed_metrics, OptimizerNames, ) -from trainer_callback import ( +from .trainer_callback import ( CallbackHandler, DefaultFlowCallback, PrinterCallback, @@ -64,7 +65,9 @@ DEFAULT_CALLBACKS = [DefaultFlowCallback] -from utils import logging +from .utils import logging +from .utils.helper import (distributed_concat, nested_concat, nested_detach, + nested_numpify, nested_truncate) from paddlenlp.transformers.model_utils import PretrainedModel, unwrap_model from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer @@ -75,7 +78,8 @@ class DataCollator: - pass + def __init__(self, *args, **kwargs): + pass class DataCollatorWithPadding: @@ -83,53 +87,6 @@ def __init__(self, *args, **kwargs): pass -def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100): - """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" - if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: - return paddle.concat((tensor1, tensor2), axis=0) - - raise ValueError("pass") - # Let's figure out the new shape - new_shape = (tensor1.shape[0] + tensor2.shape[0], max( - tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:] - - # Now let's fill the result tensor - result = tensor1.new_full(new_shape, padding_index) - result[:tensor1.shape[0], :tensor1.shape[1]] = tensor1 - result[tensor1.shape[0]:, :tensor2.shape[1]] = tensor2 - return result - - -def nested_concat(tensors, new_tensors, padding_index=-100): - """ - Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or - nested list/tuples of tensors. - """ - assert type(tensors) == type( - new_tensors - ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}." 
- if isinstance(tensors, (list, tuple)): - return type(tensors)(nested_concat( - t, n, padding_index=padding_index) - for t, n in zip(tensors, new_tensors)) - elif isinstance(tensors, paddle.Tensor): - return paddle_pad_and_concatenate( - tensors, new_tensors, padding_index=padding_index) - elif isinstance(tensors, np.ndarray): - return numpy_pad_and_concatenate( - tensors, new_tensors, padding_index=padding_index) - else: - raise TypeError( - f"Unsupported type for concatenation: got {type(tensors)}") - - -def nested_detach(tensors): - "Detach `tensors` (even if it's a nested list/tuple of tensors)." - if isinstance(tensors, (list, tuple)): - return type(tensors)(nested_detach(t) for t in tensors) - return tensors.detach() - - # Name of the files used for checkpointing TRAINING_ARGS_NAME = "training_args.bin" TRAINER_STATE_NAME = "trainer_state.json" @@ -148,6 +105,7 @@ def set_seed(seed): # Maybe different op seeds(for dropout) for different procs is better. By: # `paddle.seed(args.seed + paddle.distributed.get_rank())` paddle.seed(seed) + # TODO: cuda state seed class Trainer: @@ -213,7 +171,7 @@ class Trainer: in `train`) """ - from trainer_utils import log_metrics, metrics_format, save_metrics, save_state + from .trainer_utils import log_metrics, metrics_format, save_metrics, save_state def __init__( self, @@ -227,8 +185,6 @@ def __init__( compute_metrics: Optional[Callable[[EvalPrediction], Dict]]=None, optimizers: Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler]=(None, None), ): - logger.info("init!!!!") - if args is None: output_dir = "tmp_trainer" logger.info( @@ -236,22 +192,6 @@ def __init__( ) args = TrainingArguments(output_dir=output_dir) - output_dir = "tmp_trainer" - new_args = TrainingArguments(output_dir=output_dir) - - args.per_device_train_batch_size = args.batch_size - args.per_device_eval_batch_size = args.batch_size - - for arg in vars(args): - v = getattr(args, arg) - if v is not None: - try: - setattr(new_args, arg, v) - except Exception as e: - print(arg, v) - pass - args = new_args - self.args = args self.do_grad_scaling = args.fp16 @@ -301,6 +241,8 @@ def __init__( ) if args.fp16: + self.scaler = paddle.amp.GradScaler( + init_loss_scaling=self.args.scale_loss) logger.info(f"Using half precision") default_label_names = (["start_positions", "end_positions"] if @@ -360,15 +302,17 @@ def train( total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + num_update_steps_per_epoch = len( + train_dataloader) // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + if args.max_steps > 0: args.num_training_steps = args.max_steps - num_train_epochs = math.ceil(args.num_training_steps / - len(train_dataloader)) + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0) num_train_samples = args.max_steps * total_train_batch_size - else: - args.num_training_steps = len( - train_dataloader) * args.num_train_epochs + args.num_training_steps = num_update_steps_per_epoch * args.num_train_epochs num_train_epochs = math.ceil(args.num_train_epochs) num_train_samples = len(self.train_dataset) * args.num_train_epochs @@ -394,7 +338,9 @@ def train( logger.info( f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size}" ) - logger.info(f" Gradient Accumulation steps = {1}") + logger.info( + f" Gradient Accumulation steps = {args.gradient_accumulation_steps}" + ) logger.info(f" Total optimization steps = {args.num_training_steps}") logger.info(f" Total num train samples = {num_train_samples}") @@ -445,15 +391,17 @@ def train( else: tr_loss_step = self.training_step(model, inputs) - # self.scaler.step(self.optimizer) - # self.scaler.update() tr_loss += tr_loss_step if (step + 1) % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch): - self.optimizer.step() + if self.do_grad_scaling: + self.scaler.minimize(self.optimizer, tr_loss) + else: + self.optimizer.step() + self.lr_scheduler.step() self.optimizer.clear_grad() @@ -492,9 +440,7 @@ def train( # Clean the state at the end of training delattr(self, "_past") - logger.info( - "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n" - ) + logger.info("\nTraining completed. \n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: if args.local_rank != -1: dist.barrier() @@ -507,7 +453,7 @@ def train( WEIGHTS_NAME) if os.path.exists(best_model_path): # We load the model state dict on the CPU to avoid an OOM error. - state_dict = torch.load(best_model_path, map_location="cpu") + state_dict = paddle.load(best_model_path, map_location="cpu") # If the model is on the GPU, it still works! self._load_state_dict_in_model(state_dict) else: @@ -525,7 +471,6 @@ def train( num_samples=num_train_samples, num_steps=self.state.max_steps) - metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss self.is_in_train = False @@ -555,22 +500,18 @@ def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: return None if self.args.world_size <= 1: - # return RandomSampler(self.train_dataset) - return DistributedBatchSampler( - self.train_dataset, - # num_replicas=self.args.world_size, - # rank=self.args.process_index, - batch_size=self.args.train_batch_size, + return paddle.io.BatchSampler( + dataset=self.train_dataset, shuffle=True, - # seed=self.args.seed, - ) + batch_size=self.args.per_device_train_batch_size, + drop_last=False) else: return DistributedBatchSampler( self.train_dataset, - # num_replicas=self.args.world_size, - # rank=self.args.process_index, - # seed=self.args.seed, - ) + batch_size=self.args.per_device_train_batch_size, + shuffle=True, + num_replicas=self.args.world_size, + rank=self.args.process_index, ) def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval): @@ -579,15 +520,17 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, logs: Dict[str, float] = {} # all_gather + mean() to get average loss over all processes - # tr_loss_scalar = self._nested_gather(tr_loss).mean().item() - tr_loss_scalar = tr_loss.mean().item() + tr_loss_scalar = self._nested_gather(tr_loss).mean().item() + # tr_loss_scalar = tr_loss.mean().item() # reset tr_loss to zero tr_loss.subtract_(tr_loss) + # tr_loss.zero_() logs["loss"] = round(tr_loss_scalar / ( self.state.global_step - self._globalstep_last_logged), 4) logs["learning_rate"] = self._get_learning_rate() + logs["global_step"] = int(self.state.global_step) self._total_loss_scalar += tr_loss_scalar self._globalstep_last_logged = self.state.global_step @@ -846,7 +789,13 @@ def 
autocast_smart_context_manager(self): arguments, depending on the situation. """ if self.args.fp16: - ctx_manager = autocast() + ctx_manager = autocast( + True, + custom_black_list=[ + "reduce_sum", "c_softmax_with_cross_entropy", + "elementwise_div" + ], + level=self.args.fp16_opt_level) else: ctx_manager = contextlib.nullcontext() if sys.version_info >= ( 3, 7) else contextlib.suppress() @@ -866,7 +815,6 @@ def compute_loss(self, model, inputs, return_outputs=False): outputs = model(**inputs) if self.criterion is not None: - # print(outputs) loss = self.criterion(outputs, labels) outputs = (loss, outputs) @@ -1069,8 +1017,9 @@ def _save(self, output_dir: Optional[str]=None, state_dict=None): if isinstance(unwrap_model(self.model), PretrainedModel): if state_dict is None: state_dict = self.model.state_dict() - unwrap_model(self.model).save_pretrained( - output_dir, state_dict=state_dict) + # unwrap_model(self.model).save_pretrained( + # output_dir, state_dict=state_dict) + unwrap_model(self.model).save_pretrained(output_dir) else: logger.info( "Trainer.model is not a `PretrainedModel`, only saving its state dict." @@ -1202,7 +1151,7 @@ def evaluation_loop( model = self._wrap_model(self.model, training=False) - batch_size = dataloader.batch_size + batch_size = dataloader.batch_sampler.batch_size num_samples = self.num_examples(dataloader) logger.info(f"***** Running {description} *****") logger.info(f" Num examples = {num_samples}") @@ -1236,13 +1185,51 @@ def evaluation_loop( # Prediction step loss, logits, labels = self.prediction_step( model, inputs, prediction_loss_only, ignore_keys=ignore_keys) - losses.append(loss.numpy()) + # Update containers on host + if loss is not None: + # losses = self._nested_gather(loss.repeat(batch_size)) + losses = self._nested_gather( + paddle.tile( + loss, repeat_times=[batch_size, 1])) + losses_host = losses if losses_host is None else paddle.concat( + (losses_host, losses), axis=0) + if labels is not None: + labels = self._pad_across_processes(labels) + labels = self._nested_gather(labels) + labels_host = labels if labels_host is None else nested_concat( + labels_host, labels, padding_index=-100) + if logits is not None: + logits = self._pad_across_processes(logits) + logits = self._nested_gather(logits) + preds_host = logits if preds_host is None else nested_concat( + preds_host, logits, padding_index=-100) + self.control = self.callback_handler.on_prediction_step( + args, self.state, self.control) + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate( + (all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) all_preds = logits if all_preds is None else nested_concat( all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) all_labels = labels if all_labels is None else nested_concat( all_labels, labels, padding_index=-100) + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of + # samplers has been rounded to a multiple of batch_size, so we truncate. + if all_losses is not None: + all_losses = all_losses[:num_samples] + if all_preds is not None: + all_preds = nested_truncate(all_preds, num_samples) + if all_labels is not None: + all_labels = nested_truncate(all_labels, num_samples) + model.train() # Metrics! 
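The evaluation loop above tiles each per-step loss to `batch_size` rows before gathering, so that after concatenation across steps (and ranks) there is one loss entry per example; because a distributed sampler may pad the dataset to a multiple of the total batch size, everything is truncated back to `num_samples` at the end. A single-process sketch of that bookkeeping, with dummy sizes and no distributed gather:

import paddle

batch_size, num_samples = 4, 10  # three eval steps; the last batch is padded from 2 to 4
step_losses = [paddle.to_tensor([0.7]), paddle.to_tensor([0.5]), paddle.to_tensor([0.9])]

# Mirrors paddle.tile(loss, repeat_times=[batch_size, 1]) followed by concatenation.
all_losses = paddle.concat(
    [paddle.tile(loss, repeat_times=[batch_size, 1]) for loss in step_losses], axis=0)

all_losses = all_losses.numpy()[:num_samples]  # drop the sampler's padded tail
eval_loss = float(all_losses.mean())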
@@ -1253,8 +1240,11 @@ def evaluation_loop( else: metrics = {} - if losses is not None: - metrics[f"{metric_key_prefix}_loss"] = float(np.mean(losses)) + # if losses is not None: + # metrics[f"{metric_key_prefix}_loss"] = float(np.mean(losses)) + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() # Prefix all keys with metric_key_prefix + '_' for key in list(metrics.keys()): @@ -1289,7 +1279,7 @@ def prediction_step( Args: model (`nn.Module`): The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): The inputs and targets of the model. The dictionary will be unpacked before being fed to the model. Most models expect the targets under the @@ -1301,7 +1291,7 @@ def prediction_step( gathering predictions. Return: - Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: A tuple with the loss, logits and labels (each being optional). """ has_labels = all(inputs.get(k) is not None for k in self.label_names) @@ -1358,7 +1348,7 @@ def prediction_step( def num_examples(self, dataloader: DataLoader) -> int: """ - Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. + Helper to get number of samples in a [`~paddle.io.DataLoader`] by accessing its dataset. Will raise an exception if the underlying dataset does not implement method `__len__` """ @@ -1378,6 +1368,57 @@ def is_world_process_zero(self) -> bool: """ return self.args.process_index == 0 + def _nested_gather(self, tensors, name=None): + """ + Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before + concatenating them to `gathered` + """ + if tensors is None: + return + if self.args.local_rank != -1: + tensors = distributed_concat(tensors) + return tensors + + # Copied from Accelerate. + def _pad_across_processes(self, tensor, pad_index=-100): + """ + Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so + they can safely be gathered. + """ + if isinstance(tensor, (list, tuple)): + return type(tensor)(self._pad_across_processes( + t, pad_index=pad_index) for t in tensor) + elif isinstance(tensor, dict): + return type(tensor)({ + k: self._pad_across_processes( + v, pad_index=pad_index) + for k, v in tensor.items() + }) + elif not isinstance(tensor, paddle.Tensor): + raise TypeError( + f"Can't pad the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors." 
+ ) + + if len(tensor.shape) < 2: + return tensor + # Gather all sizes + size = paddle.to_tensor(tensor.shape)[None] + sizes = self._nested_gather(size).cpu() + + max_size = max(s[1] for s in sizes) + if tensor.shape[1] == max_size: + return tensor + + # Then pad to the maximum size + old_size = tensor.shape + new_size = list(old_size) + new_size[1] = max_size + # new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index + new_tensor = paddle.zeros( + tuple(new_size), dtype=tensor.dtype) + pad_index + new_tensor[:, :old_size[1]] = tensor + return new_tensor + def eval(self, *args, **kwargs): """ """ @@ -1386,11 +1427,13 @@ def eval(self, *args, **kwargs): def print_config(self): """ """ + logger.info("==" * 40) logger.info('{:^40}'.format("Configuration Arguments")) logger.info('{:30}:{}'.format("paddle commit id", paddle.version.commit)) for arg in vars(self.args): logger.info('{:30}:{}'.format(arg, getattr(self.args, arg))) + logger.info("==" * 40) class TrainerBase(object): diff --git a/examples/language_model/ernie-1.0/finetune/trainer_callback.py b/paddlenlp/trainer/trainer_callback.py similarity index 99% rename from examples/language_model/ernie-1.0/finetune/trainer_callback.py rename to paddlenlp/trainer/trainer_callback.py index 95689ff46400..f4cec4d6108f 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_callback.py +++ b/paddlenlp/trainer/trainer_callback.py @@ -23,9 +23,9 @@ import numpy as np from tqdm.auto import tqdm -from trainer_utils import IntervalStrategy -from trainer_args import TrainingArguments -from utils import logging +from .trainer_utils import IntervalStrategy, has_length +from .trainer_args import TrainingArguments +from .utils import logging logger = logging.get_logger(__name__) diff --git a/examples/language_model/ernie-1.0/finetune/trainer_utils.py b/paddlenlp/trainer/trainer_utils.py similarity index 97% rename from examples/language_model/ernie-1.0/finetune/trainer_utils.py rename to paddlenlp/trainer/trainer_utils.py index 978d44e49469..97407824d4fa 100644 --- a/examples/language_model/ernie-1.0/finetune/trainer_utils.py +++ b/paddlenlp/trainer/trainer_utils.py @@ -311,3 +311,14 @@ def save_state(self): path = os.path.join(self.args.output_dir, "trainer_state.json") self.state.save_to_json(path) + + +def has_length(dataset): + """ + Checks if the dataset implements __len__() and it doesn't raise an error + """ + try: + return len(dataset) is not None + except TypeError: + # TypeError: len() of unsized object + return False diff --git a/paddlenlp/trainer/utils/logging.py b/paddlenlp/trainer/utils/logging.py new file mode 100644 index 000000000000..15814dccd143 --- /dev/null +++ b/paddlenlp/trainer/utils/logging.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2020 Optuna, Hugging Face +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
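`has_length`, added to trainer_utils above, is a small guard for objects that may not implement `__len__`, such as iterable-style datasets. A quick illustration; the import path follows the rename target in this patch:

from paddlenlp.trainer.trainer_utils import has_length

def sample_stream():
    yield from range(10)

print(has_length([1, 2, 3]))        # True: lists are sized
print(has_length(sample_stream()))  # False: len() on a generator raises TypeError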
+""" Logging utilities.""" + +import logging +import os +import sys +import threading +from logging import CRITICAL # NOQA +from logging import DEBUG # NOQA +from logging import ERROR # NOQA +from logging import FATAL # NOQA +from logging import INFO # NOQA +from logging import NOTSET # NOQA +from logging import WARN # NOQA +from logging import WARNING # NOQA +from typing import Optional + +from tqdm import auto as tqdm_lib + +_lock = threading.Lock() +_default_handler: Optional[logging.Handler] = None + +log_levels = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + +_default_log_level = logging.WARNING + +_tqdm_active = True + + +def _get_default_logging_level(): + """ + If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is + not - fall back to `_default_log_level` + """ + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) + if env_level_str: + if env_level_str in log_levels: + return log_levels[env_level_str] + else: + logging.getLogger().warning( + f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, " + f"has to be one of: { ', '.join(log_levels.keys()) }") + return _default_log_level + + +def _get_library_name() -> str: + + return __name__.split(".")[0] + + +def _get_library_root_logger() -> logging.Logger: + + return logging.getLogger(_get_library_name()) + + +def _configure_library_root_logger() -> None: + + global _default_handler + + with _lock: + if _default_handler: + # This library has already configured the library root logger. + return + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. + _default_handler.flush = sys.stderr.flush + + # Apply our default configuration to the library root logger. + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_get_default_logging_level()) + library_root_logger.propagate = False + + +def _reset_library_root_logger() -> None: + + global _default_handler + + with _lock: + if not _default_handler: + return + + library_root_logger = _get_library_root_logger() + library_root_logger.removeHandler(_default_handler) + library_root_logger.setLevel(logging.NOTSET) + _default_handler = None + + +def get_log_levels_dict(): + return log_levels + + +def get_logger(name: Optional[str]=None) -> logging.Logger: + """ + Return a logger with the specified name. + + This function is not supposed to be directly accessed unless you are writing a custom transformers module. + """ + + if name is None: + name = _get_library_name() + + _configure_library_root_logger() + return logging.getLogger(name) + + +def get_verbosity() -> int: + """ + Return the current level for the 🤗 Transformers's root logger as an int. + + Returns: + `int`: The logging level. + + + + 🤗 Transformers has following logging levels: + + - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL` + - 40: `transformers.logging.ERROR` + - 30: `transformers.logging.WARNING` or `transformers.logging.WARN` + - 20: `transformers.logging.INFO` + - 10: `transformers.logging.DEBUG` + + """ + + _configure_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + """ + Set the verbosity level for the 🤗 Transformers's root logger. 
+ + Args: + verbosity (`int`): + Logging level, e.g., one of: + + - `transformers.logging.CRITICAL` or `transformers.logging.FATAL` + - `transformers.logging.ERROR` + - `transformers.logging.WARNING` or `transformers.logging.WARN` + - `transformers.logging.INFO` + - `transformers.logging.DEBUG` + """ + + _configure_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_info(): + """Set the verbosity to the `INFO` level.""" + return set_verbosity(INFO) + + +def set_verbosity_warning(): + """Set the verbosity to the `WARNING` level.""" + return set_verbosity(WARNING) + + +def set_verbosity_debug(): + """Set the verbosity to the `DEBUG` level.""" + return set_verbosity(DEBUG) + + +def set_verbosity_error(): + """Set the verbosity to the `ERROR` level.""" + return set_verbosity(ERROR) + + +def disable_default_handler() -> None: + """Disable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().removeHandler(_default_handler) + + +def enable_default_handler() -> None: + """Enable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().addHandler(_default_handler) + + +def add_handler(handler: logging.Handler) -> None: + """adds a handler to the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None + _get_library_root_logger().addHandler(handler) + + +def remove_handler(handler: logging.Handler) -> None: + """removes given handler from the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None and handler not in _get_library_root_logger( + ).handlers + _get_library_root_logger().removeHandler(handler) + + +def disable_propagation() -> None: + """ + Disable propagation of the library log outputs. Note that log propagation is disabled by default. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = False + + +def enable_propagation() -> None: + """ + Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to + prevent double logging if the root logger has been configured. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = True + + +def enable_explicit_format() -> None: + """ + Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows: + ``` + [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE + ``` + All handlers currently bound to the root logger are affected by this method. + """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + formatter = logging.Formatter( + "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + handler.setFormatter(formatter) + + +def reset_format() -> None: + """ + Resets the formatting for HuggingFace Transformers's loggers. + + All handlers currently bound to the root logger are affected by this method. 
+ """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + handler.setFormatter(None) + + +def warning_advice(self, *args, **kwargs): + """ + This method is identical to `logger.warning()`, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this + warning will not be printed + """ + no_advisory_warnings = os.getenv("TRANSFORMERS_NO_ADVISORY_WARNINGS", False) + if no_advisory_warnings: + return + self.warning(*args, **kwargs) + + +logging.Logger.warning_advice = warning_advice + + +class EmptyTqdm: + """Dummy tqdm which doesn't do anything.""" + + def __init__(self, *args, **kwargs): # pylint: disable=unused-argument + self._iterator = args[0] if args else None + + def __iter__(self): + return iter(self._iterator) + + def __getattr__(self, _): + """Return empty function.""" + + def empty_fn(*args, **kwargs): # pylint: disable=unused-argument + return + + return empty_fn + + def __enter__(self): + return self + + def __exit__(self, type_, value, traceback): + return + + +class _tqdm_cls: + def __call__(self, *args, **kwargs): + if _tqdm_active: + return tqdm_lib.tqdm(*args, **kwargs) + else: + return EmptyTqdm(*args, **kwargs) + + def set_lock(self, *args, **kwargs): + self._lock = None + if _tqdm_active: + return tqdm_lib.tqdm.set_lock(*args, **kwargs) + + def get_lock(self): + if _tqdm_active: + return tqdm_lib.tqdm.get_lock() + + +tqdm = _tqdm_cls() + + +def is_progress_bar_enabled() -> bool: + """Return a boolean indicating whether tqdm progress bars are enabled.""" + global _tqdm_active + return bool(_tqdm_active) + + +def enable_progress_bar(): + """Enable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = True + + +def disable_progress_bar(): + """Enable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = False From 984ff9803e01f7af9ecb329b5fc4c72084210ec7 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Thu, 24 Mar 2022 21:57:08 +0800 Subject: [PATCH 08/15] support load from ckpt. --- .../ernie-1.0/finetune/finetune.py | 5 +- paddlenlp/trainer/trainer_args.py | 19 +- paddlenlp/trainer/trainer_base.py | 184 +++++++++++++----- paddlenlp/trainer/trainer_utils.py | 15 ++ paddlenlp/trainer/utils/helper.py | 90 +++++++++ 5 files changed, 242 insertions(+), 71 deletions(-) create mode 100644 paddlenlp/trainer/utils/helper.py diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index b001a3e1f70b..320c9ff77aa3 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -374,7 +374,10 @@ def do_train(): training_args, test_ds=all_ds["test"]) - train_result = trainer.train(resume_from_checkpoint=None) + resume_from_checkpoint = training_args.resume_from_checkpoint + if training_args.resume_from_checkpoint is None: + resume_from_checkpoint = True + train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/paddlenlp/trainer/trainer_args.py b/paddlenlp/trainer/trainer_args.py index 03a06adb2c55..68575e1feb96 100644 --- a/paddlenlp/trainer/trainer_args.py +++ b/paddlenlp/trainer/trainer_args.py @@ -561,8 +561,6 @@ class TrainingArguments: "The path to a folder with a valid checkpoint for your model." }, ) - _n_gpu: int = field(init=False, repr=False, default=-1) - def __post_init__(self): # Handle --use_env option in paddle.distributed.launch (local_rank not passed as an arg then). 
# This needs to happen before any call to self.device or self.n_gpu. @@ -672,7 +670,7 @@ def train_batch_size(self) -> int: """ The actual batch size for training. """ - train_batch_size = self.per_device_train_batch_size * max(1, self.n_gpu) + train_batch_size = self.per_device_train_batch_size return train_batch_size @property @@ -680,22 +678,9 @@ def eval_batch_size(self) -> int: """ The actual batch size for evaluation. """ - eval_batch_size = self.per_device_eval_batch_size * max(1, self.n_gpu) + eval_batch_size = self.per_device_eval_batch_size return eval_batch_size - @property - def n_gpu(self): - """ - The number of GPUs used by this process. - - Note: - This will only be greater than one when you have multiple GPUs available but are not using distributed - training. For distributed training, it will always be 1. - """ - # Make sure `self._n_gpu` is properly setup. - # _ = self._setup_devices - return self._n_gpu - @property def device(self) -> "paddle.device": """ diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py index 6aac42ffc685..f3a642074571 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -24,26 +24,26 @@ import sys import time import warnings +import types from collections.abc import Mapping from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from tqdm.auto import tqdm - +import numpy as np import paddle import paddle.nn as nn -import paddle.nn.functional as F - +import paddle.amp.auto_cast as autocast +from paddle.io import ( + Dataset, + DataLoader, + DistributedBatchSampler, ) from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.transformers.model_utils import PretrainedModel, unwrap_model +from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer from paddlenlp.utils.log import logger -from paddle.io import DataLoader, DistributedBatchSampler -import paddle.amp.auto_cast as autocast - -import numpy as np from .trainer_args import (TrainingArguments, ) -# from trainer_callback import TrainerState, TrainerControl - from .trainer_utils import ( IntervalStrategy, EvaluationStrategy, @@ -52,8 +52,9 @@ PredictionOutput, EvalLoopOutput, speed_metrics, - OptimizerNames, ) - + OptimizerNames, + PREFIX_CHECKPOINT_DIR, + get_last_checkpoint, ) from .trainer_callback import ( CallbackHandler, DefaultFlowCallback, @@ -62,19 +63,16 @@ TrainerCallback, TrainerControl, TrainerState, ) - -DEFAULT_CALLBACKS = [DefaultFlowCallback] - -from .utils import logging -from .utils.helper import (distributed_concat, nested_concat, nested_detach, - nested_numpify, nested_truncate) - -from paddlenlp.transformers.model_utils import PretrainedModel, unwrap_model -from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer - +from .utils.helper import ( + distributed_concat, + nested_concat, + nested_detach, + nested_numpify, + nested_truncate, ) +# from .utils import logging # logger = logging.get_logger(__name__) -from paddle.io import Dataset +DEFAULT_CALLBACKS = [DefaultFlowCallback] class DataCollator: @@ -94,7 +92,8 @@ def __init__(self, *args, **kwargs): SCHEDULER_NAME = "scheduler.pdparams" SCALER_NAME = "scaler.pdparams" -PREFIX_CHECKPOINT_DIR = "training" +WEIGHTS_NAME = "model_state.pdparams" +CONFIG_NAME = "model_config.json" def set_seed(seed): @@ -113,13 +112,13 @@ class Trainer: Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers. 
Args: - model ([`PretrainedModel`] or `paddle.nn.Module`, *optional*): + model ([`PretrainedModel`] or `paddle.nn.Layer`, *optional*): The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed. [`Trainer`] is optimized to work with the [`PretrainedModel`] provided by the library. You can still use - your own models defined as `paddle.nn.Module` as long as they work the same way as the 🤗 Transformers + your own models defined as `paddle.nn.Layer` as long as they work the same way as the 🤗 Transformers models. @@ -294,10 +293,39 @@ def train( ignore_keys_for_eval: Optional[List[str]]=None, **kwargs, ): + args = self.args + resume_from_checkpoint = None if not resume_from_checkpoint else resume_from_checkpoint + + model_reloaded = False + + # Load potential model checkpoint + if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: + resume_from_checkpoint = get_last_checkpoint(args.output_dir) + if resume_from_checkpoint is None: + raise ValueError( + f"No valid checkpoint found in output directory ({args.output_dir})" + ) + + if resume_from_checkpoint is not None: + if not os.path.isfile( + os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + raise ValueError( + f"Can't find a valid checkpoint at {resume_from_checkpoint}") + + logger.info(f"Loading model from {resume_from_checkpoint} .") + + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load( + os.path.join(resume_from_checkpoint, WEIGHTS_NAME)) + # If the model is on the GPU, it still works! + self._set_state_dict_in_model(state_dict) + + # release memory + del state_dict + train_dataloader = self.get_train_dataloader() model = self._wrap_model(self.model_wrapped) - args = self.args self.state = TrainerState() total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size @@ -327,6 +355,9 @@ def train( self.create_optimizer_and_scheduler( num_training_steps=args.num_training_steps) + # Check if saved optimizer or scheduler states exist + self._load_optimizer_and_scheduler(resume_from_checkpoint) + num_examples = len(self.train_dataset) logger.info("***** Running training *****") @@ -455,7 +486,7 @@ def train( # We load the model state dict on the CPU to avoid an OOM error. state_dict = paddle.load(best_model_path, map_location="cpu") # If the model is on the GPU, it still works! - self._load_state_dict_in_model(state_dict) + self._set_state_dict_in_model(state_dict) else: logger.warning( f"Could not locate the best model at {best_model_path}, if you are running a distributed training " @@ -511,7 +542,11 @@ def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: batch_size=self.args.per_device_train_batch_size, shuffle=True, num_replicas=self.args.world_size, - rank=self.args.process_index, ) + rank=self.args.process_index, + drop_last=False) + + def _set_state_dict_in_model(self, state_dict): + load_result = self.model.set_state_dict(state_dict) def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval): @@ -743,9 +778,6 @@ def _wrap_model(self, model, training=True): if unwrap_model(model) is not model: return model - if self.args.n_gpu > 1: - model = nn.DistributedDataParallel(model) - # Note: in paddle.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. if not training: @@ -837,7 +869,7 @@ def training_step( Subclass and override to inject custom behavior. 
Args: - model (`nn.Module`): + model (`nn.Layer`): The model to train. inputs (`Dict[str, Union[paddle.Tensor, Any]]`): The inputs and targets of the model. @@ -854,10 +886,6 @@ def training_step( with self.autocast_smart_context_manager(): loss = self.compute_loss(model, inputs) - if self.args.n_gpu > 1: - loss = loss.mean( - ) # mean() to average on multi-gpu parallel training - if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps @@ -1045,18 +1073,16 @@ def _load_optimizer_and_scheduler(self, checkpoint): os.path.join(checkpoint, SCHEDULER_NAME)): # Load in optimizer and scheduler states map_location = self.args.device - self.optimizer.load_state_dict( - paddle.load( - os.path.join(checkpoint, OPTIMIZER_NAME), - map_location=map_location)) - with warnings.catch_warnings(record=True) as caught_warnings: - self.lr_scheduler.load_state_dict( - paddle.load(os.path.join(checkpoint, SCHEDULER_NAME))) - reissue_pt_warnings(caught_warnings) + self.optimizer.set_state_dict( + paddle.load(os.path.join(checkpoint, OPTIMIZER_NAME))) + self.lr_scheduler.set_state_dict( + paddle.load(os.path.join(checkpoint, SCHEDULER_NAME))) if self.do_grad_scaling and os.path.isfile( os.path.join(checkpoint, SCALER_NAME)): self.scaler.load_state_dict( - paddle.load(os.path.join(checkpoint, SCALER_NAME))) + paddle.load( + os.path.join(checkpoint, SCALER_NAME), + return_numpy=True)) def log(self, logs: Dict[str, float]) -> None: """ @@ -1155,7 +1181,9 @@ def evaluation_loop( num_samples = self.num_examples(dataloader) logger.info(f"***** Running {description} *****") logger.info(f" Num examples = {num_samples}") - logger.info(f" Batch size = {batch_size}") + logger.info(f" Pre device batch size = {batch_size}") + logger.info(f" Total Batch size = {batch_size * self.args.world_size}") + logger.info(f" Total prediction steps = {len(dataloader)}") model.eval() @@ -1167,7 +1195,7 @@ def evaluation_loop( self._past = None # Initialize containers - # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + # losses/preds/labels on GPU (accumulated for eval_accumulation_steps) losses_host = None preds_host = None labels_host = None @@ -1261,7 +1289,52 @@ def predict(self, test_dataset: Dataset, ignore_keys: Optional[List[str]]=None, metric_key_prefix: str="test") -> PredictionOutput: - pass + """ + Run prediction and returns predictions and potential metrics. + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"test"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "test_bleu" if the prefix is "test" (default) + + If your predictions or labels have different sequence length (for instance because you're doing dynamic padding + in a token classification task) the predictions will be padded (on the right) to allow for concatenation into + one array. The padding index is -100. 
+ + Returns: *NamedTuple* A namedtuple with the following keys: + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). + """ + test_dataloader = self.get_test_dataloader(test_dataset) + start_time = time.time() + + eval_loop = self.evaluation_loop + output = eval_loop( + test_dataloader, + description="Prediction", + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix) + total_batch_size = self.args.eval_batch_size * self.args.world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), )) + + return PredictionOutput( + predictions=output.predictions, + label_ids=output.label_ids, + metrics=output.metrics) def prediction_step( self, @@ -1277,7 +1350,7 @@ def prediction_step( Subclass and override to inject custom behavior. Args: - model (`nn.Module`): + model (`nn.Layer`): The model to evaluate. inputs (`Dict[str, Union[paddle.Tensor, Any]]`): The inputs and targets of the model. @@ -1427,13 +1500,18 @@ def eval(self, *args, **kwargs): def print_config(self): """ """ - logger.info("==" * 40) + logger.info("=" * 60) logger.info('{:^40}'.format("Configuration Arguments")) logger.info('{:30}:{}'.format("paddle commit id", paddle.version.commit)) - for arg in vars(self.args): - logger.info('{:30}:{}'.format(arg, getattr(self.args, arg))) - logger.info("==" * 40) + + for a in dir(self.args): + if (a[:2] != "__"): #don't print double underscore methods + v = getattr(self.args, a) + if not isinstance(v, types.MethodType): + logger.info('{:30}:{}'.format(a, v)) + + logger.info("=" * 60) class TrainerBase(object): diff --git a/paddlenlp/trainer/trainer_utils.py b/paddlenlp/trainer/trainer_utils.py index 97407824d4fa..2538b97e7d5f 100644 --- a/paddlenlp/trainer/trainer_utils.py +++ b/paddlenlp/trainer/trainer_utils.py @@ -322,3 +322,18 @@ def has_length(dataset): except TypeError: # TypeError: len() of unsized object return False + + +def get_last_checkpoint(folder): + content = os.listdir(folder) + checkpoints = [ + path for path in content + if _re_checkpoint.search(path) is not None and os.path.isdir( + os.path.join(folder, path)) + ] + if len(checkpoints) == 0: + return + return os.path.join( + folder, + max(checkpoints, + key=lambda x: int(_re_checkpoint.search(x).groups()[0]))) diff --git a/paddlenlp/trainer/utils/helper.py b/paddlenlp/trainer/utils/helper.py new file mode 100644 index 000000000000..380a8ea464ff --- /dev/null +++ b/paddlenlp/trainer/utils/helper.py @@ -0,0 +1,90 @@ +import paddle +import paddle.distributed as dist +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + + +def distributed_concat(tensor: Any, + num_total_examples: Optional[int]=None) -> Any: + try: + if isinstance(tensor, (tuple, list)): + return type(tensor)(distributed_concat(t, num_total_examples) + for t in tensor) + output_tensors = [tensor.clone() for _ in range(dist.get_world_size())] + output_tensors = [ + t if len(t.shape) > 0 else t[None] for t in output_tensors + ] + dist.all_gather(output_tensors, tensor) + concat = paddle.concat(output_tensors, axis=0) + + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat + except AssertionError: + 
raise AssertionError("Not currently using distributed training") + + +def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100): + """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" + if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: + return paddle.concat((tensor1, tensor2), axis=0) + + raise ValueError("Error") + # Let's figure out the new shape + new_shape = (tensor1.shape[0] + tensor2.shape[0], max( + tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:] + + # Now let's fill the result tensor + # result = tensor1.new_full(new_shape, padding_index) + result = paddle.full(new_shape, padding_index, dtype=tensor1.dtype) + + result[:tensor1.shape[0], :tensor1.shape[1]] = tensor1 + result[tensor1.shape[0]:, :tensor2.shape[1]] = tensor2 + return result + + +def nested_concat(tensors, new_tensors, padding_index=-100): + """ + Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or + nested list/tuples of tensors. + """ + assert type(tensors) == type( + new_tensors + ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_concat( + t, n, padding_index=padding_index) + for t, n in zip(tensors, new_tensors)) + elif isinstance(tensors, paddle.Tensor): + return paddle_pad_and_concatenate( + tensors, new_tensors, padding_index=padding_index) + elif isinstance(tensors, np.ndarray): + return numpy_pad_and_concatenate( + tensors, new_tensors, padding_index=padding_index) + else: + raise TypeError( + f"Unsupported type for concatenation: got {type(tensors)}") + + +def nested_detach(tensors): + "Detach `tensors` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_detach(t) for t in tensors) + return tensors.detach() + + +def nested_numpify(tensors): + "Numpify `tensors` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_numpify(t) for t in tensors) + t = tensors.cpu() + if t.dtype == paddle.float16: + t = t.cast(paddle.float32) + return t.numpy() + + +def nested_truncate(tensors, limit): + "Truncate `tensors` at `limit` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_truncate(t, limit) for t in tensors) + return tensors[:limit] From 615973d6f7d13e6961924060d4c8ed1275a6b1dd Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 25 Mar 2022 16:47:09 +0800 Subject: [PATCH 09/15] support for export inference model. 
--- .../ernie-1.0/finetune/finetune.py | 51 +++++++++-- .../finetune/sequence_classification.py | 4 + paddlenlp/datasets/chnsenticorp_v2.py | 2 +- paddlenlp/trainer/trainer_args.py | 5 -- paddlenlp/trainer/trainer_base.py | 85 ++++++++++++------- paddlenlp/trainer/trainer_callback.py | 7 ++ 6 files changed, 109 insertions(+), 45 deletions(-) diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index 320c9ff77aa3..79810dc5e28d 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -307,6 +307,30 @@ def do_train(): (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir( + training_args.output_dir + ) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len( + os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome.") + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + paddle.set_device(training_args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: @@ -374,22 +398,33 @@ def do_train(): training_args, test_ds=all_ds["test"]) - resume_from_checkpoint = training_args.resume_from_checkpoint - if training_args.resume_from_checkpoint is None: - resume_from_checkpoint = True - train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint) + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload - # trainer.save_infer_model() -> 部署, onnx, slim, 量化后可否加速 - trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() - # trainer.train() - # trainer.eval() + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + test_ret = trainer.predict(trainer.test_ds) + trainer.log_metrics("test", test_ret.metrics) + + input_spec = [ + paddle.static.InputSpec( + shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, None], dtype="int64") # segment_ids + ] + trainer.export_model(input_spec=input_spec, load_best_model=True) def print_arguments(args): diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index a7dd0170608c..487c0ab4715f 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -340,6 +340,10 @@ def __init__(self, train_ds, dev_ds, model, tokenizer, data_args, train_ds = train_ds.map(trans_fn) dev_ds = dev_ds.map(trans_fn) + if "test_ds" in kwargs.keys(): + test_ds = kwargs["test_ds"] + self.test_ds = test_ds.map(trans_fn) + loss_fct = paddle.nn.loss.CrossEntropyLoss( ) if train_ds.label_list else paddle.nn.loss.MSELoss() diff --git a/paddlenlp/datasets/chnsenticorp_v2.py b/paddlenlp/datasets/chnsenticorp_v2.py index 5e6056a200fa..908f558eb02c 100644 --- a/paddlenlp/datasets/chnsenticorp_v2.py +++ b/paddlenlp/datasets/chnsenticorp_v2.py @@ -60,7 +60,7 @@ def _get_data(self, mode, **kwargs): def _read(self, filename, split): """Reads data.""" with open(filename, 'r', encoding='utf-8') as f: - head = None + head = True for line in f: data = line.strip().split("\t") if not head: diff --git a/paddlenlp/trainer/trainer_args.py b/paddlenlp/trainer/trainer_args.py index 68575e1feb96..d62144a516c3 100644 --- a/paddlenlp/trainer/trainer_args.py +++ b/paddlenlp/trainer/trainer_args.py @@ -280,8 +280,6 @@ class TrainingArguments: The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, `"comet_ml"`, `"mlflow"`, `"tensorboard"` and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` for no integrations. - dataloader_pin_memory (`bool`, *optional*, defaults to `True`): - Whether you want to pin memory in data loaders or not. Will default to `True`. skip_memory_metrics (`bool`, *optional*, defaults to `True`): Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows down the training and evaluation speed. @@ -544,9 +542,6 @@ class TrainingArguments: "The list of integrations to report the results and logs to." 
}) - dataloader_pin_memory: bool = field( - default=True, - metadata={"help": "Whether or not to pin memory for DataLoader."}) skip_memory_metrics: bool = field( default=True, metadata={ diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py index f3a642074571..424739b352db 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -484,7 +484,7 @@ def train( WEIGHTS_NAME) if os.path.exists(best_model_path): # We load the model state dict on the CPU to avoid an OOM error. - state_dict = paddle.load(best_model_path, map_location="cpu") + state_dict = paddle.load(best_model_path) # If the model is on the GPU, it still works! self._set_state_dict_in_model(state_dict) else: @@ -535,7 +535,7 @@ def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: dataset=self.train_dataset, shuffle=True, batch_size=self.args.per_device_train_batch_size, - drop_last=False) + drop_last=self.args.dataloader_drop_last) else: return DistributedBatchSampler( self.train_dataset, @@ -543,7 +543,7 @@ def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: shuffle=True, num_replicas=self.args.world_size, rank=self.args.process_index, - drop_last=False) + drop_last=self.args.dataloader_drop_last) def _set_state_dict_in_model(self, state_dict): load_result = self.model.set_state_dict(state_dict) @@ -556,11 +556,9 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, # all_gather + mean() to get average loss over all processes tr_loss_scalar = self._nested_gather(tr_loss).mean().item() - # tr_loss_scalar = tr_loss.mean().item() # reset tr_loss to zero tr_loss.subtract_(tr_loss) - # tr_loss.zero_() logs["loss"] = round(tr_loss_scalar / ( self.state.global_step - self._globalstep_last_logged), 4) @@ -602,29 +600,25 @@ def get_train_dataloader(self): return DataLoader( train_dataset, - # batch_size=self.args.train_batch_size, batch_sampler=train_sampler, collate_fn=self.data_collator, - # drop_last=self.args.dataloader_drop_last, - num_workers=self.args.dataloader_num_workers, - # pin_memory=self.args.dataloader_pin_memory, - ) + num_workers=self.args.dataloader_num_workers, ) def _get_eval_sampler(self, eval_dataset: Dataset): if self.args.world_size <= 1: - return DistributedBatchSampler( + return paddle.io.BatchSampler( eval_dataset, - # num_replicas=self.args.world_size, - # rank=self.args.process_index, batch_size=self.args.eval_batch_size, shuffle=False, - # seed=self.args.seed, - ) + drop_last=False, ) else: return DistributedBatchSampler( eval_dataset, + num_replicas=self.args.world_size, + rank=self.args.process_index, batch_size=self.args.eval_batch_size, - shuffle=False) + shuffle=False, + drop_last=False, ) def get_eval_dataloader(self, eval_dataset: Optional[Dataset]=None) -> DataLoader: @@ -646,13 +640,9 @@ def get_eval_dataloader(self, return DataLoader( eval_dataset, - # batch_size=self.args.train_batch_size, batch_sampler=eval_sampler, collate_fn=self.data_collator, - # drop_last=self.args.dataloader_drop_last, - num_workers=self.args.dataloader_num_workers, - # pin_memory=self.args.dataloader_pin_memory, - ) + num_workers=self.args.dataloader_num_workers, ) def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: """ @@ -671,11 +661,9 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: # We use the same batch_size as for eval. 
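The dataloader rework above follows one pattern: a plain `paddle.io.BatchSampler` when running on a single process, and a `DistributedBatchSampler` sharded by `world_size`/`rank` otherwise, with the sampler always handed to `DataLoader` via `batch_sampler`. A condensed sketch of that pattern with a dummy dataset:

import paddle
from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler

class ToyDataset(Dataset):
    def __getitem__(self, idx):
        return paddle.to_tensor([idx])

    def __len__(self):
        return 32

dataset = ToyDataset()
world_size = paddle.distributed.get_world_size()
if world_size <= 1:
    sampler = BatchSampler(dataset, batch_size=8, shuffle=True, drop_last=False)
else:
    sampler = DistributedBatchSampler(
        dataset, batch_size=8, shuffle=True,
        num_replicas=world_size, rank=paddle.distributed.get_rank(),
        drop_last=False)

loader = DataLoader(dataset, batch_sampler=sampler, num_workers=0)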
return DataLoader( test_dataset, - sampler=test_sampler, - batch_size=self.args.eval_batch_size, + batch_sampler=test_sampler, collate_fn=self.data_collator, - drop_last=self.args.dataloader_drop_last, - pin_memory=self.args.dataloader_pin_memory, ) + drop_last=self.args.dataloader_drop_last, ) def create_optimizer_and_scheduler(self, num_training_steps: int): """ @@ -909,6 +897,47 @@ def save_model(self, output_dir: Optional[str]=None): if self.args.should_save: self._save(output_dir) + def export_model(self, + input_spec=None, + load_best_model=False, + output_dir: Optional[str]=None): + + if output_dir is None: + output_dir = self.args.output_dir + + if load_best_model and self.state.best_model_checkpoint is not None: + if self.args.local_rank != -1: + dist.barrier() + + logger.info( + f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." + ) + + best_model_path = os.path.join(self.state.best_model_checkpoint, + WEIGHTS_NAME) + if os.path.exists(best_model_path): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load(best_model_path) + # If the model is on the GPU, it still works! + self._set_state_dict_in_model(state_dict) + else: + logger.warning( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." + ) + + model = unwrap_model(self.model) + model.eval() + + # Convert to static graph with specific input description + model = paddle.jit.to_static(model, input_spec=input_spec) + + # Save in static graph model. + save_path = os.path.join(output_dir, "inference", "infer") + logger.info("Exporting inference model to %s" % save_path) + paddle.jit.save(model, save_path) + logger.info("Inference model exported.") + def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" @@ -1072,7 +1101,6 @@ def _load_optimizer_and_scheduler(self, checkpoint): checkpoint, OPTIMIZER_NAME)) and os.path.isfile( os.path.join(checkpoint, SCHEDULER_NAME)): # Load in optimizer and scheduler states - map_location = self.args.device self.optimizer.set_state_dict( paddle.load(os.path.join(checkpoint, OPTIMIZER_NAME))) self.lr_scheduler.set_state_dict( @@ -1492,11 +1520,6 @@ def _pad_across_processes(self, tensor, pad_index=-100): new_tensor[:, :old_size[1]] = tensor return new_tensor - def eval(self, *args, **kwargs): - """ - """ - pass - def print_config(self): """ """ diff --git a/paddlenlp/trainer/trainer_callback.py b/paddlenlp/trainer/trainer_callback.py index f4cec4d6108f..2d33ac0525ab 100644 --- a/paddlenlp/trainer/trainer_callback.py +++ b/paddlenlp/trainer/trainer_callback.py @@ -517,6 +517,13 @@ def on_step_end(self, # End training if state.global_step >= state.max_steps: control.should_training_stop = True + # Log and save on end + if args.logging_strategy == IntervalStrategy.STEPS: + control.should_log = True + if args.evaluation_strategy == IntervalStrategy.STEPS: + control.should_evaluate = True + if args.save_strategy == IntervalStrategy.STEPS: + control.should_save = True return control From b7b2c7795190981a8caeea0857430f297ae02a12 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 25 Mar 2022 21:53:18 +0800 Subject: [PATCH 10/15] first version of trainer. 
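`export_model` above writes the traced static-graph program to `<output_dir>/inference/infer` with `paddle.jit.save`; the saved artifacts can be loaded back with `paddle.jit.load` for a quick sanity check. The path and dummy shapes below are assumptions for illustration (`tmp_trainer` is the trainer's fallback output_dir):

import paddle

model = paddle.jit.load("tmp_trainer/inference/infer")
model.eval()

input_ids = paddle.randint(0, 100, shape=[1, 16], dtype="int64")
token_type_ids = paddle.zeros([1, 16], dtype="int64")
logits = model(input_ids, token_type_ids)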
--- .../ernie-1.0/finetune/finetune.py | 442 ------------------ .../ernie-1.0/finetune/run_seq_cls.py | 184 ++++++++ .../finetune/sequence_classification.py | 50 +- paddlenlp/trainer/trainer_args.py | 9 +- paddlenlp/trainer/trainer_base.py | 6 +- requirements.txt | 3 +- 6 files changed, 199 insertions(+), 495 deletions(-) delete mode 100644 examples/language_model/ernie-1.0/finetune/finetune.py create mode 100644 examples/language_model/ernie-1.0/finetune/run_seq_cls.py diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py deleted file mode 100644 index 79810dc5e28d..000000000000 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ /dev/null @@ -1,442 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import sys -import random -import time -import copy -import yaml -from functools import partial -import distutils.util -import os.path as osp -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -import paddlenlp -from paddlenlp.trainer import ( - PdArgumentParser, - TrainingArguments, ) -from paddlenlp.datasets import load_dataset -from paddlenlp.transformers import ( - AutoModelForSequenceClassification, - AutoModelForTokenClassification, - AutoModelForQuestionAnswering, - AutoTokenizer, ) -from paddlenlp.utils.log import logger - -sys.path.insert(0, os.path.abspath(".")) -from sequence_classification import ClueTrainer, SeqTrainer -from question_answering import MrcTrainer -from token_classification import NerTrainer - -ALL_TASKS = { - "SequenceClassification": [], - "TokenClassification": [], - "QuestionAnswering": [] -} - -for x in dir(paddlenlp.transformers): - for task in ALL_TASKS.keys(): - if x.endswith(task): - if not x.startswith("AutoModel"): - ALL_TASKS[task].append(x) - -CONFIG = yaml.load( - open(osp.join(osp.abspath("."), "./config.yml"), 'r'), - Loader=yaml.FullLoader) -ARGS = CONFIG["DefaultArgs"] -ALL_DATASETS = {} - -for task_type in ALL_TASKS.keys(): - task = CONFIG[task_type] - for data_name in task.keys(): - new_args = task[data_name] - new_args = {} if new_args is None else new_args - final_args = copy.deepcopy(ARGS) - final_args.update(new_args) - final_args["model"] = "AutoModelFor{}".format(task_type) - ALL_DATASETS[data_name] = final_args - - -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--dataset", - default=None, - type=str, - required=True, - help="The name of the dataset to train selected in the list: " + - ", ".join(ALL_DATASETS.keys()), ) - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model or shortcut name selected in the list: " - + - " https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html . 
" - + " Such as ernie-1.0, bert-base-uncased") - - group = parser.add_argument_group(title='Common training configs.') - group.add_argument( - "--max_seq_length", - default=None, - type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - group.add_argument( - "--learning_rate", - default=None, - type=float, - help="The initial learning rate for Adam.") - group.add_argument( - "--batch_size", - default=None, - type=int, - help="Batch size per GPU/CPU for training.", ) - group.add_argument( - "--weight_decay", - default=None, - type=float, - help="Weight decay if we apply some.") - - group.add_argument( - "--num_train_epochs", - default=None, - type=int, - help="Total number of training epochs to perform.", ) - group.add_argument( - "--logging_steps", - type=int, - default=10, - help="Log every X updates steps.") - group.add_argument( - "--eval_steps", - type=int, - default=200, - help="Save checkpoint every X updates steps.") - group.add_argument( - "--minimum_eval_times", - type=int, - default=None, - help="If under eval_steps, the valid time is less then minimum_eval_times, the config of override eval_steps." - ) - group.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.", - ) - group.add_argument( - "--warmup_steps", - default=0, - type=int, - help="Linear warmup over warmup_steps. If > 0: Override warmup_ratio") - group.add_argument( - "--warmup_ratio", - default=0.1, - type=float, - help="Linear warmup proportion over total steps.") - - group = parser.add_argument_group(title='Additional training configs.') - group.add_argument( - "--fp16", - type=distutils.util.strtobool, - default=False, - help="Enable mixed precision training.") - group.add_argument( - "--scale_loss", - type=float, - default=2**15, - help="The value of scale_loss for fp16.") - group.add_argument( - "--adam_epsilon", - default=1e-8, - type=float, - help="Epsilon for Adam optimizer.") - group.add_argument( - "--max_grad_norm", - default=1.0, - type=float, - help="The max value of grad norm.") - - group.add_argument( - "--seed", default=42, type=int, help="random seed for initialization") - group.add_argument( - "--device", - default="gpu", - choices=["cpu", "gpu"], - help="The device to select to train the model, is must be cpu/gpu.") - - group = parser.add_argument_group(title='Additional configs for QA task.') - group.add_argument( - "--doc_stride", - type=int, - default=128, - help="When splitting up a long document into chunks, how much stride to take between chunks." - ) - group.add_argument( - "--n_best_size", - type=int, - default=20, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file." - ) - group.add_argument( - "--max_query_length", type=int, default=64, help="Max query length.") - group.add_argument( - "--max_answer_length", type=int, default=30, help="Max answer length.") - group.add_argument( - "--do_lower_case", - action='store_false', - help="Whether to lower case the input text. Should be True for uncased models and False for cased models." - ) - - args = parser.parse_args() - return args - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- Using `PdArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. - """ - - dataset: str = field( - default=None, - metadata={ - "help": "The name of the dataset to use (via the datasets library)." - }) - - max_seq_length: int = field( - default=128, - metadata={ - "help": - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, ) - - # Additional configs for QA task. - doc_stride: int = field( - default=128, - metadata={ - "help": - "When splitting up a long document into chunks, how much stride to take between chunks." - }, ) - - n_best_size: int = field( - default=20, - metadata={ - "help": - "The total number of n-best predictions to generate in the nbest_predictions.json output file." - }, ) - - max_query_length: int = field( - default=64, - metadata={"help": "Max query length."}, ) - - max_answer_length: int = field( - default=30, - metadata={"help": "Max answer length."}, ) - - do_lower_case: bool = field( - default=False, - metadata={ - "help": - "Whether to lower case the input text. Should be True for uncased models and False for cased models." - }, ) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - model_name_or_path: str = field(metadata={ - "help": - "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" - }) - config_name: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained config name or path if not the same as model_name" - }) - tokenizer_name: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }) - - -def set_seed(args): - # Use the same data seed(for data shuffle) for all procs to guarantee data - # consistency after sharding. - random.seed(args.seed) - np.random.seed(args.seed) - # Maybe different op seeds(for dropout) for different procs is better. By: - # `paddle.seed(args.seed + paddle.distributed.get_rank())` - paddle.seed(args.seed) - - -def do_train(): - parser = PdArgumentParser( - (ModelArguments, DataTrainingArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir( - training_args.output_dir - ) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
- ) - - paddle.set_device(training_args.device) - rank = paddle.distributed.get_rank() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.init_parallel_env() - - # set_seed(args) - data_args.dataset = data_args.dataset.strip() - - if data_args.dataset not in ALL_DATASETS: - raise ValueError("Not found {}".format(data_args.dataset)) - - config = ALL_DATASETS[data_args.dataset] - for args in (model_args, data_args, training_args): - for arg in vars(args): - # if getattr(args, arg) is None: - if arg in config.keys(): - setattr(args, arg, config[arg]) - - training_args.per_device_train_batch_size = config["batch_size"] - training_args.per_device_eval_batch_size = config["batch_size"] - - dataset_config = data_args.dataset.split(" ") - all_ds = load_dataset( - dataset_config[0], - None if len(dataset_config) <= 1 else dataset_config[1], - # lazy=False - ) - - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - - data_args.label_list = getattr(all_ds['train'], "label_list", None) - - num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ - 'train'].label_list) - - model = getattr(paddlenlp.transformers, config["model"]).from_pretrained( - model_args.model_name_or_path, num_classes=num_classes) - - if paddle.distributed.get_world_size() > 1: - model = paddle.DataParallel(model) - - if "SequenceClassification" in config["model"]: - if 'clue' in data_args.dataset: - trainer = ClueTrainer(all_ds["train"], all_ds["dev"], model, - tokenizer, training_args) - else: - trainer = SeqTrainer( - all_ds["train"], - all_ds["dev"], - model, - tokenizer, - data_args, - training_args, - test_ds=all_ds["test"]) - - elif "QuestionAnswering" in config["model"]: - trainer = MrcTrainer(all_ds["train"], all_ds["dev"], model, tokenizer, - training_args) - elif 'TokenClassification' in config["model"]: - trainer = NerTrainer( - all_ds["train"], - all_ds["dev"], - model, - tokenizer, - training_args, - test_ds=all_ds["test"]) - - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - - train_result = trainer.train(resume_from_checkpoint=checkpoint) - metrics = train_result.metrics - - trainer.save_model() # Saves the tokenizer too for easy upload - # trainer.save_infer_model() -> 部署, onnx, slim, 量化后可否加速 - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - eval_metrics = trainer.evaluate() - trainer.log_metrics("eval", eval_metrics) - test_ret = trainer.predict(trainer.test_ds) - trainer.log_metrics("test", test_ret.metrics) - - input_spec = [ - paddle.static.InputSpec( - shape=[None, None], dtype="int64"), # input_ids - paddle.static.InputSpec( - shape=[None, None], dtype="int64") # segment_ids - ] - trainer.export_model(input_spec=input_spec, load_best_model=True) - - -def print_arguments(args): - """print arguments""" - logger.info('{:^40}'.format("Configuration Arguments")) - logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit)) - for arg in vars(args): - logger.info('{:20}:{}'.format(arg, getattr(args, arg))) - - -if __name__ == "__main__": - # args = parse_args() - - # print_arguments(args) - do_train() diff --git a/examples/language_model/ernie-1.0/finetune/run_seq_cls.py b/examples/language_model/ernie-1.0/finetune/run_seq_cls.py new file mode 100644 index 000000000000..54ed161940dd --- /dev/null +++ 
b/examples/language_model/ernie-1.0/finetune/run_seq_cls.py @@ -0,0 +1,184 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import yaml +from functools import partial +import distutils.util +import os.path as osp +from typing import Optional + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.metric import Accuracy + +import paddlenlp +from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, Trainer) +from paddlenlp.datasets import load_dataset +from paddlenlp.transformers import ( + AutoTokenizer, + AutoModelForSequenceClassification, ) +from paddlenlp.utils.log import logger + +sys.path.insert(0, os.path.abspath(".")) +from sequence_classification import seq_trans_fn, defaut_batchify_fn +from utils import ( + ALL_DATASETS, + DataTrainingArguments, + ModelArguments, ) + + +def do_train(): + parser = PdArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + paddle.set_device(training_args.device) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir( + training_args.output_dir + ) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len( + os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome.") + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # set_seed(args) + data_args.dataset = data_args.dataset.strip() + + if data_args.dataset not in ALL_DATASETS: + raise ValueError("Not found dataset {}".format(data_args.dataset)) + + # Use yaml config to rewrite all args. 
+ config = ALL_DATASETS[data_args.dataset] + for args in (model_args, data_args, training_args): + for arg in vars(args): + if arg in config.keys(): + setattr(args, arg, config[arg]) + + training_args.per_device_train_batch_size = config["batch_size"] + training_args.per_device_eval_batch_size = config["batch_size"] + + dataset_config = data_args.dataset.split(" ") + all_ds = load_dataset( + dataset_config[0], + None if len(dataset_config) <= 1 else dataset_config[1], ) + + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + + data_args.label_list = getattr(all_ds['train'], "label_list", None) + + num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ + 'train'].label_list) + + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, num_classes=num_classes) + + trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args) + batchify_fn = defaut_batchify_fn(tokenizer, data_args) + + train_ds = all_ds["train"].map(trans_fn) + dev_ds = all_ds["dev"].map(trans_fn) + test_ds = all_ds["test"].map(trans_fn) + + loss_fct = nn.loss.CrossEntropyLoss( + ) if train_ds.label_list else nn.loss.MSELoss() + + def compute_metrics(p): + preds = p.predictions[0] if isinstance(p.predictions, + tuple) else p.predictions + + preds = paddle.to_tensor(preds) + label = paddle.to_tensor(p.label_ids) + + probs = F.softmax(preds, axis=1) + metric = Accuracy() + metric.reset() + result = metric.compute(preds, label) + metric.update(result) + accu = metric.accumulate() + metric.reset() + return {"accuracy": accu} + + trainer = Trainer( + model, + loss_fct, + training_args, + batchify_fn, + train_ds, + dev_ds, + tokenizer, + compute_metrics=compute_metrics, ) + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + trainer.save_model() # Saves the tokenizer too for easy upload + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + test_ret = trainer.predict(test_ds) + trainer.log_metrics("test", test_ret.metrics) + + input_spec = [ + paddle.static.InputSpec( + shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, None], dtype="int64") # segment_ids + ] + trainer.export_model(input_spec=input_spec, load_best_model=True) + + +def print_arguments(args): + """print arguments""" + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit)) + for arg in vars(args): + logger.info('{:20}:{}'.format(arg, getattr(args, arg))) + + +if __name__ == "__main__": + # args = parse_args() + + # print_arguments(args) + do_train() diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index 487c0ab4715f..4d7ca8e0b37f 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -158,7 +158,7 @@ def __call__(self, data): return ret -def clue_batchify_fn_dict(tokenizer, args): +def defaut_batchify_fn(tokenizer, args): batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=0, 
pad_val=tokenizer.pad_token_id), # input "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment @@ -316,7 +316,7 @@ def train(self): best_dev_acc, corr_test_acc)) -class SeqTrainer2(ClueTrainer): +class SeqTrainer(ClueTrainer): def dataloader_inner(self): trans_fn = partial( seq_trans_fn, tokenizer=self.tokenizer, args=self.args) @@ -327,48 +327,4 @@ def dataloader_inner(self): self.dev_dl = self.create_dataloader( self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) self.test_dl = self.create_dataloader( - self.test_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) - - -class SeqTrainer(Trainer): - def __init__(self, train_ds, dev_ds, model, tokenizer, data_args, - training_args, *arg, **kwargs): - - trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args) - batchify_fn = clue_batchify_fn_dict(tokenizer, data_args) - - train_ds = train_ds.map(trans_fn) - dev_ds = dev_ds.map(trans_fn) - - if "test_ds" in kwargs.keys(): - test_ds = kwargs["test_ds"] - self.test_ds = test_ds.map(trans_fn) - - loss_fct = paddle.nn.loss.CrossEntropyLoss( - ) if train_ds.label_list else paddle.nn.loss.MSELoss() - - def compute_metrics(p): - preds = p.predictions[0] if isinstance(p.predictions, - tuple) else p.predictions - - preds = paddle.to_tensor(preds) - label = paddle.to_tensor(p.label_ids) - - probs = F.softmax(preds, axis=1) - metric = Accuracy() - metric.reset() - result = metric.compute(preds, label) - metric.update(result) - accu = metric.accumulate() - metric.reset() - return {"accuracy": accu} - - super().__init__( - model, - loss_fct, - training_args, - batchify_fn, - train_ds, - dev_ds, - tokenizer, - compute_metrics=compute_metrics) + self.test_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) \ No newline at end of file diff --git a/paddlenlp/trainer/trainer_args.py b/paddlenlp/trainer/trainer_args.py index d62144a516c3..f7e713383bbc 100644 --- a/paddlenlp/trainer/trainer_args.py +++ b/paddlenlp/trainer/trainer_args.py @@ -498,6 +498,13 @@ class TrainingArguments: "An optional descriptor for the run. Notably used for wandb logging." }) + device: Optional[str] = field( + default="gpu", + metadata={ + "help": + "The device to use for training, cpu or gpu." + }) + disable_tqdm: Optional[bool] = field( default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."}) @@ -677,7 +684,7 @@ def eval_batch_size(self) -> int: return eval_batch_size @property - def device(self) -> "paddle.device": + def current_device(self) -> "paddle.device": """ The device used by this process.
""" diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py index 424739b352db..bc264ec0c098 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -34,6 +34,7 @@ import paddle import paddle.nn as nn import paddle.amp.auto_cast as autocast +import paddle.distributed as dist from paddle.io import ( Dataset, DataLoader, @@ -785,7 +786,7 @@ def _prepare_input( elif isinstance(data, (tuple, list)): return type(data)(self._prepare_input(v) for v in data) elif isinstance(data, paddle.Tensor): - kwargs = dict(device=self.args.device) + kwargs = dict(device=self.args.current_device) # update data type for pure fp16 return data # return data.to(**kwargs) @@ -1296,9 +1297,6 @@ def evaluation_loop( else: metrics = {} - # if losses is not None: - # metrics[f"{metric_key_prefix}_loss"] = float(np.mean(losses)) - if all_losses is not None: metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() diff --git a/requirements.txt b/requirements.txt index f96ce81c8a05..0dfc347567da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ colorlog colorama seqeval multiprocess -datasets \ No newline at end of file +datasets +tqdm From 6e4113f9872299ec46abc1fd6d74980a35aa45d9 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 25 Mar 2022 22:17:41 +0800 Subject: [PATCH 11/15] fix file --- .../ernie-1.0/finetune/utils.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 examples/language_model/ernie-1.0/finetune/utils.py diff --git a/examples/language_model/ernie-1.0/finetune/utils.py b/examples/language_model/ernie-1.0/finetune/utils.py new file mode 100644 index 000000000000..510f49401f4b --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/utils.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional +import copy + +import yaml +import os.path as osp + +TASKS = [ + "SequenceClassification", + "TokenClassification", + "QuestionAnswering", +] + +config = yaml.load( + open(osp.join(osp.abspath("."), "./config.yml"), 'r'), + Loader=yaml.FullLoader) +default_args = config["DefaultArgs"] + +ALL_DATASETS = {} + +for task_type in TASKS: + task = config[task_type] + for data_name in task.keys(): + new_args = task[data_name] + new_args = {} if new_args is None else new_args + final_args = copy.deepcopy(default_args) + final_args.update(new_args) + final_args["model"] = "AutoModelFor{}".format(task_type) + ALL_DATASETS[data_name] = final_args + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `PdArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + dataset: str = field( + default=None, + metadata={ + "help": "The name of the dataset to use (via the datasets library)." 
+ }) + + max_seq_length: int = field( + default=128, + metadata={ + "help": + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, ) + + # Additional configs for QA task. + doc_stride: int = field( + default=128, + metadata={ + "help": + "When splitting up a long document into chunks, how much stride to take between chunks." + }, ) + + n_best_size: int = field( + default=20, + metadata={ + "help": + "The total number of n-best predictions to generate in the nbest_predictions.json output file." + }, ) + + max_query_length: int = field( + default=64, + metadata={"help": "Max query length."}, ) + + max_answer_length: int = field( + default=30, + metadata={"help": "Max answer length."}, ) + + do_lower_case: bool = field( + default=False, + metadata={ + "help": + "Whether to lower case the input text. Should be True for uncased models and False for cased models." + }, ) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field(metadata={ + "help": + "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" + }) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": + "Pretrained config name or path if not the same as model_name" + }) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": + "Pretrained tokenizer name or path if not the same as model_name" + }) From 6b70df769f91f63d6845306a2f3c4969e60677be Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Fri, 25 Mar 2022 22:26:37 +0800 Subject: [PATCH 12/15] add init --- paddlenlp/trainer/utils/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 paddlenlp/trainer/utils/__init__.py diff --git a/paddlenlp/trainer/utils/__init__.py b/paddlenlp/trainer/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 From 27efa977bf7a168ba7f62f0f0a442dfdef093501 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Tue, 29 Mar 2022 18:11:27 +0800 Subject: [PATCH 13/15] seq cls support clue. 
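This patch moves the Dict collator into finetune/utils.py and routes CLUE datasets through clue_trans_fn in run_seq_cls.py. The collator batches a list of pre-processed example dicts column by column and skips columns that are missing from the samples (e.g. "labels" in test mode). A minimal sketch of the intended behaviour, with toy ids and pad_val=0 standing in for the real tokenizer.pad_token_id / pad_token_type_id:

    from paddlenlp.data import Pad, Stack
    from utils import Dict  # collator helper added in finetune/utils.py

    samples = [
        {"input_ids": [1, 5, 7, 2], "token_type_ids": [0, 0, 0, 0], "labels": [1]},
        {"input_ids": [1, 9, 2], "token_type_ids": [0, 0, 0], "labels": [0]},
    ]
    batchify_fn = Dict({
        "input_ids": Pad(axis=0, pad_val=0),        # pad to the longest sample
        "token_type_ids": Pad(axis=0, pad_val=0),
        "labels": Stack(dtype="int64"),             # stack the label column
    })
    batch = batchify_fn(samples)
    # batch["input_ids"] has shape (2, 4); samples without a "labels" key
    # simply produce no "labels" entry instead of raising a KeyError.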
--- .../ernie-1.0/finetune/run_seq_cls.py | 51 ++-- .../finetune/sequence_classification.py | 270 +++--------------- .../ernie-1.0/finetune/utils.py | 51 +++- paddlenlp/trainer/trainer_base.py | 109 +------ paddlenlp/trainer/trainer_callback.py | 6 +- 5 files changed, 129 insertions(+), 358 deletions(-) diff --git a/examples/language_model/ernie-1.0/finetune/run_seq_cls.py b/examples/language_model/ernie-1.0/finetune/run_seq_cls.py index 54ed161940dd..12a442358e18 100644 --- a/examples/language_model/ernie-1.0/finetune/run_seq_cls.py +++ b/examples/language_model/ernie-1.0/finetune/run_seq_cls.py @@ -35,11 +35,12 @@ from paddlenlp.utils.log import logger sys.path.insert(0, os.path.abspath(".")) -from sequence_classification import seq_trans_fn, defaut_batchify_fn +from sequence_classification import seq_trans_fn, clue_trans_fn from utils import ( ALL_DATASETS, DataTrainingArguments, - ModelArguments, ) + ModelArguments, + defaut_collator, ) def do_train(): @@ -77,7 +78,6 @@ def do_train(): # set_seed(args) data_args.dataset = data_args.dataset.strip() - if data_args.dataset not in ALL_DATASETS: raise ValueError("Not found dataset {}".format(data_args.dataset)) @@ -96,26 +96,32 @@ def do_train(): dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], ) - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) - data_args.label_list = getattr(all_ds['train'], "label_list", None) - num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ 'train'].label_list) + # Define tokenizer, model, loss function. + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, num_classes=num_classes) + loss_fct = nn.loss.CrossEntropyLoss( + ) if data_args.label_list else nn.loss.MSELoss() + + # Define dataset pre-process function + if "clue" in data_args.dataset: + trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args) + else: + trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args) - trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args) - batchify_fn = defaut_batchify_fn(tokenizer, data_args) + # Define data collector + batchify_fn = defaut_collator(tokenizer, data_args) + # Dataset pre-process train_ds = all_ds["train"].map(trans_fn) dev_ds = all_ds["dev"].map(trans_fn) test_ds = all_ds["test"].map(trans_fn) - loss_fct = nn.loss.CrossEntropyLoss( - ) if train_ds.label_list else nn.loss.MSELoss() - + # Define the metrics of tasks. 
def compute_metrics(p): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions @@ -142,12 +148,17 @@ def compute_metrics(p): tokenizer, compute_metrics=compute_metrics, ) + # Log model and data config + trainer.print_config(model_args, "Model") + trainer.print_config(data_args, "Data") + checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint + # Training train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload @@ -155,11 +166,18 @@ def compute_metrics(p): trainer.save_metrics("train", metrics) trainer.save_state() + # Evaluate and tests model eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) + test_ret = trainer.predict(test_ds) trainer.log_metrics("test", test_ret.metrics) + if test_ret.label_ids is None: + paddle.save( + test_ret.predictions, + os.path.join(training_args.output_dir, "test_results.pdtensor"), ) + # export inference model input_spec = [ paddle.static.InputSpec( shape=[None, None], dtype="int64"), # input_ids @@ -169,16 +187,5 @@ def compute_metrics(p): trainer.export_model(input_spec=input_spec, load_best_model=True) -def print_arguments(args): - """print arguments""" - logger.info('{:^40}'.format("Configuration Arguments")) - logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit)) - for arg in vars(args): - logger.info('{:20}:{}'.format(arg, getattr(args, arg))) - - if __name__ == "__main__": - # args = parse_args() - - # print_arguments(args) do_train() diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index 4d7ca8e0b37f..56b50681643f 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -12,24 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import time from functools import partial - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.metric import Accuracy import numpy as np -import paddlenlp -from paddlenlp.data import Stack, Tuple, Pad -from paddlenlp.utils.log import logger - -from paddlenlp.trainer.trainer_base import TrainerBase, Trainer - def convert_example(example, tokenizer, max_seq_length=512, is_test=False): + is_test = True + if 'label' in example.keys(): + is_test = False if "text_b" in example.keys(): text = example["text_a"] @@ -44,28 +34,30 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False): token_type_ids = encoded_inputs["token_type_ids"] if is_test: - return input_ids, token_type_ids - label = np.array([example["label"]], dtype="int64") - # return input_ids, token_type_ids, label - return { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "labels": label - } - - -def seq_trans_fn(example, tokenizer, args): - return convert_example( - example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + } + else: + label = np.array([example["label"]], dtype="int64") + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "labels": label + } +# Data pre-process function for clue benchmark datatset def convert_clue(example, label_list, tokenizer=None, - is_test=False, max_seq_length=512, **kwargs): """convert a glue example into necessary features""" + is_test = False + if 'label' not in example.keys(): + is_test = True + if not is_test: # `label_list == None` is for regression task label_dtype = "int64" if label_list else "float32" @@ -111,10 +103,25 @@ def convert_clue(example, example['sentence1'], text_pair=example['sentence2'], max_seq_len=max_seq_length) + if not is_test: - return example['input_ids'], example['token_type_ids'], label + return { + "input_ids": example['input_ids'], + "token_type_ids": example['token_type_ids'], + "labels": label + } else: - return example['input_ids'], example['token_type_ids'] + return { + "input_ids": example['input_ids'], + "token_type_ids": example['token_type_ids'] + } + + +def seq_trans_fn(example, tokenizer, args): + return convert_example( + example, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, ) def clue_trans_fn(examples, tokenizer, args): @@ -123,208 +130,3 @@ def clue_trans_fn(examples, tokenizer, args): tokenizer=tokenizer, label_list=args.label_list, max_seq_length=args.max_seq_length) - - -def clue_batchify_fn(tokenizer, args): - batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=tokenizer.pad_token_id), # input - Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment - Stack(dtype="int64" if args.label_list else "float32") # label - ): fn(samples) - - return batchify_fn - - -class Dict(object): - def __init__(self, fn): - assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \ - 'Received fn=%s' % (str(fn)) - - self._fn = fn - - for col_name, ele_fn in self._fn.items(): - assert callable( - ele_fn - ), 'Batchify functions must be callable! 
type(fn[%d]) = %s' % ( - col_name, str(type(ele_fn))) - - def __call__(self, data): - - ret = {} - for col_name, ele_fn in self._fn.items(): - result = ele_fn([ele[col_name] for ele in data]) - ret[col_name] = result - - return ret - - -def defaut_batchify_fn(tokenizer, args): - batchify_fn = lambda samples, fn=Dict({ - 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input - "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment - "labels": Stack(dtype="int64" if args.label_list else "float32") # label - }): fn(samples) - - return batchify_fn - - -@paddle.no_grad() -def evaluate(model, criterion, metric, data_loader, mode="dev"): - """ - Given a dataset, it evals model and computes the metric. - - Args: - model(obj:`paddle.nn.Layer`): A model to classify texts. - data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches. - criterion(obj:`paddle.nn.Layer`): It can compute the loss. - metric(obj:`paddle.metric.Metric`): The evaluation metric. - """ - model.eval() - metric.reset() - losses = [] - for batch in data_loader: - input_ids, token_type_ids, labels = batch - logits = model(input_ids, token_type_ids) - loss = criterion(logits, labels) - losses.append(loss.numpy()) - correct = metric.compute(logits, labels) - metric.update(correct) - accu = metric.accumulate() - logger.info("%s: eval loss: %.5f, accuracy: %.5f" % - (mode, np.mean(losses), accu)) - metric.reset() - model.train() - return accu - - -class ClueTrainer(TrainerBase): - def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, - **kwargs): - super().__init__() - self.rank = paddle.distributed.get_rank() - self.train_ds = train_ds - self.dev_ds = dev_ds - if "test_ds" in kwargs.keys(): - self.test_ds = kwargs["test_ds"] - - self.model = model - self.tokenizer = tokenizer - self.args = args - - self.dataloader_inner() - self.prepare_train_config() - self.print_config() - - def dataloader_inner(self): - trans_fn = partial( - clue_trans_fn, tokenizer=self.tokenizer, args=self.args) - batchify_fn = clue_batchify_fn(self.tokenizer, self.args) - - self.train_dl = self.create_dataloader( - self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) - self.dev_dl = self.create_dataloader( - self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) - - self.test_dl = None - - def eval(self): - pass - - def train(self): - num_classes = 1 if self.train_ds.label_list == None else len( - self.train_ds.label_list) - - loss_fct = paddle.nn.loss.CrossEntropyLoss( - ) if self.train_ds.label_list else paddle.nn.loss.MSELoss() - - metric = Accuracy() - - if self.args.fp16: - scaler = paddle.amp.GradScaler( - init_loss_scaling=self.args.scale_loss) - - best_dev_acc = 0.0 - corr_test_acc = -1.0 - global_step = 0 - tic_train = time.time() - - for epoch in range(self.args.num_train_epochs): - for step, batch in enumerate(self.train_dl): - global_step += 1 - input_ids, segment_ids, labels = batch - with paddle.amp.auto_cast( - bool(self.args.fp16), - custom_white_list=["layer_norm", "softmax", "gelu"], ): - logits = self.model(input_ids, segment_ids) - loss = loss_fct(logits, labels) - - probs = F.softmax(logits, axis=1) - correct = metric.compute(probs, labels) - metric.update(correct) - acc = metric.accumulate() - - if self.args.fp16: - scaler.scale(loss).backward() - scaler.minimize(self.optimizer, loss) - else: - loss.backward() - self.optimizer.step() - - self.lr_scheduler.step() - self.optimizer.clear_grad() - - if global_step % self.args.logging_steps == 0: - 
logger.info( - "global step %d/%d, epoch: %d, batch: %d, acc: %.5f, loss: %f, lr: %.10f, speed: %.4f step/s" - % (global_step, self.args.num_training_steps, epoch, - step, metric.accumulate(), loss, - self.optimizer.get_lr(), - self.args.logging_steps / (time.time() - tic_train))) - metric.reset() - tic_train = time.time() - if global_step % self.args.eval_steps == 0 or global_step == self.args.num_training_steps: - tic_eval = time.time() - metric.reset() - if self.dev_dl is not None: - dev_acc = evaluate(self.model, loss_fct, metric, - self.dev_dl, "dev") - else: - dev_acc = -1.0 - metric.reset() - - if self.test_dl is not None: - test_acc = evaluate(self.model, loss_fct, metric, - self.test_dl, "test") - else: - test_acc = -1.0 - metric.reset() - - logger.info("eval done total : %s s" % - (time.time() - tic_eval)) - if dev_acc > best_dev_acc: - best_dev_acc = dev_acc - corr_test_acc = test_acc - - logger.warning( - "best_dev_acc: {:.6f}, corr_test_acc: {:.6f}".format( - best_dev_acc, corr_test_acc)) - - if global_step >= self.args.num_training_steps: - return - - logger.warning("best_dev_acc: {:.6f}, corr_test_acc: {:.6f}".format( - best_dev_acc, corr_test_acc)) - - -class SeqTrainer(ClueTrainer): - def dataloader_inner(self): - trans_fn = partial( - seq_trans_fn, tokenizer=self.tokenizer, args=self.args) - batchify_fn = clue_batchify_fn(self.tokenizer, self.args) - - self.train_dl = self.create_dataloader( - self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) - self.dev_dl = self.create_dataloader( - self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) - self.test_dl = self.create_dataloader( - self.test_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) \ No newline at end of file diff --git a/examples/language_model/ernie-1.0/finetune/utils.py b/examples/language_model/ernie-1.0/finetune/utils.py index 510f49401f4b..fbe639bd3b42 100644 --- a/examples/language_model/ernie-1.0/finetune/utils.py +++ b/examples/language_model/ernie-1.0/finetune/utils.py @@ -15,10 +15,11 @@ from dataclasses import dataclass, field from typing import Optional import copy - import yaml import os.path as osp +from paddlenlp.data import Stack, Tuple, Pad + TASKS = [ "SequenceClassification", "TokenClassification", @@ -43,6 +44,54 @@ ALL_DATASETS[data_name] = final_args +class Dict(object): + def __init__(self, fn): + assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \ + 'Received fn=%s' % (str(fn)) + + self._fn = fn + + for col_name, ele_fn in self._fn.items(): + assert callable( + ele_fn + ), 'Batchify functions must be callable! type(fn[%d]) = %s' % ( + col_name, str(type(ele_fn))) + + def __call__(self, data): + + ret = {} + if len(data) <= 0: + return ret + + for col_name, ele_fn in self._fn.items(): + # skip unused col_name, such as labels in test mode. + if col_name not in data[0].keys(): + continue + result = ele_fn([ele[col_name] for ele in data]) + ret[col_name] = result + + return ret + + +def defaut_collator(tokenizer, args): + """ Defaut collator for sequences classification + + Args: + tokenizer (PretrainedTokenizer): tokenizer of PretrainedModel + args : data argument, need label list. 
+ + Returns: + batchify_fn (function): collator + """ + batchify_fn = lambda samples, fn=Dict({ + 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type_ids + "labels": Stack(dtype="int64" if args.label_list else "float32") # labels + }): fn(samples) + + return batchify_fn + + @dataclass class DataTrainingArguments: """ diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py index bc264ec0c098..b42bb4dec49e 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -89,7 +89,8 @@ def __init__(self, *args, **kwargs): # Name of the files used for checkpointing TRAINING_ARGS_NAME = "training_args.bin" TRAINER_STATE_NAME = "trainer_state.json" -OPTIMIZER_NAME = "optimizer.pdparams" + +OPTIMIZER_NAME = "optimizer.pdopt" SCHEDULER_NAME = "scheduler.pdparams" SCALER_NAME = "scaler.pdparams" @@ -1518,110 +1519,22 @@ def _pad_across_processes(self, tensor, pad_index=-100): new_tensor[:, :old_size[1]] = tensor return new_tensor - def print_config(self): + def print_config(self, args=None, key=""): """ """ logger.info("=" * 60) - logger.info('{:^40}'.format("Configuration Arguments")) + if args is None: + args = self.args + key = "Training" + + logger.info('{:^40}'.format("{} Configuration Arguments".format(key))) logger.info('{:30}:{}'.format("paddle commit id", paddle.version.commit)) - for a in dir(self.args): + for a in dir(args): if (a[:2] != "__"): #don't print double underscore methods - v = getattr(self.args, a) + v = getattr(args, a) if not isinstance(v, types.MethodType): logger.info('{:30}:{}'.format(a, v)) - logger.info("=" * 60) - - -class TrainerBase(object): - """ - """ - - def create_dataloader(self, - dataset, - mode='train', - batch_size=16, - batchify_fn=None, - trans_fn=None, - batched=False): - """ - """ - if trans_fn: - dataset = dataset.map(trans_fn, batched=batched) - - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) - - def train(self, *args, **kwargs): - """ - """ - pass - - def eval(self, *args, **kwargs): - """ - """ - pass - - def prepare_train_config(self): - """ - """ - if self.args.max_steps > 0: - self.args.num_training_steps = self.args.max_steps - self.args.num_train_epochs = math.ceil( - self.args.num_training_steps / len(self.train_dl)) - - else: - self.args.num_training_steps = len( - self.train_dl) * self.args.num_train_epochs - self.args.num_train_epochs = self.args.num_train_epochs - - if self.args.num_training_steps // self.args.eval_steps < self.args.minimum_eval_times: - exp_step = self.args.num_training_steps / self.args.minimum_eval_times - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Set eval step to %d" % exp_step) - self.args.eval_steps = exp_step - - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_ratio - - self.lr_scheduler = LinearDecayWithWarmup( - self.args.learning_rate, self.args.num_training_steps, warmup) - - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. 
- decay_params = [ - p.name for n, p in self.model.named_parameters() - if not any(nd in n for nd in ["bias", "norm"]) - ] - - self.optimizer = paddle.optimizer.AdamW( - learning_rate=self.lr_scheduler, - beta1=0.9, - beta2=0.999, - epsilon=self.args.adam_epsilon, - parameters=self.model.parameters(), - weight_decay=self.args.weight_decay, - apply_decay_param_fun=lambda x: x in decay_params, - grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) - - def print_config(self): - """ - """ - logger.info('{:^40}'.format("Configuration Arguments")) - logger.info('{:20}:{}'.format("paddle commit id", - paddle.version.commit)) - for arg in vars(self.args): - logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) + logger.info("") diff --git a/paddlenlp/trainer/trainer_callback.py b/paddlenlp/trainer/trainer_callback.py index 2d33ac0525ab..9549a3d476e5 100644 --- a/paddlenlp/trainer/trainer_callback.py +++ b/paddlenlp/trainer/trainer_callback.py @@ -518,11 +518,11 @@ def on_step_end(self, if state.global_step >= state.max_steps: control.should_training_stop = True # Log and save on end - if args.logging_strategy == IntervalStrategy.STEPS: + if args.logging_strategy == IntervalStrategy.STEPS and state.global_step >= args.logging_steps: control.should_log = True - if args.evaluation_strategy == IntervalStrategy.STEPS: + if args.evaluation_strategy == IntervalStrategy.STEPS and state.global_step >= args.eval_steps: control.should_evaluate = True - if args.save_strategy == IntervalStrategy.STEPS: + if args.save_strategy == IntervalStrategy.STEPS and args.save_steps > 0 and state.global_step >= args.save_steps: control.should_save = True return control From 3c412ffee81d29433439f0e08347ccbb8e49a826 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 30 Mar 2022 15:21:14 +0800 Subject: [PATCH 14/15] trainer support for token classification and question answering tasks.
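This patch adds run_ner.py and run_qa.py on top of the Trainer API, plus a QuestionAnsweringTrainer with post-processing hooks. run_ner.py scores predictions with the seqeval metric from the datasets library after mapping predicted ids back to label strings and dropping positions labelled -100. A rough, illustrative sketch of that computation on toy BIO tags (values are made up):

    from datasets import load_metric

    metric = load_metric("seqeval")
    true_predictions = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
    true_labels = [["B-PER", "I-PER", "O"], ["B-ORG", "O"]]
    results = metric.compute(predictions=true_predictions,
                             references=true_labels)
    # run_ner.py reports results["overall_precision"], results["overall_recall"],
    # results["overall_f1"] and results["overall_accuracy"].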
--- .../ernie-1.0/finetune/question_answering.py | 358 +++++++++--------- .../ernie-1.0/finetune/run_ner.py | 211 +++++++++++ .../ernie-1.0/finetune/run_qa.py | 238 ++++++++++++ .../finetune/sequence_classification.py | 4 +- .../finetune/token_classification.py | 138 +------ .../ernie-1.0/finetune/utils.py | 22 ++ paddlenlp/trainer/trainer_base.py | 6 + paddlenlp/trainer/utils/helper.py | 4 +- 8 files changed, 683 insertions(+), 298 deletions(-) create mode 100644 examples/language_model/ernie-1.0/finetune/run_ner.py create mode 100644 examples/language_model/ernie-1.0/finetune/run_qa.py diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py index 64ef323fe5a7..b9001fefe4d9 100644 --- a/examples/language_model/ernie-1.0/finetune/question_answering.py +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -14,16 +14,123 @@ import time import json +import os +import sys from functools import partial import numpy as np import paddle import paddlenlp as ppnlp -from paddlenlp.data import Pad, Stack, Tuple, Dict -from paddlenlp.metrics.squad import squad_evaluate, compute_prediction - -from paddlenlp.trainer.trainer_base import TrainerBase +from paddlenlp.data import Pad, Stack, Tuple from paddlenlp.utils.log import logger +from paddlenlp.trainer import Trainer +from paddlenlp.trainer.trainer_utils import PredictionOutput + +sys.path.insert(0, os.path.abspath(".")) +from utils import Dict + + +class QuestionAnsweringTrainer(Trainer): + def __init__(self, + *args, + eval_examples=None, + post_process_function=None, + **kwargs): + super().__init__(*args, **kwargs) + self.eval_examples = eval_examples + self.post_process_function = post_process_function + + def evaluate(self, + eval_dataset=None, + eval_examples=None, + ignore_keys=None, + metric_key_prefix: str="eval"): + eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset + eval_dataloader = self.get_eval_dataloader(eval_dataset) + eval_examples = self.eval_examples if eval_examples is None else eval_examples + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.evaluation_loop + try: + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is not None and self.compute_metrics is not None: + eval_preds = self.post_process_function(eval_examples, eval_dataset, + output.predictions) + metrics = self.compute_metrics(eval_preds) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + self.log(metrics) + else: + metrics = {} + + self.control = self.callback_handler.on_evaluate(self.args, self.state, + self.control, metrics) + return metrics + + def predict(self, + predict_dataset, + predict_examples, + ignore_keys=None, + metric_key_prefix: str="test"): + predict_dataloader = self.get_test_dataloader(predict_dataset) + + # Temporarily disable metric computation, we will do it in the loop here. 
+ compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.evaluation_loop + try: + output = eval_loop( + predict_dataloader, + description="Prediction", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is None or self.compute_metrics is None: + return output + + predictions = self.post_process_function( + predict_examples, predict_dataset, output.predictions, "predict") + metrics = self.compute_metrics(predictions) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput( + predictions=predictions.predictions, + label_ids=predictions.label_ids, + metrics=metrics) + + +def qa_collator(tokenizer, args): + train_batchify_fn = lambda samples, fn=Dict({ + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), + "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), + "start_positions": Stack(dtype="int64"), + "end_positions": Stack(dtype="int64") + }): fn(samples) + + return train_batchify_fn class CrossEntropyLossForSQuAD(paddle.nn.Layer): @@ -43,57 +150,14 @@ def forward(self, y, label): return loss -@paddle.no_grad() -def evaluate(model, data_loader, args): - model.eval() - - all_start_logits = [] - all_end_logits = [] - tic_eval = time.time() - - for batch in data_loader: - input_ids, token_type_ids = batch - start_logits_tensor, end_logits_tensor = model(input_ids, - token_type_ids) - - for idx in range(start_logits_tensor.shape[0]): - if len(all_start_logits) % 1000 == 0 and len(all_start_logits): - print("Processing example: %d" % len(all_start_logits)) - print('time per 1000:', time.time() - tic_eval) - tic_eval = time.time() - - all_start_logits.append(start_logits_tensor.numpy()[idx]) - all_end_logits.append(end_logits_tensor.numpy()[idx]) - - all_predictions, _, _ = compute_prediction( - data_loader.dataset.data, data_loader.dataset.new_data, - (all_start_logits, all_end_logits), False, args.n_best_size, - args.max_answer_length) - - # Can also write all_nbest_json and scores_diff_json files if needed - with open('prediction.json', "w", encoding='utf-8') as writer: - writer.write( - json.dumps( - all_predictions, ensure_ascii=False, indent=4) + "\n") - - squad_evaluate( - examples=data_loader.dataset.data, - preds=all_predictions, - is_whitespace_splited=False) - - model.train() - - def prepare_train_features(examples, tokenizer, args): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. 
- contexts = [examples[i]['context'] for i in range(len(examples))] - questions = [examples[i]['question'] for i in range(len(examples))] - - print(len(examples)) + contexts = examples['context'] + questions = examples['question'] tokenized_examples = tokenizer( questions, @@ -101,55 +165,64 @@ def prepare_train_features(examples, tokenizer, args): stride=args.doc_stride, max_seq_len=args.max_seq_length) + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + # Let's label those examples! - for i, tokenized_example in enumerate(tokenized_examples): + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. - input_ids = tokenized_example["input_ids"] + input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) - # The offset mappings will give us a map from token to character position in the original context. This will - # help us compute the start_positions and end_positions. - offsets = tokenized_example['offset_mapping'] - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_example['token_type_ids'] + sequence_ids = tokenized_examples['token_type_ids'][i] # One example can give several spans, this is the index of the example containing this span of text. - sample_index = tokenized_example['overflow_to_sample'] - answers = examples[sample_index]['answers'] - answer_starts = examples[sample_index]['answer_starts'] - - # Start/end character index of the answer in the text. - start_char = answer_starts[0] - end_char = start_char + len(answers[0]) - - # Start token index of the current span in the text. - token_start_index = 0 - while sequence_ids[token_start_index] != 1: - token_start_index += 1 - - # End token index of the current span in the text. - token_end_index = len(input_ids) - 1 - while sequence_ids[token_end_index] != 1: - token_end_index -= 1 - # Minus one more to reach actual text - token_end_index -= 1 - - # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). - if not (offsets[token_start_index][0] <= start_char and - offsets[token_end_index][1] >= end_char): - tokenized_examples[i]["start_positions"] = cls_index - tokenized_examples[i]["end_positions"] = cls_index + sample_index = sample_mapping[i] + answers = examples['answers'][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) else: - # Otherwise move the token_start_index and token_end_index to the two ends of the answer. - # Note: we could go after the last offset if the answer is the last word (edge case). - while token_start_index < len(offsets) and offsets[ - token_start_index][0] <= start_char: + # Start/end character index of the answer in the text. 
+ start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != 1: token_start_index += 1 - tokenized_examples[i]["start_positions"] = token_start_index - 1 - while offsets[token_end_index][1] >= end_char: + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != 1: token_end_index -= 1 - tokenized_examples[i]["end_positions"] = token_end_index + 1 + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and + offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[ + token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - + 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) return tokenized_examples @@ -158,107 +231,40 @@ def prepare_validation_features(examples, tokenizer, args): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. - # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is + #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. - contexts = [examples[i]['context'] for i in range(len(examples))] - questions = [examples[i]['question'] for i in range(len(examples))] + contexts = examples['context'] + questions = examples['question'] tokenized_examples = tokenizer( questions, contexts, stride=args.doc_stride, - max_seq_len=args.max_seq_length) + max_seq_len=args.max_seq_length, + return_attention_mask=True) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample") - # For validation, there is no need to compute start and end positions - for i, tokenized_example in enumerate(tokenized_examples): + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_example['token_type_ids'] + sequence_ids = tokenized_examples['token_type_ids'][i] + context_index = 1 # One example can give several spans, this is the index of the example containing this span of text. 
- sample_index = tokenized_example['overflow_to_sample'] - tokenized_examples[i]["example_id"] = examples[sample_index]['id'] + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. - tokenized_examples[i]["offset_mapping"] = [ - (o if sequence_ids[k] == 1 else None) - for k, o in enumerate(tokenized_example["offset_mapping"]) + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples - - -class MrcTrainer(TrainerBase): - def __init__(self, train_ds, dev_ds, model, tokenizer, args): - super().__init__() - self.rank = paddle.distributed.get_rank() - self.train_ds = train_ds - self.dev_ds = dev_ds - self.model = model - self.tokenizer = tokenizer - self.args = args - - self.dataloader_inner() - self.prepare_train_config() - self.print_config() - - def eval(self): - if self.rank == 0: - evaluate(self.model, self.dev_dl, self.args) - - def dataloader_inner(self): - train_batchify_fn = lambda samples, fn=Dict({ - "input_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_id), - "token_type_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id), - "start_positions": Stack(dtype="int64"), - "end_positions": Stack(dtype="int64") - }): fn(samples) - - dev_batchify_fn = lambda samples, fn=Dict({ - "input_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_id), - "token_type_ids": Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id) - }): fn(samples) - - trans_train_fn = partial( - prepare_train_features, tokenizer=self.tokenizer, args=self.args) - trans_dev_fn = partial( - prepare_validation_features, - tokenizer=self.tokenizer, - args=self.args) - - self.train_dl = self.create_dataloader( - self.train_ds, "train", self.args.batch_size, train_batchify_fn, - trans_train_fn, True) - self.dev_dl = self.create_dataloader( - self.dev_ds, "dev", self.args.batch_size, dev_batchify_fn, - trans_dev_fn, True) - - def train(self): - criterion = CrossEntropyLossForSQuAD() - - global_step = 0 - tic_train = time.time() - for epoch in range(self.args.num_train_epochs): - for step, batch in enumerate(self.train_dl): - global_step += 1 - input_ids, token_type_ids, start_positions, end_positions = batch - logits = self.model( - input_ids=input_ids, token_type_ids=token_type_ids) - loss = criterion(logits, (start_positions, end_positions)) - - if global_step % self.args.logging_steps == 0: - logger.info( - "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" - % (global_step, epoch + 1, step + 1, loss, - self.args.logging_steps / (time.time() - tic_train))) - tic_train = time.time() - loss.backward() - self.optimizer.step() - self.lr_scheduler.step() - self.optimizer.clear_grad() - - if global_step % self.args.eval_steps == 0: - self.eval() - - if global_step == self.args.num_training_steps: - break diff --git a/examples/language_model/ernie-1.0/finetune/run_ner.py b/examples/language_model/ernie-1.0/finetune/run_ner.py new file mode 100644 index 000000000000..a40be180bc09 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/run_ner.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import yaml +from functools import partial +import distutils.util +import os.path as osp +from typing import Optional + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import load_metric +import paddlenlp +from paddlenlp.trainer import ( + PdArgumentParser, + TrainingArguments, + Trainer, ) +from paddlenlp.datasets import load_dataset +from paddlenlp.transformers import ( + AutoTokenizer, + AutoModelForTokenClassification, ) +from paddlenlp.utils.log import logger + +sys.path.insert(0, os.path.abspath(".")) +from token_classification import ner_trans_fn, ner_collator +from utils import ( + ALL_DATASETS, + DataTrainingArguments, + ModelArguments, ) + + +def do_train(): + parser = PdArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + paddle.set_device(training_args.device) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir( + training_args.output_dir + ) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len( + os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome.") + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # set_seed(args) + data_args.dataset = data_args.dataset.strip() + if data_args.dataset not in ALL_DATASETS: + raise ValueError("Not found dataset {}".format(data_args.dataset)) + + # Use yaml config to rewrite all args. 
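The yaml-driven override that follows is just a `setattr` sweep over the three argument dataclasses, replacing any field that also appears in the dataset's config entry. The same pattern in isolation, with a toy dataclass and a made-up config dict rather than the real `ALL_DATASETS` contents:

```python
from dataclasses import dataclass


@dataclass
class ToyArgs:
    max_seq_length: int = 128
    learning_rate: float = 5e-5


# Hypothetical yaml contents; only matching field names get copied over.
config = {"max_seq_length": 256, "batch_size": 32}

args = ToyArgs()
for name in vars(args):
    if name in config:
        setattr(args, name, config[name])

print(args)  # ToyArgs(max_seq_length=256, learning_rate=5e-05)
```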
+ config = ALL_DATASETS[data_args.dataset] + for args in (model_args, data_args, training_args): + for arg in vars(args): + if arg in config.keys(): + setattr(args, arg, config[arg]) + + training_args.per_device_train_batch_size = config["batch_size"] + training_args.per_device_eval_batch_size = config["batch_size"] + + dataset_config = data_args.dataset.split(" ") + all_ds = load_dataset( + dataset_config[0], + None if len(dataset_config) <= 1 else dataset_config[1], ) + + label_list = getattr(all_ds['train'], "label_list", None) + data_args.label_list = label_list + data_args.ignore_label = -100 + data_args.no_entity_id = len(data_args.label_list) - 1 + + num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ + 'train'].label_list) + + # Define tokenizer, model, loss function. + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + model = AutoModelForTokenClassification.from_pretrained( + model_args.model_name_or_path, num_classes=num_classes) + + class criterion(nn.Layer): + def __init__(self): + super(criterion, self).__init__() + self.loss_fn = paddle.nn.loss.CrossEntropyLoss( + ignore_index=data_args.ignore_label) + + def forward(self, *args, **kwargs): + return paddle.mean(self.loss_fn(*args, **kwargs)) + + loss_fct = criterion() + + # Define dataset pre-process function + trans_fn = partial(ner_trans_fn, tokenizer=tokenizer, args=data_args) + + # Define data collector + batchify_fn = ner_collator(tokenizer, data_args) + + # Dataset pre-process + train_ds = all_ds["train"].map(trans_fn) + dev_ds = all_ds["dev"].map(trans_fn) + test_ds = all_ds["test"].map(trans_fn) + + # Define the metrics of tasks. + # Metrics + metric = load_metric("seqeval") + + def compute_metrics(p): + predictions, labels = p + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + results = metric.compute( + predictions=true_predictions, references=true_labels) + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + + trainer = Trainer( + model, + loss_fct, + training_args, + batchify_fn, + train_ds, + dev_ds, + tokenizer, + compute_metrics=compute_metrics, ) + + # Log model and data config + trainer.print_config(model_args, "Model") + trainer.print_config(data_args, "Data") + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + # Training + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + trainer.save_model() # Saves the tokenizer too for easy upload + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluate and tests model + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + + test_ret = trainer.predict(test_ds) + trainer.log_metrics("test", test_ret.metrics) + if test_ret.label_ids is None: + paddle.save( + test_ret.predictions, + os.path.join(training_args.output_dir, "test_results.pdtensor"), ) + + # export inference model + input_spec = [ + 
paddle.static.InputSpec( + shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, None], dtype="int64") # segment_ids + ] + trainer.export_model(input_spec=input_spec, load_best_model=True) + + +if __name__ == "__main__": + do_train() diff --git a/examples/language_model/ernie-1.0/finetune/run_qa.py b/examples/language_model/ernie-1.0/finetune/run_qa.py new file mode 100644 index 000000000000..ce81b018135e --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/run_qa.py @@ -0,0 +1,238 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import yaml +from functools import partial +import distutils.util +import os.path as osp +from typing import Optional + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from datasets import load_metric, load_dataset + +import paddlenlp +from paddlenlp.trainer import ( + PdArgumentParser, + TrainingArguments, + Trainer, ) +from paddlenlp.trainer.trainer_utils import EvalPrediction + +# from paddlenlp.datasets import load_dataset +from paddlenlp.transformers import ( + AutoTokenizer, + AutoModelForQuestionAnswering, ) +from paddlenlp.utils.log import logger +from paddlenlp.metrics.squad import squad_evaluate, compute_prediction + +sys.path.insert(0, os.path.abspath(".")) +from question_answering import ( + QuestionAnsweringTrainer, + CrossEntropyLossForSQuAD, + prepare_train_features, + prepare_validation_features, + qa_collator, ) + +from utils import ( + ALL_DATASETS, + DataTrainingArguments, + ModelArguments, ) + + +def do_train(): + parser = PdArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + paddle.set_device(training_args.device) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir( + training_args.output_dir + ) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len( + os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome.") + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # set_seed(args) + data_args.dataset = data_args.dataset.strip() + if data_args.dataset not in ALL_DATASETS: + raise ValueError("Not found dataset {}".format(data_args.dataset)) + + # Use yaml config to rewrite all args. + config = ALL_DATASETS[data_args.dataset] + for args in (model_args, data_args, training_args): + for arg in vars(args): + if arg in config.keys(): + setattr(args, arg, config[arg]) + + training_args.per_device_train_batch_size = config["batch_size"] + training_args.per_device_eval_batch_size = config["batch_size"] + + dataset_config = data_args.dataset.split(" ") + raw_datasets = load_dataset( + dataset_config[0], + None if len(dataset_config) <= 1 else dataset_config[1], + cache_dir=model_args.cache_dir) + + label_list = getattr(raw_datasets['train'], "label_list", None) + data_args.label_list = label_list + + # Define tokenizer, model, loss function. + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + model = AutoModelForQuestionAnswering.from_pretrained( + model_args.model_name_or_path) + + loss_fct = CrossEntropyLossForSQuAD() + + train_dataset = raw_datasets["train"] + eval_examples = raw_datasets["validation"] + predict_examples = raw_datasets["test"] + + column_names = raw_datasets["train"].column_names + # Dataset pre-process + train_dataset = train_dataset.map( + partial( + prepare_train_features, tokenizer=tokenizer, args=data_args), + batched=True, + num_proc=4, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on train dataset", ) + + eval_dataset = eval_examples.map( + partial( + prepare_validation_features, tokenizer=tokenizer, args=data_args), + batched=True, + num_proc=4, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on validation dataset", ) + + predict_dataset = predict_examples.map( + partial( + prepare_validation_features, tokenizer=tokenizer, args=data_args), + batched=True, + num_proc=4, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on prediction dataset", ) + + # Define data collector + data_collator = qa_collator(tokenizer, data_args) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions, all_nbest_json, scores_diff_json = compute_prediction( + examples=examples, + features=features, + predictions=predictions, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, ) + # Format the result to the format the metric expects. + formatted_predictions = [{ + "id": k, + "prediction_text": v + } for k, v in predictions.items()] + references = [{ + "id": ex["id"], + "answers": ex["answers"] + } for ex in examples] + return EvalPrediction( + predictions=formatted_predictions, label_ids=references) + + # Define the metrics of tasks. 
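`post_processing_function` above reduces each example to an `{"id", "prediction_text"}` prediction and an `{"id", "answers"}` reference, which matches the contract of the `squad` metric loaded just below. A toy check of that contract (the ids and answer text are made up for illustration):

```python
from datasets import load_metric  # same helper run_qa.py imports

metric = load_metric("squad")
predictions = [{"id": "q-0", "prediction_text": "in 2008"}]
references = [{
    "id": "q-0",
    "answers": {"text": ["in 2008"], "answer_start": [42]},
}]
print(metric.compute(predictions=predictions, references=references))
# {'exact_match': 100.0, 'f1': 100.0}
```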
+ # Metrics + metric = load_metric("squad") + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + trainer = QuestionAnsweringTrainer( + model=model, + criterion=loss_fct, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + eval_examples=eval_examples, + data_collator=data_collator, + post_process_function=post_processing_function, + tokenizer=tokenizer, + compute_metrics=compute_metrics, ) + + # Log model and data config + trainer.print_config(model_args, "Model") + trainer.print_config(data_args, "Data") + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + # Training + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + trainer.save_model() # Saves the tokenizer too for easy upload + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluate and tests model + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + + test_ret = trainer.predict(predict_dataset, predict_examples) + trainer.log_metrics("predict", test_ret.metrics) + if test_ret.label_ids is None: + paddle.save( + test_ret.predictions, + os.path.join(training_args.output_dir, "test_results.pdtensor"), ) + + # export inference model + input_spec = [ + paddle.static.InputSpec( + shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, None], dtype="int64") # segment_ids + ] + trainer.export_model(input_spec=input_spec, load_best_model=True) + + +if __name__ == "__main__": + do_train() diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index 56b50681643f..7edf621836b0 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -124,9 +124,9 @@ def seq_trans_fn(example, tokenizer, args): max_seq_length=args.max_seq_length, ) -def clue_trans_fn(examples, tokenizer, args): +def clue_trans_fn(example, tokenizer, args): return convert_clue( - examples, + example, tokenizer=tokenizer, label_list=args.label_list, max_seq_length=args.max_seq_length) diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py index 2e6f6344bbca..c25898c9b429 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -18,6 +18,7 @@ import random import time import math +import sys from functools import partial import numpy as np @@ -27,33 +28,12 @@ from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics import ChunkEvaluator from paddlenlp.datasets import load_dataset -from paddlenlp.data import Stack, Tuple, Pad, Dict +from paddlenlp.data import Stack, Tuple, Pad from paddlenlp.utils.log import logger -from paddlenlp.trainer.trainer_base import TrainerBase - - -@paddle.no_grad() -def evaluate(model, loss_fct, metric, data_loader, label_num, mode="valid"): - model.eval() - metric.reset() - avg_loss, precision, recall, f1_score = 0, 0, 0, 0 - for batch in data_loader: - input_ids, token_type_ids, length, labels = batch - logits = model(input_ids, 
token_type_ids) - loss = loss_fct(logits, labels) - avg_loss = paddle.mean(loss) - preds = logits.argmax(axis=2) - num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( - length, preds, labels) - metric.update(num_infer_chunks.numpy(), - num_label_chunks.numpy(), num_correct_chunks.numpy()) - precision, recall, f1_score = metric.accumulate() - logger.info("%s: eval loss: %f, precision: %f, recall: %f, f1: %f" % - (mode, avg_loss, precision, recall, f1_score)) - model.train() - - return f1_score +# from paddlenlp.trainer.trainer_base import TrainerBase +sys.path.insert(0, os.path.abspath(".")) +from utils import Dict def tokenize_and_align_labels(example, tokenizer, no_entity_id, @@ -62,9 +42,8 @@ def tokenize_and_align_labels(example, tokenizer, no_entity_id, example = example['tokens'] tokenized_input = tokenizer( example, - return_length=True, is_split_into_words=True, - max_seq_len=max_seq_len) + max_seq_len=max_seq_len, ) # -2 for [CLS] and [SEP] if len(tokenized_input['input_ids']) - 2 < len(labels): @@ -72,100 +51,23 @@ def tokenize_and_align_labels(example, tokenizer, no_entity_id, tokenized_input['labels'] = [no_entity_id] + labels + [no_entity_id] tokenized_input['labels'] += [no_entity_id] * ( len(tokenized_input['input_ids']) - len(tokenized_input['labels'])) - return tokenized_input - - -class NerTrainer(TrainerBase): - def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, - **kwargs): - super().__init__() - self.rank = paddle.distributed.get_rank() - self.train_ds = train_ds - self.dev_ds = dev_ds - if "test_ds" in kwargs.keys(): - self.test_ds = kwargs["test_ds"] - self.model = model - self.tokenizer = tokenizer - self.args = args - self.dataloader_inner() - self.prepare_train_config() - self.print_config() - - def dataloader_inner(self): - label_list = self.train_ds.label_list - label_num = len(label_list) - no_entity_id = label_num - 1 - - trans_fn = partial( - tokenize_and_align_labels, - tokenizer=self.tokenizer, - no_entity_id=no_entity_id, - max_seq_len=self.args.max_seq_length) - ignore_label = -100 - - batchify_fn = lambda samples, fn=Dict({ - 'input_ids': Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype='int32'), # input - 'token_type_ids': Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype='int32'), # segment - 'seq_len': Stack(dtype='int64'), # seq_len - 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label - }): fn(samples) - - self.train_dl = self.create_dataloader( - self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) - self.dev_dl = self.create_dataloader( - self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) - self.test_dl = self.create_dataloader( - self.test_ds, "test", self.args.batch_size, batchify_fn, trans_fn) - - def train(self): - ignore_label = -100 - label_num = len(self.train_ds.label_list) - - loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) - metric = ChunkEvaluator(label_list=self.args.label_list) - - global_step = 0 - tic_train = time.time() - best_dev_f1 = -1 - corr_test_f1 = -1 + return tokenized_input - for epoch in range(self.args.num_train_epochs): - for step, batch in enumerate(self.train_dl): - global_step += 1 - input_ids, token_type_ids, _, labels = batch - logits = self.model(input_ids, token_type_ids) - loss = loss_fct(logits, labels) - avg_loss = paddle.mean(loss) - if global_step % self.args.logging_steps == 0: - logger.info( - "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" - % (global_step, 
self.args.num_training_steps, epoch, - step, avg_loss, - self.args.logging_steps / (time.time() - tic_train))) - tic_train = time.time() +def ner_collator(tokenizer, args): + batchify_fn = lambda samples, fn=Dict({ + 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'), # input + 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'), # segment + 'labels': Pad(axis=0, pad_val=args.ignore_label, dtype='int64') # label + }): fn(samples) - avg_loss.backward() - self.optimizer.step() - self.lr_scheduler.step() - self.optimizer.clear_grad() + return batchify_fn - if global_step % self.args.eval_steps == 0 or global_step == self.args.num_training_steps: - if paddle.distributed.get_rank() == 0: - dev_f1 = evaluate(self.model, loss_fct, metric, - self.dev_dl, label_num, "valid") - test_f1 = evaluate(self.model, loss_fct, metric, - self.test_dl, label_num, "test") - if dev_f1 > best_dev_f1: - best_dev_f1 = dev_f1 - corr_test_f1 = test_f1 - logger.warning( - "Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % - (best_dev_f1, corr_test_f1)) - if global_step >= self.args.num_training_steps: - logger.warning( - "Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % - (best_dev_f1, corr_test_f1)) - return +def ner_trans_fn(example, tokenizer, args): + return tokenize_and_align_labels( + example, + tokenizer=tokenizer, + no_entity_id=args.no_entity_id, + max_seq_len=args.max_seq_length) diff --git a/examples/language_model/ernie-1.0/finetune/utils.py b/examples/language_model/ernie-1.0/finetune/utils.py index fbe639bd3b42..93573bfc7ed6 100644 --- a/examples/language_model/ernie-1.0/finetune/utils.py +++ b/examples/language_model/ernie-1.0/finetune/utils.py @@ -144,6 +144,22 @@ class DataTrainingArguments: "help": "Whether to lower case the input text. Should be True for uncased models and False for cased models." }, ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets"}) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={ + "help": "The number of processes to use for the preprocessing." + }, ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": + "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`." 
+ }, ) @dataclass @@ -168,3 +184,9 @@ class ModelArguments: "help": "Pretrained tokenizer name or path if not the same as model_name" }) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": + "Path to directory to store the pretrained models downloaded from huggingface.co" + }, ) diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py index b42bb4dec49e..fe92026cf729 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -834,6 +834,12 @@ def compute_loss(self, model, inputs, return_outputs=False): else: labels = None + if self.criterion is not None and "start_positions" in inputs and "end_positions" in inputs: + labels = (inputs.pop("start_positions"), + inputs.pop("end_positions")) + else: + labels = None + outputs = model(**inputs) if self.criterion is not None: diff --git a/paddlenlp/trainer/utils/helper.py b/paddlenlp/trainer/utils/helper.py index 380a8ea464ff..017fc3032282 100644 --- a/paddlenlp/trainer/utils/helper.py +++ b/paddlenlp/trainer/utils/helper.py @@ -29,10 +29,10 @@ def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100): if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: return paddle.concat((tensor1, tensor2), axis=0) - raise ValueError("Error") + # raise ValueError("Error") # Let's figure out the new shape new_shape = (tensor1.shape[0] + tensor2.shape[0], max( - tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:] + tensor1.shape[1], tensor2.shape[1])) + tuple(tensor1.shape[2:]) # Now let's fill the result tensor # result = tensor1.new_full(new_shape, padding_index) From 36ff4cd9f5201e7eac7a57cfced4b4cf6e5ccce6 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 30 Mar 2022 16:57:17 +0800 Subject: [PATCH 15/15] fix as reviews. --- .../ernie-1.0/finetune/question_answering.py | 1 + .../ernie-1.0/finetune/run_ner.py | 27 +- .../ernie-1.0/finetune/run_qa.py | 5 +- .../ernie-1.0/finetune/run_seq_cls.py | 35 +- .../finetune/token_classification.py | 1 - .../ernie-1.0/finetune/utils.py | 6 + paddlenlp/trainer/trainer_args.py | 30 +- paddlenlp/trainer/trainer_base.py | 2 - paddlenlp/trainer/trainer_callback.py | 4 +- paddlenlp/trainer/utils/logging.py | 344 ------------------ 10 files changed, 55 insertions(+), 400 deletions(-) delete mode 100644 paddlenlp/trainer/utils/logging.py diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py index b9001fefe4d9..12a2822eb2c2 100644 --- a/examples/language_model/ernie-1.0/finetune/question_answering.py +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -1,3 +1,4 @@ +# Copyright 2020-present the HuggingFace Inc. team. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/language_model/ernie-1.0/finetune/run_ner.py b/examples/language_model/ernie-1.0/finetune/run_ner.py index a40be180bc09..0cbb5e464fb5 100644 --- a/examples/language_model/ernie-1.0/finetune/run_ner.py +++ b/examples/language_model/ernie-1.0/finetune/run_ner.py @@ -128,9 +128,9 @@ def forward(self, *args, **kwargs): batchify_fn = ner_collator(tokenizer, data_args) # Dataset pre-process - train_ds = all_ds["train"].map(trans_fn) - dev_ds = all_ds["dev"].map(trans_fn) - test_ds = all_ds["test"].map(trans_fn) + train_dataset = all_ds["train"].map(trans_fn) + eval_dataset = all_ds["dev"].map(trans_fn) + test_dataset = all_ds["test"].map(trans_fn) # Define the metrics of tasks. # Metrics @@ -159,13 +159,13 @@ def compute_metrics(p): } trainer = Trainer( - model, - loss_fct, - training_args, - batchify_fn, - train_ds, - dev_ds, - tokenizer, + model=model, + criterion=loss_fct, + args=training_args, + data_collator=batchify_fn, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, compute_metrics=compute_metrics, ) # Log model and data config @@ -190,7 +190,7 @@ def compute_metrics(p): eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) - test_ret = trainer.predict(test_ds) + test_ret = trainer.predict(test_dataset) trainer.log_metrics("test", test_ret.metrics) if test_ret.label_ids is None: paddle.save( @@ -204,7 +204,10 @@ def compute_metrics(p): paddle.static.InputSpec( shape=[None, None], dtype="int64") # segment_ids ] - trainer.export_model(input_spec=input_spec, load_best_model=True) + trainer.export_model( + input_spec=input_spec, + load_best_model=True, + output_dir=model_args.export_model_dir) if __name__ == "__main__": diff --git a/examples/language_model/ernie-1.0/finetune/run_qa.py b/examples/language_model/ernie-1.0/finetune/run_qa.py index ce81b018135e..7ccaf482fb2e 100644 --- a/examples/language_model/ernie-1.0/finetune/run_qa.py +++ b/examples/language_model/ernie-1.0/finetune/run_qa.py @@ -231,7 +231,10 @@ def compute_metrics(p: EvalPrediction): paddle.static.InputSpec( shape=[None, None], dtype="int64") # segment_ids ] - trainer.export_model(input_spec=input_spec, load_best_model=True) + trainer.export_model( + input_spec=input_spec, + load_best_model=True, + output_dir=model_args.export_model_dir) if __name__ == "__main__": diff --git a/examples/language_model/ernie-1.0/finetune/run_seq_cls.py b/examples/language_model/ernie-1.0/finetune/run_seq_cls.py index 12a442358e18..e0440e4f1daa 100644 --- a/examples/language_model/ernie-1.0/finetune/run_seq_cls.py +++ b/examples/language_model/ernie-1.0/finetune/run_seq_cls.py @@ -92,13 +92,13 @@ def do_train(): training_args.per_device_eval_batch_size = config["batch_size"] dataset_config = data_args.dataset.split(" ") - all_ds = load_dataset( + raw_datasets = load_dataset( dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], ) - data_args.label_list = getattr(all_ds['train'], "label_list", None) - num_classes = 1 if all_ds["train"].label_list == None else len(all_ds[ - 'train'].label_list) + data_args.label_list = getattr(raw_datasets['train'], "label_list", None) + num_classes = 1 if raw_datasets["train"].label_list == None else len( + raw_datasets['train'].label_list) # Define tokenizer, model, loss function. 
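Across run_ner.py, run_qa.py and (below) run_seq_cls.py, this patch now routes export through `trainer.export_model(..., output_dir=model_args.export_model_dir)` with the same two-tensor input spec. The `[None, None]` shapes leave both batch size and sequence length dynamic in the exported inference model; an equivalent spec in isolation looks like this (the `name` arguments are added here for readability and are not in the scripts):

```python
import paddle

# [None, None] leaves batch size and sequence length unspecified, so the
# exported inference model accepts variable-length batches.
input_spec = [
    paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
    paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"),
]
```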
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) @@ -117,9 +117,9 @@ def do_train(): batchify_fn = defaut_collator(tokenizer, data_args) # Dataset pre-process - train_ds = all_ds["train"].map(trans_fn) - dev_ds = all_ds["dev"].map(trans_fn) - test_ds = all_ds["test"].map(trans_fn) + train_dataset = raw_datasets["train"].map(trans_fn) + eval_dataset = raw_datasets["dev"].map(trans_fn) + test_dataset = raw_datasets["test"].map(trans_fn) # Define the metrics of tasks. def compute_metrics(p): @@ -139,13 +139,13 @@ def compute_metrics(p): return {"accuracy": accu} trainer = Trainer( - model, - loss_fct, - training_args, - batchify_fn, - train_ds, - dev_ds, - tokenizer, + model=model, + criterion=loss_fct, + args=training_args, + data_collator=batchify_fn, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, compute_metrics=compute_metrics, ) # Log model and data config @@ -170,7 +170,7 @@ def compute_metrics(p): eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) - test_ret = trainer.predict(test_ds) + test_ret = trainer.predict(test_dataset) trainer.log_metrics("test", test_ret.metrics) if test_ret.label_ids is None: paddle.save( @@ -184,7 +184,10 @@ def compute_metrics(p): paddle.static.InputSpec( shape=[None, None], dtype="int64") # segment_ids ] - trainer.export_model(input_spec=input_spec, load_best_model=True) + trainer.export_model( + input_spec=input_spec, + load_best_model=True, + output_dir=model_args.export_model_dir) if __name__ == "__main__": diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py index c25898c9b429..d001edb7e8ed 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -1,4 +1,3 @@ -# Copyright 2020-present the HuggingFace Inc. team. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/language_model/ernie-1.0/finetune/utils.py b/examples/language_model/ernie-1.0/finetune/utils.py index 93573bfc7ed6..149808446039 100644 --- a/examples/language_model/ernie-1.0/finetune/utils.py +++ b/examples/language_model/ernie-1.0/finetune/utils.py @@ -190,3 +190,9 @@ class ModelArguments: "help": "Path to directory to store the pretrained models downloaded from huggingface.co" }, ) + export_model_dir: Optional[str] = field( + default=None, + metadata={ + "help": + "Path to directory to store the pretrained models downloaded from huggingface.co" + }, ) diff --git a/paddlenlp/trainer/trainer_args.py b/paddlenlp/trainer/trainer_args.py index f7e713383bbc..9a03ff536127 100644 --- a/paddlenlp/trainer/trainer_args.py +++ b/paddlenlp/trainer/trainer_args.py @@ -23,17 +23,17 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from .utils import logging +# from .utils import logging from .trainer_utils import ( SchedulerType, IntervalStrategy, EvaluationStrategy, OptimizerNames, ) -logger = logging.get_logger(__name__) -log_levels = logging.get_log_levels_dict().copy() -trainer_log_levels = dict(**log_levels, passive=-1) - +# logger = logging.get_logger(__name__) +# log_levels = logging.get_log_levels_dict().copy() +# trainer_log_levels = dict(**log_levels, passive=-1) +from paddlenlp.utils.log import logger import paddle @@ -376,20 +376,6 @@ class TrainingArguments: warmup_steps: int = field( default=0, metadata={"help": "Linear warmup over warmup_steps."}) - log_level: Optional[str] = field( - default="passive", - metadata={ - "help": - "Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults to 'passive'.", - "choices": trainer_log_levels.keys(), - }, ) - log_level_replica: Optional[str] = field( - default="passive", - metadata={ - "help": - "Logger log level to use on replica nodes. 
Same choices and defaults as ``log_level``", - "choices": trainer_log_levels.keys(), - }, ) log_on_each_node: bool = field( default=True, metadata={ @@ -571,8 +557,8 @@ def __post_init__(self): self.local_rank = env_local_rank # convert to int - self.log_level = trainer_log_levels[self.log_level] - self.log_level_replica = trainer_log_levels[self.log_level_replica] + self.log_level = -1 + self.log_level_replica = -1 # expand paths, if not os.makedirs("~/bar") will make directory # in the current directory instead of the actual home @@ -585,7 +571,7 @@ def __post_init__(self): self.logging_dir = os.path.expanduser(self.logging_dir) if self.disable_tqdm is None: - self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN + self.disable_tqdm = False # logger.getEffectiveLevel() > logging.WARN if isinstance(self.evaluation_strategy, EvaluationStrategy): warnings.warn( diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py index fe92026cf729..1287f985f482 100644 --- a/paddlenlp/trainer/trainer_base.py +++ b/paddlenlp/trainer/trainer_base.py @@ -70,8 +70,6 @@ nested_detach, nested_numpify, nested_truncate, ) -# from .utils import logging -# logger = logging.get_logger(__name__) DEFAULT_CALLBACKS = [DefaultFlowCallback] diff --git a/paddlenlp/trainer/trainer_callback.py b/paddlenlp/trainer/trainer_callback.py index 9549a3d476e5..79b6ca490a31 100644 --- a/paddlenlp/trainer/trainer_callback.py +++ b/paddlenlp/trainer/trainer_callback.py @@ -25,9 +25,9 @@ from .trainer_utils import IntervalStrategy, has_length from .trainer_args import TrainingArguments -from .utils import logging +from paddlenlp.utils.log import logger -logger = logging.get_logger(__name__) +# logger = logging.get_logger(__name__) @dataclass diff --git a/paddlenlp/trainer/utils/logging.py b/paddlenlp/trainer/utils/logging.py deleted file mode 100644 index 15814dccd143..000000000000 --- a/paddlenlp/trainer/utils/logging.py +++ /dev/null @@ -1,344 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Optuna, Hugging Face -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Logging utilities.""" - -import logging -import os -import sys -import threading -from logging import CRITICAL # NOQA -from logging import DEBUG # NOQA -from logging import ERROR # NOQA -from logging import FATAL # NOQA -from logging import INFO # NOQA -from logging import NOTSET # NOQA -from logging import WARN # NOQA -from logging import WARNING # NOQA -from typing import Optional - -from tqdm import auto as tqdm_lib - -_lock = threading.Lock() -_default_handler: Optional[logging.Handler] = None - -log_levels = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, -} - -_default_log_level = logging.WARNING - -_tqdm_active = True - - -def _get_default_logging_level(): - """ - If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. 
If it is - not - fall back to `_default_log_level` - """ - env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) - if env_level_str: - if env_level_str in log_levels: - return log_levels[env_level_str] - else: - logging.getLogger().warning( - f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, " - f"has to be one of: { ', '.join(log_levels.keys()) }") - return _default_log_level - - -def _get_library_name() -> str: - - return __name__.split(".")[0] - - -def _get_library_root_logger() -> logging.Logger: - - return logging.getLogger(_get_library_name()) - - -def _configure_library_root_logger() -> None: - - global _default_handler - - with _lock: - if _default_handler: - # This library has already configured the library root logger. - return - _default_handler = logging.StreamHandler() # Set sys.stderr as stream. - _default_handler.flush = sys.stderr.flush - - # Apply our default configuration to the library root logger. - library_root_logger = _get_library_root_logger() - library_root_logger.addHandler(_default_handler) - library_root_logger.setLevel(_get_default_logging_level()) - library_root_logger.propagate = False - - -def _reset_library_root_logger() -> None: - - global _default_handler - - with _lock: - if not _default_handler: - return - - library_root_logger = _get_library_root_logger() - library_root_logger.removeHandler(_default_handler) - library_root_logger.setLevel(logging.NOTSET) - _default_handler = None - - -def get_log_levels_dict(): - return log_levels - - -def get_logger(name: Optional[str]=None) -> logging.Logger: - """ - Return a logger with the specified name. - - This function is not supposed to be directly accessed unless you are writing a custom transformers module. - """ - - if name is None: - name = _get_library_name() - - _configure_library_root_logger() - return logging.getLogger(name) - - -def get_verbosity() -> int: - """ - Return the current level for the 🤗 Transformers's root logger as an int. - - Returns: - `int`: The logging level. - - - - 🤗 Transformers has following logging levels: - - - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL` - - 40: `transformers.logging.ERROR` - - 30: `transformers.logging.WARNING` or `transformers.logging.WARN` - - 20: `transformers.logging.INFO` - - 10: `transformers.logging.DEBUG` - - """ - - _configure_library_root_logger() - return _get_library_root_logger().getEffectiveLevel() - - -def set_verbosity(verbosity: int) -> None: - """ - Set the verbosity level for the 🤗 Transformers's root logger. 
- - Args: - verbosity (`int`): - Logging level, e.g., one of: - - - `transformers.logging.CRITICAL` or `transformers.logging.FATAL` - - `transformers.logging.ERROR` - - `transformers.logging.WARNING` or `transformers.logging.WARN` - - `transformers.logging.INFO` - - `transformers.logging.DEBUG` - """ - - _configure_library_root_logger() - _get_library_root_logger().setLevel(verbosity) - - -def set_verbosity_info(): - """Set the verbosity to the `INFO` level.""" - return set_verbosity(INFO) - - -def set_verbosity_warning(): - """Set the verbosity to the `WARNING` level.""" - return set_verbosity(WARNING) - - -def set_verbosity_debug(): - """Set the verbosity to the `DEBUG` level.""" - return set_verbosity(DEBUG) - - -def set_verbosity_error(): - """Set the verbosity to the `ERROR` level.""" - return set_verbosity(ERROR) - - -def disable_default_handler() -> None: - """Disable the default handler of the HuggingFace Transformers's root logger.""" - - _configure_library_root_logger() - - assert _default_handler is not None - _get_library_root_logger().removeHandler(_default_handler) - - -def enable_default_handler() -> None: - """Enable the default handler of the HuggingFace Transformers's root logger.""" - - _configure_library_root_logger() - - assert _default_handler is not None - _get_library_root_logger().addHandler(_default_handler) - - -def add_handler(handler: logging.Handler) -> None: - """adds a handler to the HuggingFace Transformers's root logger.""" - - _configure_library_root_logger() - - assert handler is not None - _get_library_root_logger().addHandler(handler) - - -def remove_handler(handler: logging.Handler) -> None: - """removes given handler from the HuggingFace Transformers's root logger.""" - - _configure_library_root_logger() - - assert handler is not None and handler not in _get_library_root_logger( - ).handlers - _get_library_root_logger().removeHandler(handler) - - -def disable_propagation() -> None: - """ - Disable propagation of the library log outputs. Note that log propagation is disabled by default. - """ - - _configure_library_root_logger() - _get_library_root_logger().propagate = False - - -def enable_propagation() -> None: - """ - Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to - prevent double logging if the root logger has been configured. - """ - - _configure_library_root_logger() - _get_library_root_logger().propagate = True - - -def enable_explicit_format() -> None: - """ - Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows: - ``` - [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE - ``` - All handlers currently bound to the root logger are affected by this method. - """ - handlers = _get_library_root_logger().handlers - - for handler in handlers: - formatter = logging.Formatter( - "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") - handler.setFormatter(formatter) - - -def reset_format() -> None: - """ - Resets the formatting for HuggingFace Transformers's loggers. - - All handlers currently bound to the root logger are affected by this method. 
- """ - handlers = _get_library_root_logger().handlers - - for handler in handlers: - handler.setFormatter(None) - - -def warning_advice(self, *args, **kwargs): - """ - This method is identical to `logger.warning()`, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this - warning will not be printed - """ - no_advisory_warnings = os.getenv("TRANSFORMERS_NO_ADVISORY_WARNINGS", False) - if no_advisory_warnings: - return - self.warning(*args, **kwargs) - - -logging.Logger.warning_advice = warning_advice - - -class EmptyTqdm: - """Dummy tqdm which doesn't do anything.""" - - def __init__(self, *args, **kwargs): # pylint: disable=unused-argument - self._iterator = args[0] if args else None - - def __iter__(self): - return iter(self._iterator) - - def __getattr__(self, _): - """Return empty function.""" - - def empty_fn(*args, **kwargs): # pylint: disable=unused-argument - return - - return empty_fn - - def __enter__(self): - return self - - def __exit__(self, type_, value, traceback): - return - - -class _tqdm_cls: - def __call__(self, *args, **kwargs): - if _tqdm_active: - return tqdm_lib.tqdm(*args, **kwargs) - else: - return EmptyTqdm(*args, **kwargs) - - def set_lock(self, *args, **kwargs): - self._lock = None - if _tqdm_active: - return tqdm_lib.tqdm.set_lock(*args, **kwargs) - - def get_lock(self): - if _tqdm_active: - return tqdm_lib.tqdm.get_lock() - - -tqdm = _tqdm_cls() - - -def is_progress_bar_enabled() -> bool: - """Return a boolean indicating whether tqdm progress bars are enabled.""" - global _tqdm_active - return bool(_tqdm_active) - - -def enable_progress_bar(): - """Enable tqdm progress bar.""" - global _tqdm_active - _tqdm_active = True - - -def disable_progress_bar(): - """Enable tqdm progress bar.""" - global _tqdm_active - _tqdm_active = False
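The net effect of deleting `paddlenlp/trainer/utils/logging.py` is that the trainer stack stops carrying its own copy of the HuggingFace logging helpers (with `log_level`/`log_level_replica` hard-wired to -1 and `disable_tqdm` defaulting to False in `trainer_args.py`) and relies on the shared PaddleNLP logger the rest of the series already imports. After this patch the logging calls reduce to the pattern below; the checkpoint path and scores are made-up illustrations:

```python
from paddlenlp.utils.log import logger

ckpt = "./output/checkpoint-1000"  # hypothetical path, for illustration only
logger.info(f"Checkpoint detected, resuming training at {ckpt}.")
logger.warning("Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % (0.9123, 0.9045))
```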