diff --git a/examples/language_model/ernie-1.0/finetune/config.yml b/examples/language_model/ernie-1.0/finetune/config.yml index fcbd7d433af5..298b430964df 100644 --- a/examples/language_model/ernie-1.0/finetune/config.yml +++ b/examples/language_model/ernie-1.0/finetune/config.yml @@ -5,10 +5,10 @@ DefaultArgs: num_train_epochs: 3 batch_size: 64 max_seq_length: 128 - weight_decay: 0.0 + weight_decay: 0.01 logging_steps: 10 - valid_steps: 100 - minimum_valid_times: 20 # If under valid_steps, the valid time is less then 20, the config of valid_steps will be changed. + valid_steps: 200 + minimum_valid_times: 20 max_steps: -1 warmup_steps: 0 metric: "Accuracy" @@ -34,25 +34,30 @@ SequenceClassification: max_seq_length: 256 batch_size: 32 xnli_cn: - learning_rate: 0.00005 + learning_rate: 0.0001 num_train_epochs: 3 + batch_size: 256 chnsenticorp_v2: - learning_rate: 0.00001 - num_train_epochs: 5 + learning_rate: 0.00005 + batch_size: 16 + num_train_epochs: 8 # Datasets which used for token classfication TokenClassification: peoples_daily_ner: - num_train_epochs: 5 + learning_rate: 0.00005 + num_train_epochs: 8 + batch_size: 16 msra_ner: num_train_epochs: 3 # Datasets which used for question answersing QuestionAnswering: cmrc2018: - num_train_epochs: 1 - batch_size: 12 - max_seq_length: 384 + learning_rate: 0.00005 + num_train_epochs: 5 + batch_size: 32 + max_seq_length: 512 dureader_nlp: num_train_epochs: 1 batch_size: 12 diff --git a/examples/language_model/ernie-1.0/finetune/finetune.py b/examples/language_model/ernie-1.0/finetune/finetune.py index 705efec52564..0a6db96097da 100644 --- a/examples/language_model/ernie-1.0/finetune/finetune.py +++ b/examples/language_model/ernie-1.0/finetune/finetune.py @@ -13,12 +13,10 @@ # limitations under the License. 
import argparse -import logging import os import sys import random import time -import math import copy import yaml from functools import partial @@ -27,24 +25,20 @@ import numpy as np import paddle -from paddle.io import DataLoader import paddle.nn as nn import paddle.nn.functional as F -from paddle.metric import Accuracy -from paddlenlp.metrics.squad import squad_evaluate, compute_prediction import paddlenlp from paddlenlp.datasets import load_dataset -from paddlenlp.data import Stack, Tuple, Pad, Dict from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer from paddlenlp.transformers import AutoModelForTokenClassification from paddlenlp.transformers import AutoModelForQuestionAnswering -from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger sys.path.insert(0, os.path.abspath(".")) -from sequence_classification import CLUE_TRAINING -from question_answering import QA_TRAINING +from sequence_classification import ClueTrainer, SeqTrainer +from question_answering import MrcTrainer +from token_classification import NerTrainer ALL_TASKS = { "SequenceClassification": [], @@ -77,7 +72,6 @@ def parse_args(): parser = argparse.ArgumentParser() - # Required parameters parser.add_argument( "--dataset", @@ -115,7 +109,7 @@ def parse_args(): help="Batch size per GPU/CPU for training.", ) group.add_argument( "--weight_decay", - default=0.0, + default=None, type=float, help="Weight decay if we apply some.") @@ -134,6 +128,12 @@ def parse_args(): type=int, default=200, help="Save checkpoint every X updates steps.") + group.add_argument( + "--minimum_valid_times", + type=int, + default=None, + help="If the evaluation count implied by valid_steps would be less than minimum_valid_times, valid_steps is overridden so that the model is evaluated at least minimum_valid_times times."
+ ) group.add_argument( "--max_steps", default=-1, @@ -257,12 +257,29 @@ def do_train(args): if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) - if 'clue' in args.dataset: - trainer = CLUE_TRAINING(all_ds["train"], all_ds["dev"], model, - tokenizer, args) - elif "Answering" in config["model"]: - trainer = QA_TRAINING(all_ds["train"], all_ds["dev"], model, tokenizer, - args) + if "SequenceClassification" in config["model"]: + if 'clue' in args.dataset: + trainer = ClueTrainer(all_ds["train"], all_ds["dev"], model, + tokenizer, args) + else: + trainer = SeqTrainer( + all_ds["train"], + all_ds["dev"], + model, + tokenizer, + args, + test_ds=all_ds["test"]) + elif "QuestionAnswering" in config["model"]: + trainer = MrcTrainer(all_ds["train"], all_ds["dev"], model, tokenizer, + args) + elif 'TokenClassification' in config["model"]: + trainer = NerTrainer( + all_ds["train"], + all_ds["dev"], + model, + tokenizer, + args, + test_ds=all_ds["test"]) trainer.train() trainer.eval() diff --git a/examples/language_model/ernie-1.0/finetune/question_answering.py b/examples/language_model/ernie-1.0/finetune/question_answering.py index c2255b831747..03be17723111 100644 --- a/examples/language_model/ernie-1.0/finetune/question_answering.py +++ b/examples/language_model/ernie-1.0/finetune/question_answering.py @@ -22,7 +22,7 @@ from paddlenlp.data import Pad, Stack, Tuple, Dict from paddlenlp.metrics.squad import squad_evaluate, compute_prediction -from sequence_classification import BaseTrainer +from trainer_base import TrainerBase from paddlenlp.utils.log import logger @@ -188,7 +188,7 @@ def prepare_validation_features(examples, tokenizer, args): return tokenized_examples -class QA_TRAINING(BaseTrainer): +class MrcTrainer(TrainerBase): def __init__(self, train_ds, dev_ds, model, tokenizer, args): super().__init__() self.rank = paddle.distributed.get_rank() @@ -257,5 +257,8 @@ def train(self): self.lr_scheduler.step() self.optimizer.clear_grad() + if global_step % self.args.valid_steps == 0: + self.eval() + if global_step == self.args.num_training_steps: break diff --git a/examples/language_model/ernie-1.0/finetune/sequence_classification.py b/examples/language_model/ernie-1.0/finetune/sequence_classification.py index 84a918a95e15..324399299eee 100644 --- a/examples/language_model/ernie-1.0/finetune/sequence_classification.py +++ b/examples/language_model/ernie-1.0/finetune/sequence_classification.py @@ -12,133 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import paddle -from paddle.io import DataLoader -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.metric import Accuracy -import numpy as np - -from paddlenlp.data import Stack, Tuple, Pad, Dict - -import argparse import os -import sys -import random import time -import math -import copy -import yaml from functools import partial -import numpy as np import paddle -from paddle.io import DataLoader import paddle.nn as nn import paddle.nn.functional as F from paddle.metric import Accuracy -from paddlenlp.metrics.squad import squad_evaluate, compute_prediction +import numpy as np import paddlenlp -from paddlenlp.datasets import load_dataset from paddlenlp.data import Stack, Tuple, Pad, Dict -from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer -from paddlenlp.transformers import AutoModelForTokenClassification -from paddlenlp.transformers import AutoModelForQuestionAnswering -from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.utils.log import logger +from trainer_base import TrainerBase -class BaseTrainer(object): - def create_dataloader(self, - dataset, - mode='train', - batch_size=16, - batchify_fn=None, - trans_fn=None, - batched=False): - if trans_fn: - dataset = dataset.map(trans_fn, batched=batched) - - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) - - def prepare_train_config(self): - if self.args.max_steps > 0: - self.args.num_training_steps = self.args.max_steps - self.args.num_train_epochs = math.ceil( - self.args.num_training_steps / len(self.train_dl)) - else: - self.args.num_training_steps = len( - self.train_dl) * self.args.num_train_epochs - self.args.num_train_epochs = self.args.num_train_epochs - - if self.args.num_training_steps // self.args.valid_steps < 20: - exp_step = self.args.num_training_steps / 20 - exp_step = max(int(exp_step - exp_step % 10), 10) - logger.info("Set eval step to %d" % exp_step) - self.args.valid_steps = exp_step - - warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion - - self.lr_scheduler = LinearDecayWithWarmup( - self.args.learning_rate, self.args.num_training_steps, warmup) - - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. 
- decay_params = [ - p.name for n, p in self.model.named_parameters() - if not any(nd in n for nd in ["bias", "norm"]) - ] - - self.optimizer = paddle.optimizer.AdamW( - learning_rate=self.lr_scheduler, - beta1=0.9, - beta2=0.999, - epsilon=self.args.adam_epsilon, - parameters=self.model.parameters(), - weight_decay=self.args.weight_decay, - apply_decay_param_fun=lambda x: x in decay_params, - grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) - - def print_config(self): - logger.info('{:^40}'.format("Configuration Arguments")) - logger.info('{:20}:{}'.format("paddle commit id", - paddle.version.commit)) - for arg in vars(self.args): - logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) +def convert_example(example, tokenizer, max_seq_length=512, is_test=False): + if "text_b" in example.keys(): + text = example["text_a"] + text_pair = example["text_b"] + else: + text = example["text"] + text_pair = None -def clue_trans_fn(examples, tokenizer, args): - return convert_clue( - examples, - tokenizer=tokenizer, - label_list=args.label_list, - max_seq_length=args.max_seq_length) + encoded_inputs = tokenizer( + text=text, text_pair=text_pair, max_seq_len=max_seq_length) + input_ids = encoded_inputs["input_ids"] + token_type_ids = encoded_inputs["token_type_ids"] + if is_test: + return input_ids, token_type_ids + label = np.array([example["label"]], dtype="int64") + return input_ids, token_type_ids, label -def clue_batchify_fn(tokenizer, args): - batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=tokenizer.pad_token_id), # input - Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment - Stack(dtype="int64" if args.label_list else "float32") # label - ): fn(samples) - return batchify_fn +def seq_trans_fn(example, tokenizer, args): + return convert_example( + example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) def convert_clue(example, @@ -199,6 +112,24 @@ def convert_clue(example, return example['input_ids'], example['token_type_ids'] +def clue_trans_fn(examples, tokenizer, args): + return convert_clue( + examples, + tokenizer=tokenizer, + label_list=args.label_list, + max_seq_length=args.max_seq_length) + + +def clue_batchify_fn(tokenizer, args): + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment + Stack(dtype="int64" if args.label_list else "float32") # label + ): fn(samples) + + return batchify_fn + + @paddle.no_grad() def evaluate(model, criterion, metric, data_loader, mode="dev"): """ @@ -228,37 +159,16 @@ def evaluate(model, criterion, metric, data_loader, mode="dev"): return accu -def create_dataloader(dataset, - mode='train', - batch_size=16, - batched=False, - batchify_fn=None, - trans_fn=None): - if trans_fn: - dataset = dataset.map(trans_fn, batched=False) - - shuffle = True if mode == 'train' else False - if mode == 'train': - batch_sampler = paddle.io.DistributedBatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - else: - batch_sampler = paddle.io.BatchSampler( - dataset, batch_size=batch_size, shuffle=shuffle) - - return paddle.io.DataLoader( - dataset=dataset, - batch_sampler=batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True) - - -class CLUE_TRAINING(BaseTrainer): - def __init__(self, train_ds, dev_ds, model, tokenizer, args): +class ClueTrainer(TrainerBase): + def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, + **kwargs): super().__init__() self.rank = 
paddle.distributed.get_rank() self.train_ds = train_ds self.dev_ds = dev_ds + if "test_ds" in kwargs.keys(): + self.test_ds = kwargs["test_ds"] + self.model = model self.tokenizer = tokenizer self.args = args @@ -277,6 +187,8 @@ def dataloader_inner(self): self.dev_dl = self.create_dataloader( self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + self.test_dl = None + def eval(self): pass @@ -341,7 +253,12 @@ def train(self): else: dev_acc = -1.0 metric.reset() - test_acc = -1 + + if self.test_dl is not None: + test_acc = evaluate(self.model, loss_fct, metric, + self.test_dl, "test") + else: + test_acc = -1.0 metric.reset() logger.info("eval done total : %s s" % @@ -350,10 +267,26 @@ def train(self): best_dev_acc = dev_acc corr_test_acc = test_acc + logger.warning( + "best_dev_acc: {:.6f}, corr_test_acc: {:.6f}".format( + best_dev_acc, corr_test_acc)) + if global_step >= self.args.num_training_steps: - logger.info("best_dev_acc: {:.6f}".format(best_dev_acc)) - logger.info("corr_test_acc: {:.6f}".format(corr_test_acc)) return - logger.info("best_dev_acc: {:.6f}".format(best_dev_acc)) - logger.info("corr_test_acc: {:.6f}".format(corr_test_acc)) + logger.warning("best_dev_acc: {:.6f}, corr_test_acc: {:.6f}".format( + best_dev_acc, corr_test_acc)) + + +class SeqTrainer(ClueTrainer): + def dataloader_inner(self): + trans_fn = partial( + seq_trans_fn, tokenizer=self.tokenizer, args=self.args) + batchify_fn = clue_batchify_fn(self.tokenizer, self.args) + + self.train_dl = self.create_dataloader( + self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) + self.dev_dl = self.create_dataloader( + self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + self.test_dl = self.create_dataloader( + self.test_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) diff --git a/examples/language_model/ernie-1.0/finetune/token_classification.py b/examples/language_model/ernie-1.0/finetune/token_classification.py index e69de29bb2d1..bb20a1764026 100644 --- a/examples/language_model/ernie-1.0/finetune/token_classification.py +++ b/examples/language_model/ernie-1.0/finetune/token_classification.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import random +import time +import math +from functools import partial + +import numpy as np +import paddle + +import paddlenlp as ppnlp +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.metrics import ChunkEvaluator +from paddlenlp.datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad, Dict +from paddlenlp.utils.log import logger + +from trainer_base import TrainerBase + + +@paddle.no_grad() +def evaluate(model, loss_fct, metric, data_loader, label_num, mode="valid"): + model.eval() + metric.reset() + avg_loss, precision, recall, f1_score = 0, 0, 0, 0 + for batch in data_loader: + input_ids, token_type_ids, length, labels = batch + logits = model(input_ids, token_type_ids) + loss = loss_fct(logits, labels) + avg_loss = paddle.mean(loss) + preds = logits.argmax(axis=2) + num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute( + length, preds, labels) + metric.update(num_infer_chunks.numpy(), + num_label_chunks.numpy(), num_correct_chunks.numpy()) + precision, recall, f1_score = metric.accumulate() + logger.info("%s: eval loss: %f, precision: %f, recall: %f, f1: %f" % + (mode, avg_loss, precision, recall, f1_score)) + model.train() + + return f1_score + + +def tokenize_and_align_labels(example, tokenizer, no_entity_id, + max_seq_len=512): + labels = example['labels'] + example = example['tokens'] + tokenized_input = tokenizer( + example, + return_length=True, + is_split_into_words=True, + max_seq_len=max_seq_len) + + # -2 for [CLS] and [SEP] + if len(tokenized_input['input_ids']) - 2 < len(labels): + labels = labels[:len(tokenized_input['input_ids']) - 2] + tokenized_input['labels'] = [no_entity_id] + labels + [no_entity_id] + tokenized_input['labels'] += [no_entity_id] * ( + len(tokenized_input['input_ids']) - len(tokenized_input['labels'])) + return tokenized_input + + +class NerTrainer(TrainerBase): + def __init__(self, train_ds, dev_ds, model, tokenizer, args, *arg, + **kwargs): + super().__init__() + self.rank = paddle.distributed.get_rank() + self.train_ds = train_ds + self.dev_ds = dev_ds + if "test_ds" in kwargs.keys(): + self.test_ds = kwargs["test_ds"] + self.model = model + self.tokenizer = tokenizer + self.args = args + self.dataloader_inner() + self.prepare_train_config() + self.print_config() + + def dataloader_inner(self): + label_list = self.train_ds.label_list + label_num = len(label_list) + no_entity_id = label_num - 1 + + trans_fn = partial( + tokenize_and_align_labels, + tokenizer=self.tokenizer, + no_entity_id=no_entity_id, + max_seq_len=self.args.max_seq_length) + + ignore_label = -100 + + batchify_fn = lambda samples, fn=Dict({ + 'input_ids': Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype='int32'), # input + 'token_type_ids': Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype='int32'), # segment + 'seq_len': Stack(dtype='int64'), # seq_len + 'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64') # label + }): fn(samples) + + self.train_dl = self.create_dataloader( + self.train_ds, "train", self.args.batch_size, batchify_fn, trans_fn) + self.dev_dl = self.create_dataloader( + self.dev_ds, "dev", self.args.batch_size, batchify_fn, trans_fn) + self.test_dl = self.create_dataloader( + self.test_ds, "test", self.args.batch_size, batchify_fn, trans_fn) + + def train(self): + ignore_label = -100 + label_num = len(self.train_ds.label_list) + + loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label) + metric = 
ChunkEvaluator(label_list=self.args.label_list) + + global_step = 0 + tic_train = time.time() + best_dev_f1 = -1 + corr_test_f1 = -1 + + for epoch in range(self.args.num_train_epochs): + for step, batch in enumerate(self.train_dl): + global_step += 1 + input_ids, token_type_ids, _, labels = batch + logits = self.model(input_ids, token_type_ids) + loss = loss_fct(logits, labels) + avg_loss = paddle.mean(loss) + + if global_step % self.args.logging_steps == 0: + logger.info( + "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" + % (global_step, self.args.num_training_steps, epoch, + step, avg_loss, + self.args.logging_steps / (time.time() - tic_train))) + tic_train = time.time() + + avg_loss.backward() + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() + + if global_step % self.args.valid_steps == 0 or global_step == self.args.num_training_steps: + if paddle.distributed.get_rank() == 0: + dev_f1 = evaluate(self.model, loss_fct, metric, + self.dev_dl, label_num, "valid") + test_f1 = evaluate(self.model, loss_fct, metric, + self.test_dl, label_num, "test") + if dev_f1 > best_dev_f1: + best_dev_f1 = dev_f1 + corr_test_f1 = test_f1 + logger.warning( + "Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % + (best_dev_f1, corr_test_f1)) + + if global_step >= self.args.num_training_steps: + logger.warning( + "Currently, best_dev_f1: %.4f, corr_test_f1: %.4f" % + (best_dev_f1, corr_test_f1)) + return diff --git a/examples/language_model/ernie-1.0/finetune/trainer_base.py b/examples/language_model/ernie-1.0/finetune/trainer_base.py new file mode 100644 index 000000000000..6b1790063aa4 --- /dev/null +++ b/examples/language_model/ernie-1.0/finetune/trainer_base.py @@ -0,0 +1,372 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.utils.log import logger +from paddle.io import DataLoader + + +class TrainerBase(object): + """ + """ + + def create_dataloader(self, + dataset, + mode='train', + batch_size=16, + batchify_fn=None, + trans_fn=None, + batched=False): + """ + """ + if trans_fn: + dataset = dataset.map(trans_fn, batched=batched) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + def train(self, *args, **kwargs): + """ + """ + pass + + def eval(self, *args, **kwargs): + """ + """ + pass + + def prepare_train_config(self): + """ + """ + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(self.train_dl)) + + else: + self.args.num_training_steps = len( + self.train_dl) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: + exp_step = self.args.num_training_steps / self.args.minimum_valid_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + self.lr_scheduler = LinearDecayWithWarmup( + self.args.learning_rate, self.args.num_training_steps, warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + self.optimizer = paddle.optimizer.AdamW( + learning_rate=self.lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=self.args.adam_epsilon, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + + def print_config(self): + """ + """ + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", + paddle.version.commit)) + for arg in vars(self.args): + logger.info('{:20}:{}'.format(arg, getattr(self.args, arg))) + + +class Trainer: + """ + """ + + def __init__( + self, + model: Union[PreTrainedModel, nn.Layer]=None, + args: TrainingArguments=None, + data_collator: Optional[DataCollator]=None, + train_dataset: Optional[Dataset]=None, + eval_dataset: Optional[Dataset]=None, + tokenizer: Optional[PreTrainedTokenizerBase]=None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]]=None, + optimizers: Tuple[paddle.optim.Optimizer, paddle.optim.lr_scheduler. + LambdaLR]=(None, None), ): + if args is None: + output_dir = "tmp_trainer" + logger.info( + f"No `TrainingArguments` passed, using `output_dir={output_dir}`." 
+ ) + args = TrainingArguments(output_dir=output_dir) + self.args = args + # Seed must be set before instantiating the model when using model + set_seed(self.args.seed) + if model is None: + raise RuntimeError( + "`Trainer` requires either a `model` or `model_init` argument") + + default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding( + tokenizer) + self.data_collator = data_collator if data_collator is not None else default_collator + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.tokenizer = tokenizer + + self.model_wrapped = model + self.model = model + + self.compute_metrics = compute_metrics + self.optimizer, self.lr_scheduler = optimizers + + if args.max_steps > 0: + logger.info( + "max_steps is given, it will override any value given in num_train_epochs" + ) + + if train_dataset is not None and not isinstance( + train_dataset, collections.abc.Sized) and args.max_steps <= 0: + raise ValueError( + "train_dataset does not implement __len__, max_steps has to be specified" + ) + + if args.fp16: + logger.info(f"Using half precision backend") + + def train( + self, + resume_from_checkpoint: Optional[Union[str, bool]]=None, + trial: Union["optuna.Trial", Dict[str, Any]]=None, + ignore_keys_for_eval: Optional[List[str]]=None, + **kwargs, ): + train_dataloader = self.get_train_dataloader() + model = self._wrap_model(self.model_wrapped) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + for epoch in range(epochs_trained, num_train_epochs): + step = -1 + for step, inputs in enumerate(epoch_iterator): + tr_loss_step = self.training_step(model, inputs) + self.scaler.step(self.optimizer) + self.scaler.update() + self.optimizer.step() + + self.lr_scheduler.step() + model.zero_grad() + + def training_step( + self, model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + model.train() + inputs = self._prepare_inputs(inputs) + + with self.autocast_smart_context_manager(): + loss = self.compute_loss(model, inputs) + + loss.backward() + + return loss.detach() + + def get_train_dataloader(self): + pass + + def _get_eval_sampler(self, eval_dataset: Dataset): + pass + + def get_eval_dataloader(self, + eval_dataset: Optional[Dataset]=None) -> DataLoader: + pass + + def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + pass + + def create_optimizer_and_scheduler(self, num_training_steps: int): + pass + + def create_optimizer(self): + pass + + @staticmethod + def get_optimizer_cls_and_kwargs( + args: TrainingArguments) -> Tuple[Any, Any]: + pass + + def create_scheduler(self, + num_training_steps: int, + optimizer: paddle.optim.Optimizer=None): + pass + + def _wrap_model(self, model, training=True): + pass + + def _prepare_input( + self, data: Union[paddle.Tensor, Any]) -> Union[paddle.Tensor, Any]: + pass + + def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]] + ) -> Dict[str, Union[paddle.Tensor, Any]]: + pass + + def autocast_smart_context_manager(self): + pass + + def training_step( + self, model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + pass + + def save_model(self, + output_dir: Optional[str]=None, + _internal_call: bool=False): + pass + + def _save(self, output_dir: Optional[str]=None, state_dict=None): + pass + + def _load_optimizer_and_scheduler(self, checkpoint): + pass + + def evaluate( + self, + eval_dataset: Optional[Dataset]=None, + ignore_keys: Optional[List[str]]=None, + metric_key_prefix: str="eval", ) -> 
Dict[str, float]: + pass + + def predict(self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]]=None, + metric_key_prefix: str="test") -> PredictionOutput: + pass + + def prediction_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]]=None, ) -> Tuple[Optional[ + paddle.Tensor], Optional[paddle.Tensor], Optional[ + paddle.Tensor]]: + pass + + def create_dataloader(self, + dataset, + mode='train', + batch_size=16, + batchify_fn=None, + trans_fn=None, + batched=False): + """ + """ + if trans_fn: + dataset = dataset.map(trans_fn, batched=batched) + + shuffle = True if mode == 'train' else False + if mode == 'train': + batch_sampler = paddle.io.DistributedBatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + else: + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=shuffle) + + return paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + + def train(self, *args, **kwargs): + """ + """ + pass + + def eval(self, *args, **kwargs): + """ + """ + pass + + def prepare_train_config(self): + """ + """ + if self.args.max_steps > 0: + self.args.num_training_steps = self.args.max_steps + self.args.num_train_epochs = math.ceil( + self.args.num_training_steps / len(self.train_dl)) + + else: + self.args.num_training_steps = len( + self.train_dl) * self.args.num_train_epochs + self.args.num_train_epochs = self.args.num_train_epochs + + if self.args.num_training_steps // self.args.valid_steps < self.args.minimum_valid_times: + exp_step = self.args.num_training_steps / self.args.minimum_valid_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Set eval step to %d" % exp_step) + self.args.valid_steps = exp_step + + warmup = self.args.warmup_steps if self.args.warmup_steps > 0 else self.args.warmup_proportion + + self.lr_scheduler = LinearDecayWithWarmup( + self.args.learning_rate, self.args.num_training_steps, warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in self.model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + + self.optimizer = paddle.optimizer.AdamW( + learning_rate=self.lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=self.args.adam_epsilon, + parameters=self.model.parameters(), + weight_decay=self.args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm)) + + def print_config(self): + """ + """ + logger.info('{:^40}'.format("Configuration Arguments")) + logger.info('{:20}:{}'.format("paddle commit id", + paddle.version.commit)) + for arg in vars(self.args): + logger.info('{:20}:{}'.format(arg, getattr(self.args, arg)))
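
Note for reviewers: the sketch below (not part of the patch) illustrates how the refactored trainer hierarchy introduced here is expected to be driven, mirroring the dispatch in finetune.py's do_train(). The dataset name, pretrained model name and hyperparameter values are illustrative assumptions, loosely following the config.yml defaults, rather than values taken from this diff.

```python
# Illustrative usage sketch for the refactored trainers; the dataset name,
# model name and hyperparameters below are assumptions, not part of the patch.
import argparse

import paddle
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModelForTokenClassification, AutoTokenizer

from token_classification import NerTrainer

# Stand-in for the argparse namespace that finetune.py builds from config.yml.
args = argparse.Namespace(
    dataset="peoples_daily_ner",
    max_seq_length=128,
    batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=8,
    logging_steps=10,
    valid_steps=200,
    minimum_valid_times=20,
    max_steps=-1,
    warmup_steps=0,
    warmup_proportion=0.1,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=1.0, )

train_ds, dev_ds, test_ds = load_dataset(
    args.dataset, splits=("train", "dev", "test"))
# NerTrainer reads label_list from args when building its ChunkEvaluator.
args.label_list = train_ds.label_list

tokenizer = AutoTokenizer.from_pretrained("ernie-1.0")
model = AutoModelForTokenClassification.from_pretrained(
    "ernie-1.0", num_classes=len(train_ds.label_list))

if paddle.distributed.get_world_size() > 1:
    model = paddle.DataParallel(model)

# NerTrainer builds its dataloaders, LR schedule and AdamW optimizer in
# __init__ (via TrainerBase.prepare_train_config) and evaluates on dev/test
# every `valid_steps` steps inside train().
trainer = NerTrainer(train_ds, dev_ds, model, tokenizer, args, test_ds=test_ds)
trainer.train()
```

The other trainers follow the same pattern: SeqTrainer also takes a test_ds keyword, while ClueTrainer and MrcTrainer are constructed from the train and dev splits only, matching the branches in do_train() above.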