Source code for EduNLP.Pretrain.bert_vec

import os
from typing import List, Union
from transformers import BertForMaskedLM
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from copy import deepcopy

from ..ModelZoo.bert import BertForPropertyPrediction, BertForKnowledgePrediction
from .pretrian_utils import EduDataset
from .hugginface_utils import TokenizerForHuggingface

__all__ = ["BertTokenizer", "BertDataset", "finetune_bert", "finetune_bert_for_property_prediction",
           "finetune_bert_for_knowledge_prediction"]

# Baseline keyword arguments for ``transformers.TrainingArguments``.
# Each finetune_* function in this module deep-copies this mapping, sets
# "output_dir", and then applies the caller's ``train_params`` on top of it.
#
# Options that are intentionally left unset here (kept for reference):
#   per_device_eval_batch_size=32, evaluation_strategy="steps", eval_steps=200,
#   load_best_model_at_end=True, metric_for_best_model="loss",
#   greater_is_better=False, disable_tqdm=True, no_cuda=True
DEFAULT_TRAIN_PARAMS = dict(
    output_dir=None,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=2,
    logging_dir=None,
    logging_steps=5,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
)


class BertTokenizer(TokenizerForHuggingface):
    """
    Tokenizer used by the BERT fine-tuning helpers in this module.

    A plain subclass of ``TokenizerForHuggingface``; it defines no members of its own.

    Examples
    ----------
    >>> tokenizer = BertTokenizer(add_special_tokens=True)
    >>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
    >>> token_item = tokenizer(item)
    >>> print(token_item.input_ids)
    tensor([[ 101, 1062, 2466, 1963, 1745,  138,  100,  140,  166,  117,  167, 5276,
             3338, 3340,  816, 1062, 2466,  102,  168,  134,  166,  116,  128,  167,
             3297, 1920,  966,  138,  100,  140,  102]])
    >>> print(tokenizer.tokenize(item)[:10])
    ['公', '式', '如', '图', '[', '[UNK]', ']', 'x', ',', 'y']
    >>> items = [item, item]
    >>> token_items = tokenizer(items, return_tensors='pt')
    >>> print(token_items.input_ids.shape)
    torch.Size([2, 31])
    >>> print(len(tokenizer.tokenize(items)))
    2
    >>> tokenizer.save_pretrained('test_dir') # doctest: +SKIP
    >>> tokenizer = BertTokenizer.from_pretrained('test_dir') # doctest: +SKIP
    """
class BertDataset(EduDataset):
    """
    Dataset used by the BERT fine-tuning helpers in this module.

    A plain subclass of ``EduDataset``; it defines no members of its own.
    """
def finetune_bert(items: Union[List[dict], List[str]], output_dir: str, pretrained_model="bert-base-chinese",
                  tokenizer_params=None, data_params=None, model_params=None, train_params=None):
    """
    Fine-tune a BERT masked-language model on ``items`` and save the resulting
    model and tokenizer to ``output_dir``.

    Parameters
    ----------
    items: list, required
        The training corpus, each item could be str or dict
    output_dir: str, required
        The directory to save trained model files
    pretrained_model: str, optional
        The pretrained model name or path for model and tokenizer.
        When it is an existing local path the tokenizer is loaded with
        ``BertTokenizer.from_pretrained``; otherwise a new ``BertTokenizer`` is built.
    tokenizer_params: dict, optional, default=None
        The parameters passed to BertTokenizer
    data_params: dict, optional, default=None
        The parameters for BertDataset; only the key ``stem_key`` is read here
    model_params: dict, optional, default=None
        The parameters passed to ``BertForMaskedLM.from_pretrained``
    train_params: dict, optional, default=None
        Overrides applied on top of ``DEFAULT_TRAIN_PARAMS`` and passed to TrainingArguments.
        The extra key ``mlm_probability`` (default 0.15) is taken out and given to the
        masked-language-model data collator instead. The dict passed in is not modified.

    Returns
    ----------
    None

    Examples
    ----------
    >>> stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
    ... "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
    >>> finetune_bert(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
    {'train_runtime': ..., ..., 'epoch': 1.0}
    """
    tokenizer_params = tokenizer_params if tokenizer_params else {}
    data_params = data_params if data_params is not None else {}
    model_params = model_params if model_params is not None else {}
    # Work on a shallow copy: "mlm_probability" is popped below, and popping it
    # from the caller's own dict would silently change later calls that reuse it.
    train_params = dict(train_params) if train_params is not None else {}

    # tokenizer configuration
    if os.path.exists(pretrained_model):
        tokenizer = BertTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
    else:
        work_tokenizer_params = {
            "add_specials": True,
            "tokenize_method": "pure_text",
        }
        # caller-supplied settings take precedence over the defaults above
        work_tokenizer_params.update(tokenizer_params)
        tokenizer = BertTokenizer(pretrained_model, **work_tokenizer_params)
        # TODO: tokenizer.set_vocab()

    # model configuration
    model = BertForMaskedLM.from_pretrained(pretrained_model, **model_params)
    # resize embedding for additional special tokens
    model.resize_token_embeddings(len(tokenizer.bert_tokenizer))

    # dataset configuration
    dataset = BertDataset(tokenizer, items=items,
                          stem_key=data_params.get("stem_key", None))
    # "mlm_probability" is not a TrainingArguments field, so it is removed
    # from the training overrides before they are applied below.
    mlm_probability = train_params.pop('mlm_probability', 0.15)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer.bert_tokenizer, mlm=True, mlm_probability=mlm_probability
    )

    # training configuration
    work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS)
    work_train_params["output_dir"] = output_dir
    # applied last, so an explicit "output_dir" in train_params still wins
    work_train_params.update(train_params)
    train_args = TrainingArguments(**work_train_params)
    trainer = Trainer(
        model=model,
        args=train_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
def finetune_bert_for_property_prediction(train_items,
                                          output_dir,
                                          pretrained_model="bert-base-chinese",
                                          eval_items=None,
                                          tokenizer_params=None,
                                          data_params=None,
                                          train_params=None,
                                          model_params=None
                                          ):
    """
    Fine-tune ``BertForPropertyPrediction`` on ``train_items`` and save the model,
    its config and the tokenizer to ``output_dir``.

    Parameters
    ----------
    train_items: list, required
        The training corpus, each item could be str or dict
    output_dir: str, required
        The directory to save trained model files
    pretrained_model: str, optional
        The pretrained model name or path for model and tokenizer
    eval_items: list, optional, default=None
        The evaluating items, each item could be str or dict;
        no evaluation dataset is built when it is None
    tokenizer_params: dict, optional, default=None
        The parameters passed to ``BertTokenizer.from_pretrained``
    data_params: dict, optional, default=None
        The parameters for BertDataset; the keys read are ``stem_key``
        (default "ques_content") and ``label_key`` (default "difficulty")
    train_params: dict, optional, default=None
        Overrides applied on top of ``DEFAULT_TRAIN_PARAMS`` and passed to TrainingArguments
    model_params: dict, optional, default=None
        The parameters passed to BertForPropertyPrediction
    """
    tokenizer_params = tokenizer_params or {}
    if data_params is None:
        data_params = {}
    if model_params is None:
        model_params = {}
    if train_params is None:
        train_params = {}

    # tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_model, **tokenizer_params)

    # datasets: training and evaluation share the same field configuration
    stem_key = data_params.get("stem_key", "ques_content")
    label_key = data_params.get("label_key", "difficulty")

    def build_dataset(corpus):
        return BertDataset(tokenizer=tokenizer,
                           items=corpus,
                           stem_key=stem_key,
                           label_key=label_key)

    train_dataset = build_dataset(train_items)
    eval_dataset = None if eval_items is None else build_dataset(eval_items)

    # model; the embedding matrix is resized to the tokenizer's vocabulary size
    model = BertForPropertyPrediction(pretrained_model, **model_params)
    model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))

    # training arguments: module defaults, then output_dir, then caller overrides
    train_kwargs = deepcopy(DEFAULT_TRAIN_PARAMS)
    train_kwargs["output_dir"] = output_dir
    train_kwargs.update(train_params)
    train_args = TrainingArguments(**train_kwargs)

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorWithPadding(tokenizer.bert_tokenizer),
    )
    trainer.train()

    # persist the weights, the model config and the tokenizer side by side
    trainer.save_model(output_dir)
    trainer.model.save_config(output_dir)
    tokenizer.save_pretrained(output_dir)
def finetune_bert_for_knowledge_prediction(train_items,
                                           output_dir,
                                           pretrained_model="bert-base-chinese",
                                           eval_items=None,
                                           tokenizer_params=None,
                                           data_params=None,
                                           train_params=None,
                                           model_params=None
                                           ):
    """
    Fine-tune ``BertForKnowledgePrediction`` on ``train_items`` and save the model,
    its config and the tokenizer to ``output_dir``.

    Parameters
    ----------
    train_items: list, required
        The training corpus, each item could be str or dict
    output_dir: str, required
        The directory to save trained model files
    pretrained_model: str, optional
        The pretrained model name or path for model and tokenizer
    eval_items: list, optional, default=None
        The evaluating items, each item could be str or dict;
        no evaluation dataset is built when it is None
    tokenizer_params: dict, optional, default=None
        The parameters passed to ``BertTokenizer.from_pretrained``
    data_params: dict, optional, default=None
        The parameters for BertDataset; the keys read are ``stem_key``
        (default "ques_content") and ``label_key`` (default "know_list")
    train_params: dict, optional, default=None
        Overrides applied on top of ``DEFAULT_TRAIN_PARAMS`` and passed to TrainingArguments
    model_params: dict, optional, default=None
        The parameters passed to BertForKnowledgePrediction
    """
    tokenizer_params = tokenizer_params or {}
    if data_params is None:
        data_params = {}
    if model_params is None:
        model_params = {}
    if train_params is None:
        train_params = {}

    # tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_model, **tokenizer_params)

    # datasets: training and evaluation share the same field configuration
    stem_key = data_params.get("stem_key", "ques_content")
    label_key = data_params.get("label_key", "know_list")

    def build_dataset(corpus):
        return BertDataset(tokenizer=tokenizer,
                           items=corpus,
                           stem_key=stem_key,
                           label_key=label_key)

    train_dataset = build_dataset(train_items)
    eval_dataset = None if eval_items is None else build_dataset(eval_items)

    # model; the embedding matrix is resized to the tokenizer's vocabulary size
    model = BertForKnowledgePrediction(pretrained_model_dir=pretrained_model, **model_params)
    model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))

    # training arguments: module defaults, then output_dir, then caller overrides
    train_kwargs = deepcopy(DEFAULT_TRAIN_PARAMS)
    train_kwargs["output_dir"] = output_dir
    train_kwargs.update(train_params)
    train_args = TrainingArguments(**train_kwargs)

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorWithPadding(tokenizer.bert_tokenizer),
    )
    trainer.train()

    # persist the weights, the model config and the tokenizer side by side
    trainer.save_model(output_dir)
    trainer.model.save_config(output_dir)
    tokenizer.save_pretrained(output_dir)