Source code for EduNLP.Pretrain.hugginface_utils

import os
import json
from transformers.file_utils import TensorType
from transformers import AutoTokenizer
from typing import List, Optional, Union, Tuple
from ..SIF import EDU_SPYMBOLS
from ..Tokenizer import get_tokenizer


class TokenizerForHuggingface(object):
    """
    Wrap a Hugging Face tokenizer together with an optional text pre-tokenizer.

    Parameters
    ----------
    pretrained_model: str
        name or directory handed to ``AutoTokenizer.from_pretrained``
    max_length: int
        maximum sequence length used for truncation in ``__call__``
    tokenize_method: str or None
        Which text tokenizer to use. Must be consistent with TOKENIZER dictionary.
        ``None`` disables pre-tokenization.
    add_specials: bool or list of str
        Whether to add tokens like [FIGURE], [TAG], etc.
        ``True`` adds ``EDU_SPYMBOLS``; a list adds ``EDU_SPYMBOLS`` plus the listed tokens;
        ``False`` adds nothing.
    **kwargs
        forwarded to ``get_tokenizer`` and recorded in ``self.config``

    Examples
    ----------
    >>> tokenizer = TokenizerForHuggingface(add_special_tokens=True)
    >>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
    >>> token_item = tokenizer(item)
    >>> print(token_item.input_ids[:10])
    tensor([[ 101, 1062, 2466, 1963, 1745,  138,  100,  140,  166,  117,  167, 5276,
             3338, 3340,  816, 1062, 2466,  102,  168,  134,  166,  116,  128,  167,
             3297, 1920,  966,  138,  100,  140,  102]])
    >>> print(tokenizer.tokenize(item)[:10])
    ['公', '式', '如', '图', '[', '[UNK]', ']', 'x', ',', 'y']
    >>> items = [item, item]
    >>> token_items = tokenizer(items, return_tensors='pt')
    >>> print(token_items.input_ids.shape)
    torch.Size([2, 31])
    >>> print(len(tokenizer.tokenize(items)))
    2
    >>> tokenizer.save_pretrained('test_dir') # doctest: +SKIP
    >>> tokenizer = TokenizerForHuggingface.from_pretrained('test_dir') # doctest: +SKIP
    """

    def __init__(self, pretrained_model="bert-base-chinese", max_length=512,
                 tokenize_method: str = "pure_text", add_specials: Union[List[str], bool] = False, **kwargs):
        # Record the caller's arguments *as given*. Storing the resolved token list instead
        # would make ``from_pretrained`` re-apply EDU_SPYMBOLS on top of it (and would turn a
        # saved ``add_specials=False`` into "add EDU_SPYMBOLS" on reload).
        config = {
            "max_length": max_length,
            "tokenize_method": tokenize_method,
            "add_specials": add_specials if isinstance(add_specials, bool) else list(add_specials),
        }
        config.update(kwargs)

        self._set_basic_tokenizer(tokenize_method, **kwargs)

        if isinstance(add_specials, bool):
            specials = list(EDU_SPYMBOLS) if add_specials else []
        else:
            # dict.fromkeys de-duplicates while keeping first-seen order, so configs that
            # already contain EDU_SPYMBOLS do not register them twice.
            specials = list(dict.fromkeys(list(EDU_SPYMBOLS) + list(add_specials)))

        self._special_tokens = set()
        self.max_length = max_length
        self.bert_tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.add_specials(specials)
        self.config = config

    def __call__(self, items: Union[list, str, dict], key=lambda x: x, padding=True,
                 return_tensors: Optional[Union[str, "TensorType", bool]] = True, **kwargs):
        """
        Pre-tokenize and encode one item or a list of items.

        Parameters
        ----------
        items: list, str or dict
            a single item or a list of items
        key: function
            determine how to get the text of each item
        padding:
            passed through to the underlying tokenizer
        return_tensors: str, TensorType or bool
            ``True`` is mapped to ``"pt"`` and ``False`` to ``None``; other values are passed through
        **kwargs
            accepted but not used by this method

        Returns
        -------
        the encoding produced by the underlying tokenizer, truncated to ``self.max_length``
        """
        if isinstance(items, list):
            text = [self._pre_tokenize(key(i)) for i in items]
        else:
            text = self._pre_tokenize(key(items))
        if isinstance(return_tensors, bool):
            return_tensors = "pt" if return_tensors else None
        return self.bert_tokenizer(text, truncation=True, padding=padding,
                                   max_length=self.max_length, return_tensors=return_tensors)

    def __len__(self):
        """Return ``len()`` of the underlying tokenizer."""
        return len(self.bert_tokenizer)

    def _set_basic_tokenizer(self, tokenize_method: str = None, **kwargs):
        """Create the text pre-tokenizer, or set it to None when no method is given."""
        self.tokenize_method = tokenize_method
        if tokenize_method is None:
            self.text_tokenizer = None
        else:
            self.text_tokenizer = get_tokenizer(tokenize_method, **kwargs)

    def _pre_tokenize(self, text: Union[str, dict]):
        """Run the text pre-tokenizer (if any) and join its tokens with single spaces."""
        if self.text_tokenizer is not None:
            text = " ".join(self.text_tokenizer._tokenize(text))
        return text

    def tokenize(self, items: Union[list, str, dict], key=lambda x: x, **kwargs):
        """
        Tokenize one item, or every item of a list.

        Parameters
        ----------
        items: list, str or dict
            a single item or a list of items
        key: function
            determine how to get the text of each item
        **kwargs
            accepted but not used by this method

        Returns
        -------
        a token list for a single item, or a list of token lists for a list of items
        """
        if isinstance(items, list):
            return [self._tokenize(key(i)) for i in items]
        return self._tokenize(key(items))

    def encode(self, items: Union[str, dict, List[str], List[dict]], key=lambda x: x, **kwargs):
        """
        Encode item(s) with the underlying tokenizer's ``encode``.

        Note that, unlike ``__call__``, the text is not pre-tokenized here.

        Parameters
        ----------
        items: str, dict, or list of them
        key: function
            determine how to get the text of each item
        **kwargs
            forwarded to the underlying ``encode``
        """
        if isinstance(items, (str, dict)):
            return self.bert_tokenizer.encode(key(items), **kwargs)
        return [self.bert_tokenizer.encode(key(item), **kwargs) for item in items]

    def decode(self, token_ids: list, key=lambda x: x, **kwargs):
        """
        Decode one id sequence, or a list of id sequences.

        Parameters
        ----------
        token_ids: list
            a sequence of ids, or a list whose first element is a list (treated as a batch)
        key: function
            applied to each id sequence before decoding
        **kwargs
            forwarded to the underlying ``decode``
        """
        # len() rather than truthiness: an empty list must not be indexed, and tensor-like
        # inputs do not support a plain boolean test.
        if len(token_ids) > 0 and isinstance(token_ids[0], list):
            return [self.bert_tokenizer.decode(key(item), **kwargs) for item in token_ids]
        return self.bert_tokenizer.decode(key(token_ids), **kwargs)

    def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
        """Pre-tokenize a single item and split it with the underlying tokenizer."""
        item = self._pre_tokenize(key(item))
        return self.bert_tokenizer.tokenize(item, **kwargs)

    @classmethod
    def from_pretrained(cls, tokenizer_config_dir, **kwargs):
        """
        Build a tokenizer from a directory written by ``save_pretrained``.

        Settings found in ``custom_config.json`` (if the file exists) are used as constructor
        arguments; explicit ``kwargs`` override them.
        """
        custom_config_dir = os.path.join(tokenizer_config_dir, 'custom_config.json')
        custom_config = {}
        if os.path.exists(custom_config_dir):
            with open(custom_config_dir, 'r', encoding='utf-8') as f:
                custom_config = json.load(f)
        custom_config.update(kwargs)
        return cls(tokenizer_config_dir, **custom_config)

    def save_pretrained(self, tokenizer_config_dir):
        """Save the underlying tokenizer and write ``self.config`` to ``custom_config.json``."""
        self.bert_tokenizer.save_pretrained(tokenizer_config_dir)
        with open(os.path.join(tokenizer_config_dir, 'custom_config.json'), 'w', encoding='utf-8') as f:
            json.dump(self.config, f, indent=2)

    @property
    def vocab_size(self):
        """``len()`` of the underlying tokenizer."""
        return len(self.bert_tokenizer)

    def set_vocab(self, items: Union[List[str], List[dict]], key=lambda x: x, lower=False,
                  trim_min_count: int = 1, do_tokenize: bool = True):
        """
        Count the words of ``items`` and add the frequent ones to the tokenizer.

        Parameters
        -----------
        items: list
            can be the list of str, or list of dict
        key: function
            determine how to get the text of each item
        lower: bool, optional
            whether to lowercase each word before counting, by default False
        trim_min_count : int, optional
            the lower bound number for adding a word into vocabulary, by default 1
        do_tokenize : bool, optional
            whether to tokenize items before updating vocab, by default True;
            when False, ``key(item)`` is iterated directly

        Returns
        -------
        remain_tokens: list of str
            words whose count reached ``trim_min_count``, in first-seen order
        added_num:
            the value returned by ``add_tokens`` for those words
        """
        word2cnt = dict()
        for item in items:
            tokens = self._pre_tokenize(key(item)).split() if do_tokenize else key(item)
            for word in tokens:
                if lower:
                    word = word.lower()
                word2cnt[word] = word2cnt.get(word, 0) + 1
        remain_tokens = [w for w, c in word2cnt.items() if c >= trim_min_count]
        added_num = self.add_tokens(remain_tokens)
        return remain_tokens, added_num

    def add_specials(self, added_spectials: List[str]):
        """
        Register special tokens on the underlying tokenizer.

        The tokens are also remembered in ``self._special_tokens``. Returns whatever the
        underlying ``add_special_tokens`` call returns.
        """
        self._special_tokens.update(added_spectials)
        return self.bert_tokenizer.add_special_tokens({'additional_special_tokens': added_spectials})

    def add_tokens(self, added_tokens: List[str]):
        """Add ordinary tokens to the underlying tokenizer and return its result."""
        return self.bert_tokenizer.add_tokens(added_tokens)