[1]:
from EduNLP.Pretrain import TokenizerForHuggingface
/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
Huggingface通用化接口¶
[2]:
# Build the EduNLP wrapper on top of the pretrained "bert-base-chinese" vocabulary.
# NOTE(review): add_specials=True presumably registers the [FORMULA]/[FIGURE]/[MARK]-style
# markers that appear in the outputs below — confirm against the EduNLP documentation.
tokenizer = TokenizerForHuggingface(
    "bert-base-chinese",
    add_specials=True,
    tokenize_method="ast_formula",
)
基础用法¶
[3]:
# Sample item mixing plain text, formula/figure placeholders and inline LaTeX.
# NOTE(review): the literal is continued with backslashes *inside* the quotes, so any
# leading whitespace on the continuation lines becomes part of the string — keep those
# lines flush left (or switch to adjacent string literals) when editing.
text = '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'
# tokenize() returns the token strings only.
tokens = tokenizer.tokenize(text)
print(tokens)
# Calling the tokenizer directly returns the input_ids / token_type_ids / attention_mask
# tensors shown in the output below.
encodes = tokenizer(text)
print(encodes)
Dump cache file failed.
Traceback (most recent call last):
File "/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/jieba/__init__.py", line 154, in initialize
_replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpgu196tfc' -> '/tmp/jieba.cache'
['公', '式', '[FORMULA]', '公', '式', '[FORMULA]', '如', '图', '[FIGURE]', 'ma', '##th', '##or', '##d', '_', '0', ',', 'ma', '##th', '##or', '##d', '_', '1', '约', '束', '条', '件', '[SEP]', 'ma', '##th', '##or', '##d', '_', '2', '=', 'ma', '##th', '##or', '##d', '_', '0', '+', 'text', '##or', '##d', 'ma', '##th', '##or', '##d', '_', '1', '最', '大', '值', '[MARK]']
{'input_ids': tensor([[ 101, 1062, 2466, 21129, 1062, 2466, 21129, 1963, 1745, 21130,
9622, 8414, 8372, 8168, 142, 121, 117, 9622, 8414, 8372,
8168, 142, 122, 5276, 3338, 3340, 816, 102, 9622, 8414,
8372, 8168, 142, 123, 134, 9622, 8414, 8372, 8168, 142,
121, 116, 10539, 8372, 8168, 9622, 8414, 8372, 8168, 142,
122, 3297, 1920, 966, 21131, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1]])}
分词¶
[4]:
# One-step tokenization; the extra newline separates it from the two-step version below.
tokens = tokenizer.tokenize(text)
print(tokens, end="\n\n")

# Equivalent to the following two explicit steps:
# 1) pre-tokenize the item into a space-separated string,
pre_tokens = tokenizer._pre_tokenize(text)
print(pre_tokens)
# 2) hand that string to the underlying BERT tokenizer.
tokens = tokenizer.bert_tokenizer.tokenize(pre_tokens)
print(tokens)
['公', '式', '[FORMULA]', '公', '式', '[FORMULA]', '如', '图', '[FIGURE]', 'ma', '##th', '##or', '##d', '_', '0', ',', 'ma', '##th', '##or', '##d', '_', '1', '约', '束', '条', '件', '[SEP]', 'ma', '##th', '##or', '##d', '_', '2', '=', 'ma', '##th', '##or', '##d', '_', '0', '+', 'text', '##or', '##d', 'ma', '##th', '##or', '##d', '_', '1', '最', '大', '值', '[MARK]']
公式 [FORMULA] 公式 [FORMULA] 如图 [FIGURE] mathord_0 , mathord_1 约束条件 [SEP] mathord_2 = mathord_0 + textord mathord_1 最大值 [MARK]
['公', '式', '[FORMULA]', '公', '式', '[FORMULA]', '如', '图', '[FIGURE]', 'ma', '##th', '##or', '##d', '_', '0', ',', 'ma', '##th', '##or', '##d', '_', '1', '约', '束', '条', '件', '[SEP]', 'ma', '##th', '##or', '##d', '_', '2', '=', 'ma', '##th', '##or', '##d', '_', '0', '+', 'text', '##or', '##d', 'ma', '##th', '##or', '##d', '_', '1', '最', '大', '值', '[MARK]']
[14]:
# Round-trip a single special token: encode it to ids, then decode the ids back to text.
# NOTE(review): this cell's recorded execution count (14) is out of sequence with its
# neighbours — re-run the notebook with "Restart & Run All" before sharing.
encode_idxs = tokenizer.encode("[FIGURE]")
print(encode_idxs)
# The decoded text includes the [CLS] / [SEP] wrappers (see the output below).
encode_tokens = tokenizer.decode(encode_idxs)
print(encode_tokens)
[101, 21130, 102]
[CLS] [FIGURE] [SEP]
扩充词表¶
直接新增单词¶
[6]:
# Add "[python]" to the vocabulary; in the output below it comes back as a single token.
tokenizer.add_tokens(["[python]"])
# Last expression of the cell, so its value is displayed as the cell output.
tokenizer.bert_tokenizer.tokenize("[python] is a coding language")
[6]:
['[python]', 'is', 'a', 'co', '##ding', 'language']
[8]:
# Special-token set before the addition (followed by a blank separator line).
print(tokenizer._special_tokens, end="\n\n")

# Pass the new special tokens as a list of strings.
# NOTE(review): the recorded output already contains the single characters
# '[', 'n', 'e', 'w', ']' *before* this call — presumably left over from an earlier run
# that passed a bare string instead of a list. Re-run from a fresh kernel to confirm
# and refresh the saved output.
tokenizer.add_specials(["[new]"])
print(tokenizer._special_tokens)
{'[FIGURE]', '[FORMULA_END]', '[FORMULA]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[TEXT]', '[', 'n', '[TAG]', 'e', ']', 'w', '[FORMULA_BEGIN]', '[MARK]'}
{'[FIGURE]', '[FORMULA_END]', '[FORMULA]', '[new]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[TEXT]', '[', 'n', '[TAG]', 'e', ']', 'w', '[FORMULA_BEGIN]', '[MARK]'}
批量设置语料库词表¶
根据原始文本更新词表
[9]:
# Corpus given as raw item text (one string per item).
vocab_sentences = [
    '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'
]

# do_tokenize=True because the corpus above is untokenized text.
# NOTE(review): judging by the names and the output, the call returns the retained tokens
# and the number of tokens newly added to the vocabulary — confirm against the EduNLP docs.
remain_tokens, added_num = tokenizer.set_vocab(
    vocab_sentences,
    lower=False,
    trim_min_count=1,
    do_tokenize=True,
)
print(remain_tokens)
print(added_num)
['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '[SEP]', 'mathord_2', '=', '+', 'textord', '最大值', '[MARK]']
8
根据分词序列更新词表
[10]:
# Corpus given as already-tokenized items (one list of token strings per item).
vocab_tokens = [
    ['公式', '[FORMULA]', '公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]']
]

# do_tokenize=False because the input is already split into tokens.
# The recorded added count is 0 here — presumably because the previous cell already
# added these tokens; verify on a fresh kernel.
remain_tokens, added_num = tokenizer.set_vocab(
    vocab_tokens,
    lower=False,
    trim_min_count=1,
    do_tokenize=False,
)
print(remain_tokens)
print(added_num)
['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '[SEP]', 'mathord_2', '=', '+', 'textord', '最大值', '[MARK]']
0
[11]:
# The whole word is encoded as a single id between [CLS] (101) and [SEP] (102) in the
# output below — presumably because set_vocab added "公式" to the vocabulary.
tokenizer("公式")
[11]:
{'input_ids': tensor([[ 101, 21139, 102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}
[12]:
# With a space between the two characters they are encoded separately (two ids in the
# output below), in contrast to the unspaced word.
tokenizer("公 式")
[12]:
{'input_ids': tensor([[ 101, 1062, 2466, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
保存与加载¶
[13]:
# Save the tokenizer to a local directory (relative to the notebook's working directory).
save_dir = "./tmp"
tokenizer.save_pretrained(save_dir)
# Load it back from the same directory; this rebinds the global `tokenizer`.
tokenizer = TokenizerForHuggingface.from_pretrained(save_dir)