[1]:
from EduNLP.Pretrain import PretrainedEduTokenizer, EduDataset
import os
import json


# Base directory that every other path in this notebook is built from.
BASE_DIR = "../.."
# Directory that stem_data() reads the sample question file from.
data_dir = BASE_DIR + "/static/test_data"
# Directory the tokenizer and the preprocessed dataset are saved to.
output_dir = BASE_DIR + "/data/pretrain_test_models/pretrain/"


def stem_data(path=None):
    """Load question items from a JSON-lines file.

    Parameters
    ----------
    path : str, optional
        File to read, holding one JSON document per line. When omitted,
        ``standard_luna_data.json`` inside ``data_dir`` is used.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if path is None:
        path = os.path.join(data_dir, "standard_luna_data.json")
    with open(path, encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines(),
        # and skip blank lines (e.g. a trailing empty line), which json.loads
        # would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

# Training corpus: every item returned by stem_data().
train_items = stem_data()

# Two hand-written questions, each a dict with a single 'ques_content' key,
# used below to exercise the tokenizer.
# NOTE: the trailing backslashes inside the string literals are line
# continuations, so each literal is a single string and the leading spaces of
# the continued lines are part of its content.
test_items = [
    {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
            如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
            若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
            若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}
]
/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)

1. PretrainedEduTokenizer

该类主要用于处理预训练模型的输入语料,主要成分包括词表(vocab)和基础令牌化容器,负责将输入语料处理为适合模型的输入格式。

1.1 构造令牌化容器

[2]:
# The vocabulary corpus is the training items followed by the test items.
corpus_items = train_items + test_items

# Options forwarded as `text_params`; they customise the pure_text tokenizer
# (see Tokenizer/PureTextTokenizer).
text_params = dict(granularity="char", stopwords=None)
# Options for the PretrainedEduTokenizer itself.
tokenizer_params = dict(add_specials=True, tokenize_method="pure_text")

tokenizer = PretrainedEduTokenizer(text_params=text_params, **tokenizer_params)
# Tokenizer length before the corpus vocabulary is set.
print(len(tokenizer))


# Set the vocabulary from the question text of every corpus item.
tokenizer.set_vocab(corpus_items, key=lambda item: item['ques_content'])
# Tokenizer length after the corpus vocabulary is set.
print(len(tokenizer))

# Save the tokenizer so that later cells can reload it from disk.
pretrained_tokenizer_dir = output_dir
tokenizer.save_pretrained(pretrained_tokenizer_dir)
14
Dump cache file failed.
Traceback (most recent call last):
  File "/data/qlh/anaconda3/envs/py36/lib/python3.6/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpk245c2ok' -> '/tmp/jieba.cache'
379

1.2 使用令牌化容器

[3]:
# Reload the tokenizer from the directory it was saved to.
tokenizer = PretrainedEduTokenizer.from_pretrained(pretrained_tokenizer_dir)


def _stem(item):
    """Return the question text of one item."""
    return item['ques_content']


# Keyword arguments shared by every fixed-length call below.
_fixed_len = dict(padding="max_length", max_length=100)

# Padding per batch (no explicit padding arguments).
encodes = tokenizer(test_items, key=_stem)
print(list(encodes.keys()))
print(encodes["seq_idx"].shape)
print()

# Padding up to max_length.
encodes = tokenizer(test_items, key=_stem, **_fixed_len)
print(list(encodes.keys()))
print(encodes["seq_idx"].shape)
print()

# Do not return tensors.
encodes = tokenizer(test_items, key=_stem, return_tensors=False, **_fixed_len)
print(encodes["seq_idx"])
print()

# Keep the tokens in the result as well.
encodes = tokenizer(test_items, key=_stem, return_text=True, **_fixed_len)
print(list(encodes.keys()))
print()
['seq_idx', 'seq_len']
torch.Size([2, 17])

['seq_idx', 'seq_len']
torch.Size([2, 100])

[[1, 1, 1, 6, 22, 35, 130, 1, 9, 45, 19, 22, 46, 211, 130, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 6, 22, 35, 130, 1, 9, 45, 19, 22, 46, 211, 130, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

['seq_idx', 'seq_len', 'seq_token']

1.3 其他操作

扩充词表

[4]:
# Show the special tokens currently registered in the vocabulary.
# NOTE(review): this reads the private attribute `_special_tokens`; prefer a
# public accessor if the vocab class offers one — TODO confirm.
print(tokenizer.vocab._special_tokens)
print()

# Add a special token, then show how it is tokenized and the updated list.
tokenizer.add_specials(["[special]"])
print(tokenizer.tokenize("[special]"))
print(tokenizer.vocab._special_tokens)
print()

# Add an ordinary token and show how it is tokenized.
tokenizer.add_tokens(["[token]"])
print(tokenizer.tokenize("[token]"))
['[PAD]', '[UNK]', '[BOS]', '[EOS]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]']

['special']
['[PAD]', '[UNK]', '[BOS]', '[EOS]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[TEXT]', '[FORMULA]', '[FIGURE]', '[MARK]', '[TAG]', '[SEP]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', '[FORMULA_END]', '[special]']

['token']

编码/解码 句子

[5]:
# Encode a sentence and print the result.
encode_idxs = tokenizer.encode('公式 公 式')
print(encode_idxs)

# Decode the encoded result back and print it.
encode_tokens = tokenizer.decode(encode_idxs)
print(encode_tokens)
[1, 370, 371]
['[UNK]', '公', '式']

修改基础令牌化容器

[6]:
# Customisable parameters passed as `formula_params` for the "ast_formula" base tokenizer.
formula_params = {
    "skip_figure_formula": True,
    "symbolize_figure_formula": False
}

# Replace the base tokenizer with "ast_formula".
# NOTE(review): `_set_basic_tokenizer` is a private method; confirm whether a
# public equivalent exists.
tokenizer._set_basic_tokenizer("ast_formula", formula_params=formula_params)

保存与加载

[7]:
# Save the tokenizer to a local directory.
save_dir = "./tmp"
tokenizer.save_pretrained(save_dir)

# Load it back from the same directory.
tokenizer = PretrainedEduTokenizer.from_pretrained(save_dir)

EduDataset

直接使用

[8]:
# Build an EduDataset from the training items, using 'ques_content' as the
# stem key, and print the keys of its first sample.
dataset = EduDataset(tokenizer, items=train_items,
                     stem_key="ques_content")
print(dataset[0].keys())
dict_keys(['seq_idx', 'seq_len'])
[9]:
# Same as above, additionally passing label_key="difficulty"; in the recorded
# run the first sample then also has a 'labels' key.
dataset = EduDataset(tokenizer, items=train_items,
                     stem_key="ques_content", label_key="difficulty")
print(dataset[0].keys())
dict_keys(['labels', 'seq_idx', 'seq_len'])
[10]:
# Additionally pass feature_keys=["know_list"]; in the recorded run the first
# sample then also has a 'know_list' key.
dataset = EduDataset(tokenizer, items=train_items,
                     stem_key="ques_content", label_key="difficulty", feature_keys=["know_list"])
print(dataset[0].keys())
dict_keys(['know_list', 'labels', 'seq_idx', 'seq_len'])

保存与加载

考虑到预处理耗时久,若希望下次能直接使用处理后的数据,可将预处理后的数据保存在本地。

[11]:
# Save the preprocessed dataset under output_dir.
# NOTE(review): the following cell repeats this exact call.
dataset.to_disk(output_dir)
[12]:
# Save the preprocessed dataset to disk.
dataset.to_disk(output_dir)

# Load it back from disk, first without and then with label / feature keys,
# printing the keys of the first sample each time.
dataset1 = EduDataset(tokenizer, ds_disk_path=output_dir)
print(dataset1[0].keys())

dataset2 = EduDataset(tokenizer, ds_disk_path=output_dir, label_key="difficulty", feature_keys=["know_list"])
print(dataset2[0].keys())
dict_keys(['seq_idx', 'seq_len'])
dict_keys(['know_list', 'labels', 'seq_idx', 'seq_len'])

并行预处理

在题目数据量过大时,令牌化等预处理操作耗时较长,可通过并行处理加速。

[13]:
import time


def _timed_dataset(**dataset_kwargs):
    """Build an EduDataset over ``train_items*100`` and report the elapsed time.

    Prints the keys of the first sample, then the elapsed time measured with
    ``time.perf_counter()``, the clock intended for measuring short intervals
    (unlike ``time.time()``, it is unaffected by system clock adjustments).

    Parameters
    ----------
    **dataset_kwargs
        Extra keyword arguments forwarded to ``EduDataset``
        (e.g. ``num_processor``).

    Returns
    -------
    EduDataset
        The dataset that was built.
    """
    start = time.perf_counter()
    built = EduDataset(tokenizer, items=train_items*100,
                       stem_key="ques_content",
                       **dataset_kwargs)
    print(built[0].keys())
    # Message text kept unchanged so it matches the recorded output.
    print(f"spand time: {(time.perf_counter() - start):.4}s")
    return built


# With parallel preprocessing
dataset = _timed_dataset(num_processor=4)

# Without parallel preprocessing
dataset = _timed_dataset()


dict_keys(['seq_idx', 'seq_len'])
spand time: 1.641s
dict_keys(['seq_idx', 'seq_len'])
spand time: 4.484s