[5]:
import torch
import numpy as np
import os
import json
from EduNLP.ModelZoo.rnn import ElmoLM
from EduNLP.Pretrain import train_elmo, ElmoTokenizer
from EduNLP.Vector import ElmoModel, T2V
from EduNLP.I2V import Elmo, get_pretrained_i2v
# Disable the wandb logging integration for the training run in this notebook.
# NOTE(review): the recorded training log reports that this environment
# variable is deprecated and will be removed in transformers v5; the log
# recommends the `report_to` training argument (e.g. "none") instead.
os.environ["WANDB_DISABLED"] = "true"
训练自己的Elmo模型¶
1. 数据¶
[6]:
# Set your data path and output path.
# All paths are built from BASE_DIR, which is a relative path and therefore
# resolved against the notebook's current working directory.
BASE_DIR = "../.."
data_dir = f"{BASE_DIR}/static/test_data"  # directory containing the training data file
output_dir = f"{BASE_DIR}/data/pretrain_test_models/elmo/"  # passed to train_elmo(); reused later as the model directory
[7]:
def stem_data(data_path=None):
    """Load training items from a JSON-lines file.

    Parameters
    ----------
    data_path : str, optional
        Path of the file to read, one JSON document per line. When omitted,
        ``standard_luna_data.json`` inside ``data_dir`` is used.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the file object directly instead of materialising every line
        # with readlines(), and skip blank lines (e.g. a trailing newline at
        # the end of the file) that json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]
# Read the training items from the JSON-lines data file under data_dir.
train_items = stem_data()
2. 训练和评估¶
[8]:
# Custom training arguments handed to train_elmo().
# NOTE(review): emb_dim / hid_dim / batch_size / epochs / lr / device are
# intentionally left unset here (they were commented out in this cell) --
# presumably superseded by the keys below; confirm before re-adding them.
train_params = dict(
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=50,
    save_total_limit=2,
    logging_steps=5,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
)
# Kept as the final bare expression so the notebook displays the return value.
train_elmo(train_items, output_dir, train_params=train_params)
Model config PretrainedConfig {
"architecture": "ElmoLM",
"batch_first": true,
"dropout_rate": 0.5,
"embedding_dim": 300,
"hidden_size": 300,
"num_layers": 2,
"transformers_version": "4.18.0",
"use_pack_pad": false,
"vocab_size": 305
}
Model config PretrainedConfig {
"architecture": "ElmoLMForPreTraining",
"batch_first": true,
"dropout_rate": 0.5,
"embedding_dim": 300,
"hidden_size": 300,
"transformers_version": "4.18.0",
"use_pack_pad": false,
"vocab_size": 305
}
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
/home/qlh/anaconda3/envs/dev/lib/python3.6/site-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
***** Running training *****
Num examples = 25
Num Epochs = 1
Instantaneous batch size per device = 8
Total train batch size (w. parallel, distributed & accumulation) = 16
Gradient Accumulation steps = 1
Total optimization steps = 2
/home/qlh/anaconda3/envs/dev/lib/python3.6/site-packages/torch/nn/modules/rnn.py:662: UserWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters(). (Triggered internally at /pytorch/aten/src/ATen/native/cudnn/RNN.cpp:915.)
self.dropout, self.training, self.bidirectional, self.batch_first)
/home/qlh/anaconda3/envs/dev/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
warnings.warn('Was asked to gather along dimension 0, but all '
[2/2 00:00, Epoch 1/1]
| Step | Training Loss |
|---|
Training completed. Do not forget to share your model on huggingface.co/models =)
Saving model checkpoint to ../../data/pretrain_test_models/elmo/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Configuration saved in ../../data/pretrain_test_models/elmo/config.json
[8]:
'../../data/pretrain_test_models/elmo/'
3. 使用模型¶
[9]:
# Sample items for the inference demos below: a list of dicts, each holding
# the question text under the 'ques_content' key.
# The trailing backslashes sit inside the string literals and continue each
# string onto the next source line, so these lines must not be re-indented.
test_items = [
{'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
{'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}
]
3.1 直接加载 Tokenizer 和模型¶
[10]:
# Load the model and tokenizer straight from the directory written by training.
pretrained_model_dir = output_dir
model = ElmoLM.from_pretrained(pretrained_model_dir)
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)
# Pass the text extractor by keyword, matching the other tokenizer call in
# this notebook.
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])
# Switch to evaluation mode so dropout (dropout_rate 0.5 in the model config)
# is disabled; otherwise the displayed activations are randomly zeroed and
# differ from run to run.
model.eval()
# Inference only: no autograd graph is needed for the forward pass.
with torch.no_grad():
    outputs = model(**encodes)
# Final bare expression so the notebook displays the model output.
outputs
Model config PretrainedConfig {
"architecture": "ElmoLM",
"batch_first": true,
"dropout_rate": 0.5,
"embedding_dim": 300,
"hidden_size": 300,
"num_layers": 2,
"transformers_version": "4.18.0",
"use_pack_pad": false,
"vocab_size": 305
}
[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.
[10]:
ElmoLMOutput([('pred_forward',
tensor([[[0.0033, 0.0032, 0.0034, ..., 0.0033, 0.0031, 0.0035],
[0.0032, 0.0032, 0.0035, ..., 0.0033, 0.0031, 0.0033],
[0.0035, 0.0031, 0.0034, ..., 0.0032, 0.0033, 0.0033],
...,
[0.0032, 0.0032, 0.0036, ..., 0.0030, 0.0031, 0.0033],
[0.0034, 0.0032, 0.0035, ..., 0.0032, 0.0031, 0.0035],
[0.0034, 0.0031, 0.0032, ..., 0.0033, 0.0031, 0.0033]],
[[0.0034, 0.0030, 0.0034, ..., 0.0033, 0.0032, 0.0034],
[0.0035, 0.0031, 0.0037, ..., 0.0031, 0.0031, 0.0035],
[0.0035, 0.0030, 0.0034, ..., 0.0031, 0.0033, 0.0035],
...,
[0.0032, 0.0032, 0.0032, ..., 0.0032, 0.0032, 0.0034],
[0.0034, 0.0030, 0.0033, ..., 0.0033, 0.0030, 0.0033],
[0.0035, 0.0032, 0.0032, ..., 0.0032, 0.0030, 0.0032]]],
grad_fn=<SoftmaxBackward>)),
('pred_backward',
tensor([[[0.0032, 0.0029, 0.0033, ..., 0.0030, 0.0028, 0.0033],
[0.0032, 0.0031, 0.0034, ..., 0.0031, 0.0029, 0.0033],
[0.0032, 0.0031, 0.0034, ..., 0.0031, 0.0030, 0.0035],
...,
[0.0031, 0.0032, 0.0034, ..., 0.0032, 0.0031, 0.0034],
[0.0034, 0.0033, 0.0035, ..., 0.0031, 0.0030, 0.0034],
[0.0034, 0.0032, 0.0036, ..., 0.0033, 0.0030, 0.0035]],
[[0.0033, 0.0032, 0.0036, ..., 0.0031, 0.0029, 0.0033],
[0.0033, 0.0031, 0.0035, ..., 0.0031, 0.0031, 0.0033],
[0.0034, 0.0031, 0.0033, ..., 0.0030, 0.0031, 0.0032],
...,
[0.0033, 0.0032, 0.0034, ..., 0.0030, 0.0031, 0.0034],
[0.0033, 0.0034, 0.0032, ..., 0.0032, 0.0030, 0.0033],
[0.0034, 0.0033, 0.0033, ..., 0.0032, 0.0032, 0.0033]]],
grad_fn=<SoftmaxBackward>)),
('forward_output',
tensor([[[-0.0528, -0.0000, 0.0000, ..., -0.0000, -0.0000, -0.1094],
[-0.0555, -0.0000, 0.1578, ..., -0.1109, -0.0000, -0.1539],
[-0.0000, -0.0000, 0.1170, ..., -0.1780, -0.0090, -0.0000],
...,
[-0.0000, -0.0493, 0.0206, ..., 0.0145, -0.0501, -0.0000],
[-0.0000, -0.0000, -0.0088, ..., -0.0000, -0.0375, 0.0128],
[-0.0412, -0.1187, 0.0000, ..., -0.0000, 0.0000, 0.0386]],
[[ 0.0173, -0.0000, -0.0000, ..., -0.0000, 0.0000, 0.0000],
[ 0.0000, -0.1261, -0.0141, ..., -0.0000, 0.0211, 0.0752],
[ 0.0000, -0.1159, -0.0309, ..., -0.1112, -0.0282, 0.0501],
...,
[-0.0000, -0.1322, 0.0000, ..., -0.0242, 0.0000, 0.0000],
[ 0.0000, -0.0000, 0.0000, ..., 0.0000, 0.0492, 0.0000],
[ 0.0000, -0.2027, 0.1891, ..., 0.0292, 0.0457, -0.0000]]],
grad_fn=<MulBackward0>)),
('backward_output',
tensor([[[ 0.1090, -0.1446, 0.0000, ..., -0.0652, 0.0701, -0.0444],
[ 0.0911, -0.1078, 0.0514, ..., -0.0000, 0.0735, 0.0000],
[ 0.0000, -0.0000, 0.0000, ..., -0.0463, 0.0000, 0.0000],
...,
[-0.0984, -0.0927, 0.1122, ..., 0.0556, 0.0000, -0.0028],
[-0.0000, -0.0939, 0.0403, ..., 0.0000, 0.0629, -0.0146],
[-0.0000, -0.0779, 0.0000, ..., -0.0000, 0.0000, -0.0187]],
[[-0.0000, -0.0000, 0.0000, ..., -0.0000, 0.0062, 0.0000],
[-0.0000, -0.0482, 0.0000, ..., -0.0000, -0.0000, 0.0713],
[-0.0000, -0.1221, 0.0000, ..., -0.0000, -0.0000, 0.1571],
...,
[ 0.0640, -0.1978, 0.0387, ..., -0.0000, 0.1457, -0.0000],
[ 0.1365, -0.2237, 0.0940, ..., -0.1624, 0.0000, -0.0368],
[ 0.1113, -0.0000, 0.0000, ..., -0.1166, 0.0000, -0.0000]]],
grad_fn=<MulBackward0>))])
3.2 使用I2V向量化¶
[12]:
# Build the I2V wrapper from the model directory produced by training.
# NOTE(review): pretrained_model_dir and output_dir refer to the same path
# here; presumably the tokenizer config is read from that directory -- confirm.
tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir}
i2v = Elmo('elmo', 'elmo', output_dir, tokenizer_kwargs=tokenizer_kwargs)
# A single item can be vectorized (i_vec: item vector, t_vec: per-token vectors)
i_vec, t_vec = i2v(test_items[0], key=lambda x: x["ques_content"])
print(i_vec.shape) # recorded output: torch.Size([1, 600])
print(t_vec.shape) # recorded output: torch.Size([1, 15, 600])
# A list of items can be vectorized as well
i_vec, t_vec = i2v(test_items, key=lambda x: x["ques_content"])
print(i_vec.shape) # recorded output: torch.Size([2, 600])
print(t_vec.shape) # recorded output: torch.Size([2, 25, 600])
Model config PretrainedConfig {
"architecture": "ElmoLM",
"batch_first": true,
"dropout_rate": 0.5,
"embedding_dim": 300,
"hidden_size": 300,
"num_layers": 2,
"transformers_version": "4.18.0",
"use_pack_pad": false,
"vocab_size": 305
}
[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.
/home/qlh/EduNLP/EduNLP/Vector/elmo_vec.py:36: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
(outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
torch.Size([1, 600])
torch.Size([1, 15, 600])
torch.Size([2, 600])
torch.Size([2, 25, 600])
3.3 使用Tokenizer和T2V向量化¶
[14]:
# Load the tokenizer of the model trained earlier
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])
t2v = ElmoModel(pretrained_model_dir)
# Calling t2v directly; the recorded shape is the same as for infer_vector below
i_vec = t2v(encodes)
print(i_vec.shape) # recorded output: torch.Size([2, 600])
print()
# Item-level and token-level vectors can be requested separately
i_vec = t2v.infer_vector(encodes)
t_vec = t2v.infer_tokens(encodes)
print(i_vec.shape) # recorded output: torch.Size([2, 600])
print(t_vec.shape) # recorded output: torch.Size([2, 25, 600])
print()
Model config PretrainedConfig {
"architecture": "ElmoLM",
"batch_first": true,
"dropout_rate": 0.5,
"embedding_dim": 300,
"hidden_size": 300,
"num_layers": 2,
"transformers_version": "4.18.0",
"use_pack_pad": false,
"vocab_size": 305
}
[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.
/home/qlh/EduNLP/EduNLP/Vector/elmo_vec.py:36: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
(outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
torch.Size([2, 600])
torch.Size([2, 600])
torch.Size([2, 25, 600])
3.4 使用EduNLP中公开的预训练模型¶
[ ]:
# Fetch a publicly released pretrained model.
# Note: this rebinds `i2v`, replacing the wrapper built from the locally
# trained model in the earlier cell.
# NOTE(review): presumably model_dir is where the model files are stored or
# looked up -- confirm against get_pretrained_i2v.
pretrained_dir = f"{BASE_DIR}/examples/test_model/elmo"
i2v = get_pretrained_i2v("elmo_test", model_dir=pretrained_dir)
[ ]:
# Vectorize the whole list in one call. The items are dicts, so pass the same
# `key` extractor used by every other call in this notebook; without it the
# raw dicts would be handed on instead of the question text.
i_vec, t_vec = i2v(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
print(t_vec.shape)
print()
# The item-level and token-level representations can also be requested separately
i_vec = i2v.infer_item_vector(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
t_vec = i2v.infer_token_vector(test_items, key=lambda x: x['ques_content'])
print(t_vec.shape)
print()
# Likewise, a single item can be vectorized
i_vec, t_vec = i2v(test_items[0], key=lambda x: x['ques_content'])
print(i_vec.shape)
print(t_vec.shape)