PyTorch BERT Text Classification: A Step-by-Step Walkthrough
This post relies mainly on Hugging Face's transformers library; for fuller explanations, consult its documentation.
Defining the Model
Defining the model comes down to three objects: the tokenizer, the config, and the model. If you want it quick and simple, you can just use huggingface's AutoModel classes; the code below uses the explicit Bert* classes. Here cache_dir is the directory the model is downloaded into. Any parameter the model will need later goes into the config: for example, the model used below is BertForSequenceClassification, which needs to know how many labels it predicts, so I set num_labels=3 in the config.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir='./cache_down')
config = BertConfig.from_pretrained('bert-base-chinese', cache_dir='./cache_down', num_labels=3)
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', cache_dir='./cache_down', config=config)
If the pretrained weights have already been downloaded, use the approach below instead. Note that the three downloaded files must be named config.json, pytorch_model.bin, and vocab.txt, or from_pretrained will not find them.
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
model = BertForSequenceClassification.from_pretrained(args.model_name_or_path, config=config)
from_pretrained is quite powerful: it can even load TensorFlow pretrained .ckpt checkpoints, with just one extra argument:
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
config = BertConfig.from_json_file('bert_config.json')
model = BertForSequenceClassification.from_pretrained('bert_model.ckpt', from_tf=True, config=config)
Reading the Data
With TensorFlow the usual recipe is to adapt a data-reading class, DataProcessor: change how the data is read inside it and then just run run_classifier.py (roughly speaking; I have never run BERT under TensorFlow, so experts please skip ahead). The transformers examples also use a reader class. I have pulled it apart below to make it easier to follow.
- Use a class to hold each example
class InputExample(object):
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
- Write a function that reads the json/excel/csv file into examples
def read_examples(input_file, is_training, sep=','):
    # is_training and sep are kept for interface compatibility; this version reads excel
    df = pd.read_excel(input_file)
    examples = []
    for val in df[['idx', 'text_a', 'text_b', 'label']].values:
        examples.append(InputExample(guid=val[0], text_a=val[1], text_b=val[2], label=val[3]))
    return examples
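For reference, read_examples expects a sheet with columns idx, text_a, text_b, and label. A quick, purely illustrative usage (the file path is an assumption):
'''hypothetical usage; ./data/train.xlsx is an assumed path'''
train_examples = read_examples('./data/train.xlsx', is_training=True)
print(len(train_examples), train_examples[0].guid, train_examples[0].label)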
- Write a function that converts the examples into the features BERT needs (input_ids: the id of each token; input_mask: short sequences are padded and the padding is masked out with 0s; segment_ids: distinguishes the two sentences, as in the NSP task)
def convert_examples_to_features(examples, tokenizer, max_seq_length, split_num, is_training):
    features = []
    for example_index, example in enumerate(examples):
        context_tokens = tokenizer.tokenize(example.text_a)
        ending_tokens = tokenizer.tokenize(example.text_b)
        # long texts are cut into split_num chunks, each paired with text_b
        skip_len = len(context_tokens) / split_num
        choices_features = []
        index_end = split_num - 1
        if example_index < 1 and is_training:
            logger.info("** RAW EXAMPLE **")
            logger.info("content: {}".format(context_tokens))
        for i in range(split_num):
            if i != index_end:
                context_tokens_choice = context_tokens[int(i * skip_len):int((i + 1) * skip_len)]
            else:
                # the last chunk is taken from the tail of the document
                context_tokens_choice = context_tokens[-int(i * skip_len):]
            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3, i == index_end)
            tokens = ["[CLS]"] + ending_tokens + ["[SEP]"] + context_tokens_choice + ["[SEP]"]
            segment_ids = [0] * (len(ending_tokens) + 2) + [1] * (len(context_tokens_choice) + 1)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            # zero-pad everything up to max_seq_length
            padding_length = max_seq_length - len(input_ids)
            input_ids += [0] * padding_length
            input_mask += [0] * padding_length
            segment_ids += [0] * padding_length
            choices_features.append((tokens, input_ids, input_mask, segment_ids))
        label = example.label
        if example_index < 1 and is_training:
            logger.info("*** Example ***")
            logger.info("idx: {}".format(example_index))
            logger.info("guid: {}".format(example.guid))
            logger.info("tokens: {}".format(' '.join(tokens).replace('\u2581', '_')))
            logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
            logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
            logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
            logger.info("label: {}".format(label))
        features.append(
            InputFeatures(
                example_id=example.guid,
                choices_features=choices_features,
                label=label
            )
        )
    return features
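Two helpers referenced above are not included in the excerpt, so here is a minimal guess at both. InputFeatures is just a container for the fields accessed elsewhere, and _truncate_seq_pair follows the helper from Google's original BERT repo, extended with the fourth flag the author passes (how that flag is actually used is my assumption; here it truncates the context from the front on the last chunk, so the tail of the document is kept):
class InputFeatures(object):
    '''container matching the fields accessed elsewhere in this post (an assumption)'''
    def __init__(self, example_id, choices_features, label):
        self.example_id = example_id
        self.choices_features = choices_features
        self.label = label

def _truncate_seq_pair(tokens_a, tokens_b, max_length, is_last_chunk=False):
    '''trim the longer sequence token by token until the pair fits;
    the is_last_chunk behaviour is a guess, not the author's exact code'''
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            if is_last_chunk:
                tokens_a.pop(0)  # keep the tail of the last chunk
            else:
                tokens_a.pop()
        else:
            tokens_b.pop()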
- We now have data in a form BERT can train on. Since training is done in batches, we need a container that can hand the data out batch by batch; here TensorDataset and DataLoader do that packaging.
'''select_field pulls the named field out of the features built above (a sketch of it follows this code block)'''
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
train_sampler = RandomSampler(train_data)
'''wrap the data for batched input to the model'''
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps)
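select_field is not shown in the excerpt either. Since choices_features stores (tokens, input_ids, input_mask, segment_ids) tuples, a minimal version consistent with how it is called might look like this (a sketch, not the author's exact code):
FIELD_INDEX = {'tokens': 0, 'input_ids': 1, 'input_mask': 2, 'segment_ids': 3}

def select_field(features, field):
    '''pull one named field out of every chunk of every feature'''
    return [[choice[FIELD_INDEX[field]] for choice in feature.choices_features]
            for feature in features]
Note the result has shape (num_examples, split_num, seq_len), which is exactly why the view(-1, seq_len) reshape is needed before feeding the model.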
When fetching batches later during training, note that a DataLoader is not itself an iterator, so calling next(train_dataloader) directly raises a TypeError. Wrap it first, e.g. with itertools.cycle so batches can be drawn indefinitely:
'''fetch batches with next() by wrapping the DataLoader first'''
from itertools import cycle
train_dataloader = cycle(train_dataloader)
batch = next(train_dataloader)
That completes the data-loading part; next comes model training.
Model Training
Focus on the key steps only: the huggingface example code is full of multi-GPU and distributed-training frills, but peel those away and it is just ordinary model training, in the following steps:
- Define the optimizer (this is also where you add warmup if you want it; see the sketch after this list)
- Define the loss function (this post uses BertForSequenceClassification, which already computes the loss internally, so no separate criterion is needed; details below)
- loss.backward() to backpropagate gradients
- optimizer.step() to update the parameters
- optimizer.zero_grad() to clear gradients (PyTorch accumulates gradients by default, so without this the gradients of later examples would pile up)
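For the first bullet, here is a minimal sketch of an optimizer with linear warmup using transformers' built-in scheduler (num_train_optimization_steps is assumed to be the total number of update steps, and the 10% warmup ratio is just an example):
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_train_optimization_steps),  # linear warmup over the first 10% of steps
    num_training_steps=num_train_optimization_steps)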
One thing to watch: the tensors fed into BERT must be shaped (batch, dim). Because split_num adds a chunk dimension, giving (batch, split_num, seq_len), reshape before feeding:
'''mind the shape before feeding BERT'''
input_ids = input_ids.view(-1, input_ids.size(-1))
input_mask = input_mask.view(-1, input_mask.size(-1))
segment_ids = segment_ids.view(-1, segment_ids.size(-1))
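As promised above, on the loss function: with the tuple-style model outputs assumed throughout this post, BertForSequenceClassification computes CrossEntropyLoss internally when labels are passed in and returns it as the first element; without labels only the logits come back:
'''with labels: (loss, logits, ...); without labels: (logits, ...)'''
loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                     attention_mask=input_mask, labels=label_ids)[:2]
logits_only = model(input_ids=input_ids, token_type_ids=segment_ids,
                    attention_mask=input_mask)[0]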
Core of the training loop:
for step in bar:
    '''fetch a batch'''
    batch = next(train_dataloader)
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, segment_ids, label_ids = batch
    '''mind the shape before feeding BERT'''
    input_ids = input_ids.view(-1, input_ids.size(-1))
    input_mask = input_mask.view(-1, input_mask.size(-1))
    segment_ids = segment_ids.view(-1, segment_ids.size(-1))
    '''see the huggingface docs for the inputs of the BERT classification model'''
    loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                    labels=label_ids)
    '''loss bookkeeping'''
    nb_tr_examples += input_ids.size(0)
    if args.n_gpu > 1:
        loss = loss.mean()  # average across GPUs under DataParallel
    if args.fp16 and args.loss_scale != 1.0:
        loss = loss * args.loss_scale
    if args.gradient_accumulation_steps > 1:
        loss = loss / args.gradient_accumulation_steps
    tr_loss += loss.item()
    train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
    bar.set_description("loss {}".format(train_loss))
    nb_tr_steps += 1
    '''backpropagate'''
    if args.fp16:
        optimizer.backward(loss)
    else:
        loss.backward()
    # second forward/backward pass on the same batch, presumably part of an
    # adversarial-training trick; the perturbation step is not shown in this excerpt
    loss_adv, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                        labels=label_ids)
    loss_adv.backward()
    if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
        if args.fp16:
            # manual lr schedule used with the fp16 optimizer (old-style API)
            lr_this_step = args.learning_rate * scheduler.get_lr(global_step, args.warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
        '''update the parameters'''
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        global_step += 1
    if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        logger.info("***** Report result *****")
        logger.info(" %s = %s", 'global_step', str(global_step))
        logger.info(" %s = %s", 'train loss', str(train_loss))
Evaluation on the Validation Set
This part uses the validation set to pick the best model. The procedure mirrors training, so I will not repeat it; the code below follows the same pattern as above.
if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and (step + 1) % (
        args.eval_steps * args.gradient_accumulation_steps) == 0:
    for file in ['dev.xlsx']:
        inference_labels = []
        gold_labels = []
        inference_logits = []
        eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                     args.split_num, False)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            '''mind the shape before feeding BERT'''
            input_ids = input_ids.view(-1, input_ids.size(-1))
            input_mask = input_mask.view(-1, input_mask.size(-1))
            segment_ids = segment_ids.view(-1, segment_ids.size(-1))
            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                              attention_mask=input_mask, labels=label_ids)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            inference_labels.append(np.argmax(logits, axis=1))
            gold_labels.append(label_ids)
            inference_logits.append(logits)
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        gold_labels = np.concatenate(gold_labels, 0)
        inference_logits = np.concatenate(inference_logits, 0)
        inference_labels = np.concatenate(inference_labels, 0)
        model.train()
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = accuracy(inference_logits, gold_labels)
        print(classification_report(gold_labels, inference_labels))  # from sklearn.metrics
        result = {'eval_loss': eval_loss,
                  'eval_F1': eval_accuracy,
                  'global_step': global_step,
                  'loss': train_loss}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write('*' * 80)
            writer.write('\n')
        if eval_accuracy > best_acc and 'dev' in file:
            print("=" * 80)
            print("Best F1", eval_accuracy)
            print("Saving Model......")
            best_acc = eval_accuracy
            # unwrap DataParallel/DDP before saving
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)
            print("=" * 80)
        else:
            print("=" * 80)
Computing Test-Set Results
if args.do_test:
    del model
    gc.collect()
    args.do_train = False
    # reload the best checkpoint saved above; from_pretrained expects a model name
    # or directory rather than a .bin file, so rebuild the model and load the state dict
    model = BertForSequenceClassification.from_pretrained('bert-base-chinese', config=config)
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "pytorch_model.bin")))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    for file, flag in [('dev.xlsx', 'dev'), ('test.xlsx', 'test')]:
        inference_labels = []
        gold_labels = []
        eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                     args.split_num, False)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            '''mind the shape before feeding BERT'''
            input_ids = input_ids.view(-1, input_ids.size(-1))
            input_mask = input_mask.view(-1, input_mask.size(-1))
            segment_ids = segment_ids.view(-1, segment_ids.size(-1))
            with torch.no_grad():
                # without labels the model returns a tuple whose first element is the logits
                logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                               attention_mask=input_mask)[0].detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            inference_labels.append(logits)
            gold_labels.append(label_ids)
        gold_labels = np.concatenate(gold_labels, 0)
        logits = np.concatenate(inference_labels, 0)
        print(flag, accuracy(logits, gold_labels))
        if flag == 'test':
            df = pd.read_excel(os.path.join(args.data_dir, file))  # the files are excel, not csv
            df['label_0'] = logits[:, 0]
            df['label_1'] = logits[:, 1]
            df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_test.csv"), index=False)
        if flag == 'dev':
            df = pd.read_excel(os.path.join(args.data_dir, file))
            df['label_0'] = logits[:, 0]
            df['label_1'] = logits[:, 1]
            df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
The full code will be uploaded to GitHub once it is tidied up.