-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathtrain.py
63 lines (50 loc) · 1.76 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
import os

import numpy as np
import torch
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, BertTokenizerFast

from utils import trim_entity_spans, convert_goldparse, ResumeDataset, tag2idx, idx2tag, get_hyperparameters, train_and_val_model
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Train Bert-NER')
parser.add_argument('-e', type=int, default=5, help='number of epochs')
parser.add_argument('-o', type=str, default='.',
                    help='output path to save model state')
# vars() is the idiomatic way to view the parsed Namespace as a dict.
args = vars(parser.parse_args())

output_path = args['o']
model_state_path = f'{output_path}/model-state.bin'
# Create the destination directory up front: a missing or unwritable output
# path should fail here, not in torch.save() after the whole training run.
os.makedirs(os.path.dirname(model_state_path), exist_ok=True)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
MAX_LEN = 500  # forwarded to ResumeDataset; presumably the max token length
EPOCHS = args['e']
MAX_GRAD_NORM = 1.0  # forwarded to train_and_val_model
MODEL_NAME = 'bert-base-uncased'
# `do_lower_case` is the keyword documented for BertTokenizerFast; the
# previous `lowercase=` spelling is not part of its signature.
TOKENIZER = BertTokenizerFast('./vocab/vocab.txt', do_lower_case=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
data = trim_entity_spans(convert_goldparse('data/Resumes.json'))
total = len(data)
# Fixed split: the first 180 examples train the model, the rest validate it.
# NOTE(review): with 180 or fewer examples the validation set is empty.
train_data, val_data = data[:180], data[180:]

train_d = ResumeDataset(train_data, TOKENIZER, tag2idx, MAX_LEN)
val_d = ResumeDataset(val_data, TOKENIZER, tag2idx, MAX_LEN)

# Training batches are drawn in random order; validation uses the
# DataLoader's default ordering.
train_sampler = RandomSampler(train_d)
train_dl = DataLoader(train_d, sampler=train_sampler, batch_size=8)
val_dl = DataLoader(val_d, batch_size=4)

# ---------------------------------------------------------------------------
# Model and optimizer
# ---------------------------------------------------------------------------
# One output label per entry in tag2idx.
model = BertForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(tag2idx))
model.to(DEVICE)

# NOTE(review): the meaning of the second argument (True) is defined in
# utils.get_hyperparameters and is not visible here -- confirm there.
optimizer_grouped_parameters = get_hyperparameters(model, True)
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

# ---------------------------------------------------------------------------
# Train, validate, and persist the weights
# ---------------------------------------------------------------------------
train_and_val_model(
    model,
    TOKENIZER,
    optimizer,
    EPOCHS,
    idx2tag,
    tag2idx,
    MAX_GRAD_NORM,
    DEVICE,
    train_dl,
    val_dl
)

# Only the model's state dict is saved, under the "model_state_dict" key.
torch.save(
    {
        "model_state_dict": model.state_dict()
    },
    model_state_path,
)