Prerequisites
Use PyCharm for single-step (remote) debugging
- Online blog posts cover the setup in detail
- Remember to configure the environment variables
Transformer
train.py (training)
Main entry point (if __name__ == '__main__':)
- Load the argument parser
- Check the distributed / multi-process branches; in the usual single-GPU debugging case both are False
- Jump into the main function (see the paraphrased sketch below)
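Paraphrased sketch of that entry-point dispatch (based on the fairseq version I was stepping through; helper names such as distributed_main / multiprocessing_main may differ between versions):

# Rough paraphrase of train.py's __main__ block, not verbatim fairseq code.
from fairseq import options

if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_port > 0 or args.distributed_init_method is not None:
        distributed_main(args)          # explicit multi-node / multi-process setup
    elif args.distributed_world_size > 1:
        multiprocessing_main(args)      # spawn one process per GPU on a single machine
    else:
        main(args)                      # the single-process path taken while debugging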
main function
1. Load and print args (the parser default is max_tokens=6000)
- Batch size? In fairseq the batch size is dynamic: a batch is filled until it reaches the --max-tokens budget (optionally also capped by --max-sentences), so the number of sentences per batch varies; see the sketch below.
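A minimal, self-contained illustration of token-based batching (hypothetical helper, not fairseq's implementation): sentences are appended until the next one would exceed the token budget.

def batch_by_tokens(lengths, max_tokens):
    """Group sentence indices so each batch holds at most max_tokens tokens (padding ignored)."""
    batches, current, current_tokens = [], [], 0
    for idx, n_tokens in enumerate(lengths):
        if current and current_tokens + n_tokens > max_tokens:
            batches.append(current)
            current, current_tokens = [], 0
        current.append(idx)
        current_tokens += n_tokens
    if current:
        batches.append(current)
    return batches

print(batch_by_tokens([10, 11, 12, 13, 14], max_tokens=30))  # [[0, 1], [2, 3], [4]]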
2. Set the random seed
torch.manual_seed(args.seed)
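For strictly reproducible debugging runs I also seed the other RNGs, not only torch; a small sketch (my addition, not part of train.py):

import random
import numpy as np
import torch

def set_seed(seed):
    # Seed everything that can influence data order or weight initialization.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(1)  # args.seed defaults to 1 (see the printed args in the appendix)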
3. Set up the task, e.g., translation or language modeling
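The lookup below just reads TASK_REGISTRY; the registry itself is filled by a decorator. A stripped-down sketch of that pattern (the class body is my simplification, not fairseq's real TranslationTask):

TASK_REGISTRY = {}

def register_task(name):
    def register_task_cls(cls):
        TASK_REGISTRY[name] = cls
        return cls
    return register_task_cls

@register_task('translation')
class TranslationTask:
    @classmethod
    def setup_task(cls, args):
        # the real task loads the source/target dictionaries here before constructing itself
        return cls()

task_cls = TASK_REGISTRY['translation']  # what --task translation resolves to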
def setup_task(args):
    return TASK_REGISTRY[args.task].setup_task(args)
4. Load dataset splits
load_dataset_splits(task, ['train', 'valid'])
    task.load_dataset(split, combine=True)  # 'train' split; dispatches to TranslationTask.load_dataset
    # inside TranslationTask.load_dataset (translation.py):
    src_datasets = []
    tgt_datasets = []
    data_paths = self.args.data  # data path(s); multiple directories allowed, e.g. train, train1
    # back in load_dataset_splits: non-train splits may come in several parts (multiple validation sets)
    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        try:
            task.load_dataset(split_k, combine=False)
        except FileNotFoundError as e:
            if k > 0:
                break
            raise e
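What the itertools.count() loop does: it tries the names valid, valid1, valid2, ... and stops at the first missing one (only the base split is mandatory, hence the re-raise when k == 0). A tiny self-contained illustration:

import itertools

available = {'valid', 'valid1', 'valid2'}   # hypothetical splits present on disk

split, loaded = 'valid', []
for k in itertools.count():
    split_k = split + (str(k) if k > 0 else '')
    if split_k not in available:            # stands in for the FileNotFoundError above
        break
    loaded.append(split_k)

print(loaded)  # ['valid', 'valid1', 'valid2']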
5. Build model and criterion
model = task.build_model(args)
criterion = task.build_criterion(args) # LabelSmoothedCrossEntropyCriterion
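With label_smoothing=0.1 in the printed args, the loss LabelSmoothedCrossEntropyCriterion computes is essentially the following; a simplified, self-contained sketch that ignores padding and batching details:

import torch
import torch.nn.functional as F

def label_smoothed_nll_loss(logits, target, eps):
    """(1 - eps) * NLL of the target + eps * penalty toward the uniform distribution."""
    lprobs = F.log_softmax(logits, dim=-1)                            # (N, vocab)
    nll_loss = -lprobs.gather(-1, target.unsqueeze(-1)).squeeze(-1)   # -log p(target)
    smooth_loss = -lprobs.mean(dim=-1)                                # average over the whole vocab
    return ((1.0 - eps) * nll_loss + eps * smooth_loss).sum()

logits = torch.randn(4, 10)            # 4 target tokens, vocabulary of 10
target = torch.tensor([1, 3, 5, 7])
loss = label_smoothed_nll_loss(logits, target, eps=0.1)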
# transformer.py
def build_model(cls, args, task):
    base_architecture(args)
    src_dict, tgt_dict = task.source_dictionary, task.target_dictionary  # load dict
    if args.share_all_embeddings:
        assert src_dict == tgt_dict
        assert args.encoder_embed_dim == args.decoder_embed_dim
        assert args.decoder_embed_path == args.encoder_embed_path
        # ...
    else:
        encoder_embed_tokens = build_embedding(src_dict, args.encoder_embed_dim, args.encoder_embed_path)
        # ...
    encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens)
class TransformerEncoder(FairseqEncoder):
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        # load the embedding-related args
        # build the stack of encoder layers
        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args)
            for i in range(args.encoder_layers)
        ])
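The note only walks through __init__; for context, the forward pass (my paraphrase of TransformerEncoder.forward, simplified, so details may differ from the exact fairseq version) embeds and scales the tokens, adds positional embeddings, transposes to time-major, and runs every layer with a padding mask:

# Paraphrase, not verbatim fairseq code; F is torch.nn.functional as imported in transformer.py.
def forward(self, src_tokens, src_lengths):
    x = self.embed_scale * self.embed_tokens(src_tokens)    # token embeddings scaled by sqrt(embed_dim)
    if self.embed_positions is not None:
        x += self.embed_positions(src_tokens)                # sinusoidal or learned positions
    x = F.dropout(x, p=self.dropout, training=self.training)

    x = x.transpose(0, 1)                                    # B x T x C -> T x B x C
    encoder_padding_mask = src_tokens.eq(self.padding_idx)   # True at pad positions

    for layer in self.layers:
        x = layer(x, encoder_padding_mask)

    return {'encoder_out': x, 'encoder_padding_mask': encoder_padding_mask}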
class TransformerEncoderLayer(nn.Module):
    """
    In the original paper each operation (multi-head attention or FFN) is
    postprocessed with: `dropout -> add residual -> layernorm`. In the
    tensor2tensor code they suggest that learning is more robust when
    preprocessing each layer with layernorm and postprocessing with:
    `dropout -> add residual`. We default to the approach in the paper, but the
    tensor2tensor approach can be enabled by setting
    *args.encoder_normalize_before* to ``True``.
    """
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim, args.encoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(2)])
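The corresponding forward pass, again paraphrased and showing only the default post-norm path (encoder_normalize_before=False): each sub-block is followed by dropout -> add residual -> layernorm, exactly as the docstring says.

# Paraphrase of TransformerEncoderLayer.forward; the pre-norm branches are omitted.
def forward(self, x, encoder_padding_mask):
    # 1) self-attention block
    residual = x
    x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask)
    x = F.dropout(x, p=self.dropout, training=self.training)
    x = self.layer_norms[0](residual + x)

    # 2) position-wise feed-forward block
    residual = x
    x = F.relu(self.fc1(x))
    x = F.dropout(x, p=self.relu_dropout, training=self.training)
    x = self.fc2(x)
    x = F.dropout(x, p=self.dropout, training=self.training)
    x = self.layer_norms[1](residual + x)
    return x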
class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0., bias=True):
        self.in_proj_weight  # ??? Todo (see the note below)
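About that Todo: the Q/K/V projection matrices are packed into a single (3 * embed_dim, embed_dim) parameter and split apart again with one matmul. A minimal self-contained sketch of just that idea (shapes only, not the full attention computation):

import torch
import torch.nn.functional as F

embed_dim, seq_len, batch = 512, 7, 2
in_proj_weight = torch.randn(3 * embed_dim, embed_dim)    # packed [W_q; W_k; W_v]

x = torch.randn(seq_len, batch, embed_dim)                # T x B x C, as in the encoder
q, k, v = F.linear(x, in_proj_weight).chunk(3, dim=-1)    # one matmul, then split into Q, K, V

print(q.shape, k.shape, v.shape)  # each torch.Size([7, 2, 512])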
6. Batch handling
Still haven't fully pinned down what the two batches are for (Todo). My current reading: dummy_batch is run with its gradients ignored when a worker has no real batch left, and oom_batch (a single-sentence batch) is replayed after an out-of-memory error, both so that distributed workers stay in step for gradient synchronization; see the sketch below the code.
dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
oom_batch = task.dataset('train').get_dummy_batch(1, max_positions)
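A generic sketch of the recovery pattern I believe these two batches support (my paraphrase, not the Trainer's actual code): catch the CUDA out-of-memory error, free the cache, and replay a tiny batch so every distributed worker still performs the same number of forward/backward passes.

import torch

def safe_train_step(train_step, sample, oom_batch):
    try:
        return train_step(sample)
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise
        torch.cuda.empty_cache()
        # Replay a minimal batch so gradient synchronization stays aligned across workers.
        return train_step(oom_batch)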
7. Build trainer
trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch)
8. Initialize dataloader
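In the version I was stepping through, this step builds an epoch batch iterator from the task, roughly as below (paraphrased; the exact argument list may differ across fairseq versions):

# Paraphrased from train.py, not verbatim.
epoch_itr = task.get_batch_iterator(
    dataset=task.dataset(args.train_subset),   # the 'train' split loaded in step 4
    max_tokens=args.max_tokens,                # token budget per batch (see the batch-size note above)
    max_sentences=args.max_sentences,          # optional hard cap on sentences per batch
    max_positions=max_positions,               # drop examples longer than the model supports
    ignore_invalid_inputs=True,
    required_batch_size_multiple=8,
    seed=args.seed,
    num_shards=args.distributed_world_size,    # shard batches across GPUs
    shard_id=args.distributed_rank,
)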
Appendix
Printed args
Namespace(adam_betas='(0.9, 0.999)', adam_eps=1e-08, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer_wmt_en_de', attention_dropout=0.0, bucket_cap_mb=150, clip_norm=25, criterion='label_smoothed_cross_entropy', data=['/home/zhangzheyang/package_and_data/wmt14_en_de'], ddp_backend='c10d', decoder_attention_heads=8, decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layers=6, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=512, device_id=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, distributed_rank=0, distributed_world_size=0, dropout=0.1, encoder_attention_heads=8, encoder_embed_dim=512, encoder_embed_path=None, encoder_ffn_embed_dim=2048, encoder_layers=6, encoder_learned_pos=False, encoder_normalize_before=False, fix_batches_to_gpus=False, fp16=False, fp16_init_scale=128, keep_interval_updates=-1, label_smoothing=0.1, left_pad_source='True', left_pad_target='False', log_format=None, log_interval=1000, lr=[0.25], lr_scheduler='inverse_sqrt', lr_shrink=0.1, max_epoch=0, max_sentences=None, max_sentences_valid=None, max_source_positions=1024, max_target_positions=1024, max_tokens=4096, max_update=100000, min_loss_scale=0.0001, min_lr=1e-09, momentum=0.99, no_epoch_checkpoints=False, no_progress_bar=False, no_save=False, no_token_positional_embeddings=False, optimizer='adam', optimizer_overrides='{}', raw_text=False, relu_dropout=0.0, reset_lr_scheduler=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='checkpoints/transformer', save_interval=1, save_interval_updates=0, seed=1, sentence_avg=False, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, source_lang='en', target_lang='de', task='translation', train_subset='train', update_freq=[1], upsample_primary=1, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0001)
base_architecture(args)
def base_architecture(args):
    args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048)
    args.encoder_layers = getattr(args, 'encoder_layers', 6)
    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8)
    args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
    args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False)
    args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
    args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
    args.decoder_layers = getattr(args, 'decoder_layers', 6)
    args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
    args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
    args.attention_dropout = getattr(args, 'attention_dropout', 0.)
    args.relu_dropout = getattr(args, 'relu_dropout', 0.)
    args.dropout = getattr(args, 'dropout', 0.1)
    args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
    args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
    args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
    args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)
    args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False)
    args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
    args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)
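Because every field uses getattr(args, name, default), a named architecture only has to set the values it wants to change and then delegate to base_architecture. An illustrative sketch (the architecture name here is made up; the larger values follow the well-known transformer-big settings):

from fairseq.models import register_model_architecture

@register_model_architecture('transformer', 'transformer_big_example')  # hypothetical arch name
def transformer_big_example(args):
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)
    args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096)
    args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
    args.dropout = getattr(args, 'dropout', 0.3)
    base_architecture(args)   # everything not set above keeps the base defaults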