Transformer (fairseq) Reading Notes

Author: Boxiao Zhang    Date: 2019-05-08
NMT

Prerequisites

Single-step debugging with PyCharm (remote)

  • Online blog posts cover the remote-interpreter setup in detail
  • Remember to set the environment variables (see the sketch below)
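
    A concrete way to handle both points at once is a small personal helper (not part of fairseq; the path and flag values are placeholders modeled on the args printed in the appendix) that sets the environment and the command line in code before stepping into train.py:

    import os
    import sys

    # pick the GPU the remote debugger should use
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # feed train.py the same arguments as a normal launch
    sys.argv = [
        'train.py',
        '/path/to/wmt14_en_de',             # preprocessed data dir (placeholder)
        '--arch', 'transformer_wmt_en_de',
        '--optimizer', 'adam',
        '--max-tokens', '4096',
        '--save-dir', 'checkpoints/transformer',
    ]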

Transformer

train.py (training entry point)

  • Main entry point (if __name__ == '__main__':)

    1. Load the argument parser
    2. Check whether distributed or multiprocessing training is requested; with the default settings both checks are False

    3. Jump into the main function (roughly as sketched below)
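
      A paraphrase of the entry block, not copied verbatim (the option names match the printed args in the appendix; the distributed branch bodies are elided):

      from fairseq import options

      if __name__ == '__main__':
          parser = options.get_training_parser()
          args = options.parse_args_and_arch(parser)

          if args.distributed_init_method is not None or args.distributed_port > 0:
              ...  # hand off to the distributed entry point
          elif args.distributed_world_size > 1:
              ...  # spawn one process per GPU
          else:
              main(args)  # single-process path; main is train.py's main(), followed below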

  • main function

    1. Load and print args (default max_tokens=6000)

      batch size?
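
      As far as I can tell there is no fixed batch size: sentences are packed into a batch until the max_tokens budget would be exceeded (optionally capped by --max-sentences). A toy illustration of token-budget batching, not fairseq's actual implementation (which also accounts for padding):

      def batch_by_tokens(lengths, max_tokens):
          """Greedily pack sentence indices into batches of at most max_tokens tokens."""
          batches, cur, cur_tokens = [], [], 0
          for idx, n in enumerate(lengths):
              if cur and cur_tokens + n > max_tokens:
                  batches.append(cur)
                  cur, cur_tokens = [], 0
              cur.append(idx)
              cur_tokens += n
          if cur:
              batches.append(cur)
          return batches

      print(batch_by_tokens([20, 30, 25, 60, 10], max_tokens=64))
      # -> [[0, 1], [2], [3], [4]]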

    2. Set the random seed

      torch.manual_seed(args.seed)
    3. Setup task, e.g., translation, language modeling

      def setup_task(args):
          return TASK_REGISTRY[args.task].setup_task(args)
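
      TASK_REGISTRY is filled in by a decorator when each task module is imported. A simplified sketch of the registry pattern (stand-in classes, not the real fairseq code):

      TASK_REGISTRY = {}

      def register_task(name):
          def register_task_cls(cls):
              TASK_REGISTRY[name] = cls
              return cls
          return register_task_cls

      @register_task('translation')
      class TranslationTask:
          @classmethod
          def setup_task(cls, args):
              return cls()  # the real task loads the source/target dictionaries here

      # setup_task(args) above then reduces to:
      # TASK_REGISTRY['translation'].setup_task(args)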
    4. Load dataset splits

      load_dataset_splits(task, ['train', 'valid'])
      task.load_dataset(split, combine=True)  # TranslationTask
      src_datasets = []
      tgt_datasets = []
      data_paths = self.args.data  # data path(s); more than one is allowed, e.g. train, train1
      # more than one validation set is also allowed
      for k in itertools.count():
          split_k = split + (str(k) if k > 0 else '')
          try:
              task.load_dataset(split_k, combine=False)
          except FileNotFoundError as e:
              if k > 0:
                  break
              raise e
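
      The loop above is what makes extra validation sets work: it keeps probing split names with numeric suffixes until one is missing. A toy demo of the same pattern (fake "filesystem", not fairseq code):

      import itertools

      available = {'train', 'valid', 'valid1'}  # pretend these splits exist on disk

      def load_split(name):
          if name not in available:
              raise FileNotFoundError(name)
          print('loaded', name)

      for k in itertools.count():
          split_k = 'valid' + (str(k) if k > 0 else '')
          try:
              load_split(split_k)   # loads 'valid', then 'valid1', then stops
          except FileNotFoundError:
              if k > 0:
                  break
              raise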
    5. Build model and criterion

      model = task.build_model(args)
      criterion = task.build_criterion(args)  # LabelSmoothedCrossEntropyCriterion

      # transformer.py
      def build_model(cls, args, task):
          base_architecture(args)
          src_dict, tgt_dict = task.source_dictionary, task.target_dictionary  # load dictionaries
          if args.share_all_embeddings:
              assert src_dict == tgt_dict
              assert args.encoder_embed_dim == args.decoder_embed_dim
              assert args.decoder_embed_path == args.encoder_embed_path
              # ...
          else:
              encoder_embed_tokens = build_embedding(src_dict, args.encoder_embed_dim, args.encoder_embed_path)
              # ...
          encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens)


      class TransformerEncoder(FairseqEncoder):
          def __init__(self, args, dictionary, embed_tokens):
              # load embedding-related args
              # build the stack of encoder layers
              self.layers = nn.ModuleList([])
              self.layers.extend([
                  TransformerEncoderLayer(args)
                  for i in range(args.encoder_layers)
              ])

      class TransformerEncoderLayer(nn.Module):
          """
          In the original paper each operation (multi-head attention or FFN) is
          postprocessed with: `dropout -> add residual -> layernorm`. In the
          tensor2tensor code they suggest that learning is more robust when
          preprocessing each layer with layernorm and postprocessing with:
          `dropout -> add residual`. We default to the approach in the paper, but the
          tensor2tensor approach can be enabled by setting
          *args.encoder_normalize_before* to ``True``.
          """
          def __init__(self, args):
              self.embed_dim = args.encoder_embed_dim
              self.self_attn = MultiheadAttention(
                  self.embed_dim, args.encoder_attention_heads,
                  dropout=args.attention_dropout,
              )
              self.dropout = args.dropout
              self.relu_dropout = args.relu_dropout
              self.normalize_before = args.encoder_normalize_before
              self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
              self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
              self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(2)])
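
      To make the docstring concrete, here is a condensed sketch of the layer's forward pass as I read it (paraphrased, not copied; the default is the paper's post-norm order, and normalize_before=True switches to the tensor2tensor pre-norm order):

      def forward(self, x, encoder_padding_mask):
          # --- self-attention sub-block ---
          residual = x
          if self.normalize_before:
              x = self.layer_norms[0](x)
          x, _ = self.self_attn(query=x, key=x, value=x,
                                key_padding_mask=encoder_padding_mask)
          x = F.dropout(x, p=self.dropout, training=self.training)
          x = residual + x
          if not self.normalize_before:
              x = self.layer_norms[0](x)

          # --- position-wise feed-forward sub-block ---
          residual = x
          if self.normalize_before:
              x = self.layer_norms[1](x)
          x = F.dropout(F.relu(self.fc1(x)), p=self.relu_dropout, training=self.training)
          x = self.fc2(x)
          x = F.dropout(x, p=self.dropout, training=self.training)
          x = residual + x
          if not self.normalize_before:
              x = self.layer_norms[1](x)
          return x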

      class MultiheadAttention(nn.Module):
          def __init__(self, embed_dim, num_heads, dropout=0.):
              in_proj_weight  # ??? TODO
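
      On the in_proj_weight TODO: my understanding (still to be verified) is that it packs the query/key/value projection matrices into one (3 * embed_dim, embed_dim) parameter, so a single matmul produces Q, K and V, which are then split apart. A toy illustration of that packing, not fairseq's implementation:

      import torch
      import torch.nn.functional as F

      embed_dim = 512
      in_proj_weight = torch.empty(3 * embed_dim, embed_dim)
      torch.nn.init.xavier_uniform_(in_proj_weight)

      x = torch.randn(10, 2, embed_dim)  # (seq_len, batch, embed_dim)
      q, k, v = F.linear(x, in_proj_weight).chunk(3, dim=-1)  # one matmul -> Q, K, V
      print(q.shape, k.shape, v.shape)   # each: (10, 2, 512)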
    6. Batch handling

      Haven't worked out what the two batches below are for (TODO). My guess, unverified: the dummy batch keeps distributed workers in step when one of them has no real batch to process, and the size-1 oom batch is used to recover after an out-of-memory error.

      dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
      oom_batch = task.dataset('train').get_dummy_batch(1, max_positions)
    7. Build trainer

      trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch)
    8. Initialize dataloader
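
      Step 8 boils down to asking the task for an epoch iterator over the training split. Roughly (keyword names recalled from this fairseq version and may differ slightly):

      epoch_itr = task.get_batch_iterator(
          dataset=task.dataset(args.train_subset),
          max_tokens=args.max_tokens,
          max_sentences=args.max_sentences,
          max_positions=max_positions,
          seed=args.seed,
          num_shards=args.distributed_world_size,
          shard_id=args.distributed_rank,
      )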

Appendix

  • Printed args

    Namespace(adam_betas='(0.9, 0.999)', adam_eps=1e-08, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer_wmt_en_de', attention_dropout=0.0, bucket_cap_mb=150, clip_norm=25, criterion='label_smoothed_cross_entropy', data=['/home/zhangzheyang/package_and_data/wmt14_en_de'], ddp_backend='c10d', decoder_attention_heads=8, decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layers=6, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=512, device_id=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, distributed_rank=0, distributed_world_size=0, dropout=0.1, encoder_attention_heads=8, encoder_embed_dim=512, encoder_embed_path=None, encoder_ffn_embed_dim=2048, encoder_layers=6, encoder_learned_pos=False, encoder_normalize_before=False, fix_batches_to_gpus=False, fp16=False, fp16_init_scale=128, keep_interval_updates=-1, label_smoothing=0.1, left_pad_source='True', left_pad_target='False', log_format=None, log_interval=1000, lr=[0.25], lr_scheduler='inverse_sqrt', lr_shrink=0.1, max_epoch=0, max_sentences=None, max_sentences_valid=None, max_source_positions=1024, max_target_positions=1024, max_tokens=4096, max_update=100000, min_loss_scale=0.0001, min_lr=1e-09, momentum=0.99, no_epoch_checkpoints=False, no_progress_bar=False, no_save=False, no_token_positional_embeddings=False, optimizer='adam', optimizer_overrides='{}', raw_text=False, relu_dropout=0.0, reset_lr_scheduler=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='checkpoints/transformer', save_interval=1, save_interval_updates=0, seed=1, sentence_avg=False, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, source_lang='en', target_lang='de', task='translation', train_subset='train', update_freq=[1], upsample_primary=1, valid_subset='valid', validate_interval=1, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0001)
  • base_architecture(args)

    @register_model_architecture('transformer', 'transformer')
    def base_architecture(args):
        args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
        args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
        args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048)
        args.encoder_layers = getattr(args, 'encoder_layers', 6)
        args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8)
        args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
        args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False)
        args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
        args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
        args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
        args.decoder_layers = getattr(args, 'decoder_layers', 6)
        args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
        args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
        args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
        args.attention_dropout = getattr(args, 'attention_dropout', 0.)
        args.relu_dropout = getattr(args, 'relu_dropout', 0.)
        args.dropout = getattr(args, 'dropout', 0.1)
        args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
        args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0)
        args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
        args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)
        args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False)

        args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim)
        args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim)