# args.yaml — Sockeye training arguments (auto-generated configuration dump).
---
allow_missing_params: false
batch_size: 2048
batch_type: word
bucket_width: 10
checkpoint_frequency: 4000
cnn_activation_type: glu
cnn_hidden_dropout: 0.2
cnn_kernel_width:
- 3
- 3
cnn_num_hidden: 512
cnn_positional_embedding_type: learned
cnn_project_qkv: false
config: null
conv_embed_add_positional_encodings: false
conv_embed_dropout: 0.0
conv_embed_max_filter_width: 8
conv_embed_num_filters:
- 200
- 200
- 250
- 250
- 300
- 300
- 300
- 300
conv_embed_num_highway_layers: 4
conv_embed_output_dim: null
conv_embed_pool_stride: 5
decode_and_evaluate: 500
decode_and_evaluate_device_id: null
decode_and_evaluate_use_cpu: false
decoder: rnn
decoder_only: false
device_ids:
- 0
- 1
disable_device_locking: true
dry_run: false
embed_dropout:
- 0.0
- 0.0
embed_weight_init: default
encoder: rnn
fill_up: replicate
fixed_param_names: []
gradient_clipping_threshold: 1.0
gradient_clipping_type: none
gradient_compression_threshold: 0.5
gradient_compression_type: null
initial_learning_rate: 0.0002
keep_last_params: 5
kvstore: device
label_smoothing: 0.1
layer_normalization: false
learning_rate_decay_optimizer_states_reset: 'off'
learning_rate_decay_param_reset: false
learning_rate_half_life: 10
learning_rate_reduce_factor: 0.7
learning_rate_reduce_num_not_improved: 8
learning_rate_schedule: null
learning_rate_scheduler_type: plateau-reduce
learning_rate_warmup: 0
lhuc: null
lock_dir: /tmp
loss: cross-entropy
loss_normalization_type: valid
max_num_checkpoint_not_improved: 32
max_num_epochs: null
max_samples: null
max_seq_len:
- 30
- 30
max_updates: null
metrics:
- perplexity
min_num_epochs: null
min_samples: null
min_updates: null
momentum: null
monitor_pattern: null
monitor_stat_func: mx_default
no_bucketing: false
num_embed:
- 512
- 512
num_layers:
- 6
- 6
num_words:
- 0
- 0
optimized_metric: perplexity
optimizer: adam
optimizer_params: null
output: ../mono_model_8_28_rnn1024_mul
overwrite_output: false
pad_vocab_to_multiple_of: null
params: null
prepared_data: ../mono_8_28
quiet: false
rnn_attention_coverage_num_hidden: 1
rnn_attention_coverage_type: count
rnn_attention_in_upper_layers: false
rnn_attention_mhdot_heads: null
rnn_attention_num_hidden: null
rnn_attention_type: dot
rnn_attention_use_prev_word: false
rnn_cell_type: lstm
rnn_context_gating: false
rnn_decoder_hidden_dropout: 0.2
rnn_decoder_state_init: last
rnn_dropout_inputs:
- 0.0
- 0.0
rnn_dropout_recurrent:
- 0.0
- 0.0
rnn_dropout_states:
- 0.0
- 0.0
rnn_enc_last_hidden_concat_to_embedding: false
rnn_encoder_reverse_input: false
rnn_first_residual_layer: 2
rnn_forget_bias: 0.0
rnn_h2h_init: orthogonal
rnn_num_hidden: 1024
rnn_residual_connections: false
rnn_scale_dot_attention: false
seed: 13
shared_vocab: false
source: null
source_factors: []
source_factors_num_embed: []
source_vocab: null
target: null
target_vocab: null
transformer_activation_type: relu
transformer_attention_heads:
- 8
- 8
transformer_dropout_act: 0.1
transformer_dropout_attention: 0.1
transformer_dropout_prepost: 0.1
transformer_feed_forward_num_hidden:
- 2048
- 2048
transformer_model_size:
- 512
- 512
transformer_positional_embedding_type: fixed
transformer_postprocess:
- dr
- dr
transformer_preprocess:
- n
- n
use_cpu: false
validation_source: ../data/ParaBank/parabank.1.dev.src
validation_source_factors: []
validation_target: ../data/ParaBank/parabank.1.dev.trg
weight_decay: 0.0
weight_init: xavier
weight_init_scale: 3.0
weight_init_xavier_factor_type: avg
weight_init_xavier_rand_type: uniform
weight_normalization: false
weight_tying: false
weight_tying_type: trg_softmax
word_min_count:
- 1
- 1