Grammar
<features> ::= <convolution> | <convolution> | <pooling> | <pooling> | <dropout> | <batch_norm>
<convolution> ::= layer:conv [out_channels,int,1,32,256] [kernel_size,int,1,2,5] [stride,int,1,1,3] <padding> <activation_function> <bias>
<batch_norm> ::= layer:batch_norm
<pooling> ::= <pool_type> [kernel_size,int,1,2,5] [stride,int,1,1,3] <padding>
<pool_type> ::= layer:pool_avg | layer:pool_max
<padding> ::= padding:same | padding:valid
<dropout> ::= layer:dropout [rate,float,1,0,0.7]
<classification> ::= <fully_connected> | <dropout>
<fully_connected> ::= layer:fc <activation_function> [out_features,int,1,128,2048] <bias>
<activation_function> ::= act:linear | act:relu | act:sigmoid
<bias> ::= bias:True | bias:False
<softmax> ::= layer:fc act:softmax out_features:10 bias:True
<learning> ::= <lars> <early_stop> [batch_size,int,1,32,1024] epochs:10000 | <gradient_descent> <early_stop> [batch_size,int,1,32,1024] epochs:10000 | <rmsprop> <early_stop> [batch_size,int,1,32,1024] epochs:10000 | <adam> <early_stop> [batch_size,int,1,32,1024] epochs:10000
<gradient_descent> ::= learning:gradient_descent [lr,float,1,0.0001,0.1] [momentum,float,1,0.68,0.99] [weight_decay,float,1,0.000001,0.001] <nesterov>
<nesterov> ::= nesterov:True | nesterov:False
<adam> ::= learning:adam [lr,float,1,0.0001,0.1] [beta1,float,1,0.5,0.9999] [beta2,float,1,0.5,0.9999] [weight_decay,float,1,0.000001,0.001]
<rmsprop> ::= learning:rmsprop [lr,float,1,0.0001,0.1] [alpha,float,1,0.5,1] [weight_decay,float,1,0.000001,0.001]
<lars> ::= learning:lars [lr_weights,float,1,0.05,0.35] [lr_biases,float,1,0.001,0.01] [momentum,float,1,0.7,0.9] [weight_decay,float,1,0.0000001,0.00001]
<early_stop> ::= [early_stop,int,1,5,20]
<model_partition> ::= model_partition:_ [partition_point,int,1,0,_]
Specific structure:
Last updated