# Adapted from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/vit_mae/configuration_vit_mae.py
import ml_collections
def ViTMAEConfig(ratio=0.75):
    """Returns the ViT-MAE base configuration with the given mask ratio."""
    config = ml_collections.ConfigDict()
    # Encoder (ViT-B/16 backbone).
    config.hidden_size = 768
    config.num_hidden_layers = 12
    config.num_attention_heads = 12
    config.intermediate_size = 3072
    config.hidden_act = 'gelu'
    # The dropout probability for all fully connected layers in the
    # embeddings, encoder, and pooler.
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.initializer_range = 0.02
    config.layer_norm_eps = 1e-12
    config.image_size = 224
    config.patch_size = 16
    config.num_channels = 3
    config.qkv_bias = True
    # Lightweight decoder used only for pixel reconstruction during pre-training.
    config.decoder_num_attention_heads = 16
    config.decoder_hidden_size = 512
    config.decoder_num_hidden_layers = 8
    config.decoder_intermediate_size = 2048
    # Fraction of patches that are masked out during pre-training.
    config.mask_ratio = ratio
    # Whether to normalize target pixels per patch in the reconstruction loss.
    config.norm_pix_loss = False
    config.chunk_size_feed_forward = 0  # 0 (an int, not a bool) disables feed-forward chunking
    return config
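
# Usage sketch (an illustrative note, not part of the original file): with the
# defaults above, the image is split into (224 / 16) ** 2 = 196 patches, and a
# mask ratio of 0.75 leaves 196 * (1 - 0.75) = 49 visible patches for the encoder.
#
#   config = ViTMAEConfig(ratio=0.75)
#   num_patches = (config.image_size // config.patch_size) ** 2  # 196
#   num_visible = int(num_patches * (1 - config.mask_ratio))     # 49
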
def get_b16_config():
    """Returns the ViT-B/16 configuration."""
    config = ml_collections.ConfigDict()
    config.attention_probs_dropout_prob = 0.0
    config.encoder_stride = 16  # upscaling factor of the decoder head for masked image modeling
    config.hidden_act = 'gelu'
    # The dropout probability for all fully connected layers in the
    # embeddings, encoder, and pooler.
    config.hidden_dropout_prob = 0.0
    config.hidden_size = 768
    config.image_size = 224
    config.initializer_range = 0.02
    config.intermediate_size = 3072
    config.layer_norm_eps = 1e-12
    config.num_attention_heads = 12
    config.num_channels = 3
    config.num_hidden_layers = 12
    config.patch_size = 16
    config.qkv_bias = True
    return config
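

if __name__ == "__main__":
    # Quick self-check when the module is run directly (a minimal sketch;
    # the printed values follow from the defaults set above).
    mae_config = ViTMAEConfig(ratio=0.6)  # override the default 0.75 mask ratio
    print(mae_config.mask_ratio)          # 0.6
    vit_config = get_b16_config()
    print(vit_config.hidden_size, vit_config.patch_size)  # 768 16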