>>107387250
This one doesn't use SWA (sliding_window defaults to None).
def __init__(
    self,
    vocab_size: Optional[int] = 131072,
    hidden_size: Optional[int] = 4096,
    intermediate_size: Optional[int] = 14336,
    num_hidden_layers: Optional[int] = 34,
    num_attention_heads: Optional[int] = 32,
    num_key_value_heads: Optional[int] = 8,  # GQA: 4 query heads per KV head
    head_dim: Optional[int] = 128,
    hidden_act: Optional[str] = "silu",
    max_position_embeddings: Optional[int] = 262144,  # 256K context
    initializer_range: Optional[float] = 0.02,
    rms_norm_eps: Optional[float] = 1e-5,
    use_cache: Optional[bool] = True,
    pad_token_id: Optional[int] = 11,
    bos_token_id: Optional[int] = 1,
    eos_token_id: Optional[int] = 2,
    tie_word_embeddings: Optional[bool] = False,
    # YaRN-scaled RoPE: 16384 native positions stretched 16x to 262144
    rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = {
        "type": "yarn",
        "rope_theta": 1000000.0,
        "factor": 16.0,
        "original_max_position_embeddings": 16384,
        "beta_fast": 32.0,
        "beta_slow": 1.0,
        "mscale_all_dim": 1.0,
        "mscale": 1.0,
        "llama_4_scaling_beta": 0.1,
    },
    sliding_window: Optional[int] = None,  # no SWA, full attention in every layer
    attention_dropout: Optional[float] = 0.0,
    **kwargs,
):
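
For reference, here's roughly what those YaRN numbers do to the RoPE frequencies. This is a sketch of the standard YaRN recipe (a per-dimension blend between the original and the 16x-interpolated frequencies, ramped between beta_fast and beta_slow), not the actual modeling code; yarn_inv_freq is my name, and llama_4_scaling_beta is skipped because its semantics aren't visible from this snippet.

import math

def yarn_inv_freq(head_dim=128, rope_theta=1e6, factor=16.0,
                  original_max_pos=16384, beta_fast=32.0, beta_slow=1.0):
    # Dimension index at which a frequency completes `num_rotations`
    # full turns over the original 16384-token context.
    def correction_dim(num_rotations):
        return (head_dim
                * math.log(original_max_pos / (num_rotations * 2 * math.pi))
                / (2 * math.log(rope_theta)))

    low = max(math.floor(correction_dim(beta_fast)), 0)
    high = min(math.ceil(correction_dim(beta_slow)), head_dim - 1)

    inv_freq = []
    for i in range(head_dim // 2):
        base = rope_theta ** (2 * i / head_dim)
        extrapolated = 1.0 / base             # original RoPE frequency
        interpolated = 1.0 / (factor * base)  # squashed by the 16x factor
        # Linear ramp: 0 below `low` (keep the original frequency),
        # 1 above `high` (fully interpolate), blended in between.
        ramp = min(max((i - low) / max(high - low, 1e-3), 0.0), 1.0)
        inv_freq.append(extrapolated * (1.0 - ramp) + interpolated * ramp)
    return inv_freq

if __name__ == "__main__":
    freqs = yarn_inv_freq()
    print(freqs[0], freqs[-1])  # high-freq dims untouched, low-freq dims /16

Side note: if this follows the DeepSeek-style mscale convention (attention temperature = (0.1 * mscale * ln(factor) + 1) / (0.1 * mscale_all_dim * ln(factor) + 1)), then mscale == mscale_all_dim == 1.0 makes that factor exactly 1.0, i.e. no extra attention rescaling from YaRN itself.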