{
  "in_channels": 768,
  "audio_codec": {
    "encoder_dim": 64,
    "encoder_rates": [2, 8, 10, 12],
    "latent_dim": 1024,
    "decoder_dim": 1536,
    "decoder_rates": [12, 10, 8, 2],
    "n_codebooks": 16,
    "codebook_size": 1024,
    "codebook_dim": 128,
    "quantizer_dropout": false,
    "sample_rate": 48000,
    "mean": 0.0,
    "std": 1.0
  },
  "text_encoder": {
    "dim": 768,
    "name": "t5-base",
    "max_length": 512,
    "pad_mode": "longest"
  },
  "vision_encoder": {
    "dim": 1024,
    "batch_size": 300,
    "name": "PE-Core-L14-336",
    "normalize_feature": true,
    "interpolation_mode": "BICUBIC",
    "image_size": 336
  },
  "transformer": {
    "dim": 2816,
    "n_heads": 22,
    "n_layers": 22,
    "dropout": 0.1,
    "norm_eps": 1e-05,
    "qk_norm": true,
    "fc_bias": false,
    "ffn_exp": 4,
    "ffn_dim_multiplier": 1,
    "multiple_of": 64,
    "non_linearity": "swiglu",
    "use_rope": true,
    "max_positions": 10000,
    "frequency_embedding_dim": 256,
    "timestep_non_linearity": "swiglu",
    "t_block_non_linearity": "silu",
    "t_block_bias": true,
    "context_dim": 2816,
    "context_non_linearity": "swiglu",
    "context_embedder_dropout": 0.0,
    "context_norm": false,
    "out_channels": 256,
    "in_channels": null
  },
  "num_anchors": 3,
  "anchor_embedding_dim": 128,
  "visual_ranker": {
    "checkpoint": null,
    "kind": "imagebind"
  },
  "text_ranker": {
    "rankers": {
      "clap": [
        {
          "checkpoint": null,
          "kind": "clap"
        },
        5.0
      ],
      "judge": [
        {
          "checkpoint_or_model_id": "facebook/sam-audio-judge",
          "kind": "judge"
        },
        1.0
      ]
    },
    "kind": "ensemble"
  },
  "span_predictor": "pe-a-frame-large"
}