universal-saes_2025-10-01_19-16-33 / evaluation /metrics_tokens_3001344.json
hanspeterlyngsoeraaschoujensen's picture
Upload folder using huggingface_hub
8061c39 verified
{
"epoch": 1,
"n_tokens": 3001344,
"global_step": 2931,
"training_metrics": {
"train/loss": 2.515625,
"train/contrastive": 2.421875,
"train/recons_loss": 0.671875,
"train/balance_loss": 3.75,
"train/balance_loss_contrastive": 2.71875,
"train/balance_loss_recons": 1.0390625,
"train/contrastive_std": 3.25,
"train/recons_std": 0.138671875,
"train/contrastive_min": 0.146484375,
"train/contrastive_max": 6.9375,
"train/recons_min": 0.56640625,
"train/recons_max": 0.9375,
"train/Qwen3_0.6B_layer_2": 0.9375,
"train/Qwen3_0.6B_layer_4": 0.59765625,
"train/Qwen3_1.7B_layer_2": 0.59375,
"train/Qwen3_1.7B_layer_4": 0.703125,
"train/Qwen3_4B_layer_2": 0.56640625,
"train/Qwen3_4B_layer_4": 0.6328125,
"train/contrastives": null,
"train/epoch": 1,
"train/n_tokens": 3001344,
"train/step": 2931
},
"eval_metrics": {
"global_step": 2931,
"n_tokens": 3001344,
"kl_divergence": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 11.318835258483887,
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 9.138021469116211,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.61973762512207,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 9.007281303405762,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 8.960853576660156,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.789403915405273,
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 7.3046698570251465,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.55082368850708,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.5602962970733643,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.592942714691162,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.588857650756836,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.6625943183898926,
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.131369590759277,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.891963481903076,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.430274963378906,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.0684638023376465,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 5.9689507484436035,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.356847286224365,
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 8.19615364074707,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.8310694694519043,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.7546491622924805,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.7474663257598877,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.857220411300659,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.925436019897461,
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 7.565979957580566,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.9663586616516113,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.719478130340576,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.741952657699585,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.7755935192108154,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.7375831604003906,
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 7.4653778076171875,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.67035174369812,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.566011905670166,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7160496711730957,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.552424907684326,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.459855556488037,
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
},
"mae_hidden_states": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 15.485275268554688,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 22.359243392944336,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 21.841341018676758,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 20.851577758789062,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 23.41849136352539,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 22.13389015197754,
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 8.68209457397461,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0910885334014893,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0663363933563232,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.1295608282089233,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.096497654914856,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0976781845092773,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 9.745889663696289,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.073387622833252,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0651912689208984,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.1097475290298462,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.102055311203003,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.1042507886886597,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.085488319396973,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.443469762802124,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.407573938369751,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.394163966178894,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.4274914264678955,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.423639178276062,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.723683834075928,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.2199777364730835,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.1646456718444824,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1640838384628296,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.1155877113342285,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.1568272113800049,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 8.499415397644043,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2985743284225464,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2958557605743408,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2903549671173096,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2823046445846558,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2616506814956665
},
"alignment": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 1.421875,
"mean_cosine_similarity": -0.03369140625,
"std_cosine_similarity": 0.109375,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.90625,
"mean_dimension_correlation": 0.254237837344408,
"std_dimension_correlation": 0.16181929189675745,
"linear_cka": 0.5859375
},
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
"mse": 1.421875,
"mean_cosine_similarity": -0.0284423828125,
"std_cosine_similarity": 0.10888671875,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.890625,
"mean_dimension_correlation": 0.25683254674077033,
"std_dimension_correlation": 0.16029215327593901,
"linear_cka": 0.57421875
},
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 1.4140625,
"mean_cosine_similarity": -0.0252685546875,
"std_cosine_similarity": 0.1083984375,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.875,
"mean_dimension_correlation": 0.25395019352436066,
"std_dimension_correlation": 0.15926056622745546,
"linear_cka": 0.578125
},
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
"mse": 1.421875,
"mean_cosine_similarity": -0.03271484375,
"std_cosine_similarity": 0.1064453125,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.796875,
"mean_dimension_correlation": 0.24886183738708495,
"std_dimension_correlation": 0.15849261736593726,
"linear_cka": 0.55859375
},
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 1.421875,
"mean_cosine_similarity": -0.033203125,
"std_cosine_similarity": 0.109375,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.890625,
"mean_dimension_correlation": 0.256584095954895,
"std_dimension_correlation": 0.15873214025442897,
"linear_cka": 0.57421875
},
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 1.4296875,
"mean_cosine_similarity": -0.03369140625,
"std_cosine_similarity": 0.109375,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.90625,
"mean_dimension_correlation": 0.2542317323386669,
"std_dimension_correlation": 0.16183266276519212,
"linear_cka": 0.5859375
},
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.734375,
"mean_cosine_similarity": 0.65625,
"std_cosine_similarity": 0.28515625,
"mean_l2_distance": 37.25,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.6187647342681885,
"std_dimension_correlation": 0.11470426666838326,
"linear_cka": 0.984375
},
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
"mse": 0.7265625,
"mean_cosine_similarity": 0.66015625,
"std_cosine_similarity": 0.279296875,
"mean_l2_distance": 37.0,
"std_l2_distance": 19.25,
"mean_dimension_correlation": 0.6220208525657653,
"std_dimension_correlation": 0.11040039509848326,
"linear_cka": 0.984375
},
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.76171875,
"mean_cosine_similarity": 0.62890625,
"std_cosine_similarity": 0.302734375,
"mean_l2_distance": 38.75,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.592758321762085,
"std_dimension_correlation": 0.11886540241980308,
"linear_cka": 0.98046875
},
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
"mse": 0.74609375,
"mean_cosine_similarity": 0.63671875,
"std_cosine_similarity": 0.302734375,
"mean_l2_distance": 38.25,
"std_l2_distance": 20.25,
"mean_dimension_correlation": 0.6037769317626953,
"std_dimension_correlation": 0.11647753822253991,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
"mse": 1.4296875,
"mean_cosine_similarity": -0.0284423828125,
"std_cosine_similarity": 0.10888671875,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.890625,
"mean_dimension_correlation": 0.25684744566679,
"std_dimension_correlation": 0.16032274573798164,
"linear_cka": 0.57421875
},
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.734375,
"mean_cosine_similarity": 0.65625,
"std_cosine_similarity": 0.28515625,
"mean_l2_distance": 37.25,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.6187384128570557,
"std_dimension_correlation": 0.11471572089316741,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.734375,
"mean_cosine_similarity": 0.6484375,
"std_cosine_similarity": 0.30078125,
"mean_l2_distance": 37.5,
"std_l2_distance": 20.375,
"mean_dimension_correlation": 0.6119367599487304,
"std_dimension_correlation": 0.1157440646478159,
"linear_cka": 0.99609375
},
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
"mse": 0.75390625,
"mean_cosine_similarity": 0.63671875,
"std_cosine_similarity": 0.30078125,
"mean_l2_distance": 38.25,
"std_l2_distance": 20.375,
"mean_dimension_correlation": 0.5996460914611816,
"std_dimension_correlation": 0.11944124129277625,
"linear_cka": 0.98046875
},
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.70703125,
"mean_cosine_similarity": 0.67578125,
"std_cosine_similarity": 0.27734375,
"mean_l2_distance": 36.0,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.638215160369873,
"std_dimension_correlation": 0.10975611081697591,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 1.4140625,
"mean_cosine_similarity": -0.0252685546875,
"std_cosine_similarity": 0.1083984375,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.875,
"mean_dimension_correlation": 0.25395837128162385,
"std_dimension_correlation": 0.15926372552177567,
"linear_cka": 0.578125
},
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
"mse": 0.7265625,
"mean_cosine_similarity": 0.66015625,
"std_cosine_similarity": 0.279296875,
"mean_l2_distance": 37.0,
"std_l2_distance": 19.25,
"mean_dimension_correlation": 0.6219659209251404,
"std_dimension_correlation": 0.11032863879333923,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.734375,
"mean_cosine_similarity": 0.6484375,
"std_cosine_similarity": 0.30078125,
"mean_l2_distance": 37.5,
"std_l2_distance": 20.375,
"mean_dimension_correlation": 0.6119108200073242,
"std_dimension_correlation": 0.1157383378132106,
"linear_cka": 0.99609375
},
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.7578125,
"mean_cosine_similarity": 0.6328125,
"std_cosine_similarity": 0.298828125,
"mean_l2_distance": 38.5,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.5979020118713378,
"std_dimension_correlation": 0.1151705814719715,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
"mse": 0.71875,
"mean_cosine_similarity": 0.6640625,
"std_cosine_similarity": 0.28125,
"mean_l2_distance": 36.75,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.6274345874786377,
"std_dimension_correlation": 0.11253210388812478,
"linear_cka": 0.98046875
},
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
"mse": 1.4296875,
"mean_cosine_similarity": -0.03271484375,
"std_cosine_similarity": 0.1064453125,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.796875,
"mean_dimension_correlation": 0.24887723177671434,
"std_dimension_correlation": 0.15850834600861563,
"linear_cka": 0.55859375
},
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.76171875,
"mean_cosine_similarity": 0.62890625,
"std_cosine_similarity": 0.302734375,
"mean_l2_distance": 38.75,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.5927883148193359,
"std_dimension_correlation": 0.11887853166661289,
"linear_cka": 0.98046875
},
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
"mse": 0.75390625,
"mean_cosine_similarity": 0.63671875,
"std_cosine_similarity": 0.30078125,
"mean_l2_distance": 38.25,
"std_l2_distance": 20.375,
"mean_dimension_correlation": 0.5995779991149902,
"std_dimension_correlation": 0.1193691675179003,
"linear_cka": 0.98046875
},
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.7578125,
"mean_cosine_similarity": 0.6328125,
"std_cosine_similarity": 0.298828125,
"mean_l2_distance": 38.5,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.5978128433227539,
"std_dimension_correlation": 0.11512506347641102,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.75390625,
"mean_cosine_similarity": 0.62890625,
"std_cosine_similarity": 0.3046875,
"mean_l2_distance": 38.5,
"std_l2_distance": 20.625,
"mean_dimension_correlation": 0.5955796241760254,
"std_dimension_correlation": 0.11906185378925987,
"linear_cka": 0.98828125
},
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 1.4296875,
"mean_cosine_similarity": -0.033203125,
"std_cosine_similarity": 0.109375,
"mean_l2_distance": 72.5,
"std_l2_distance": 3.890625,
"mean_dimension_correlation": 0.2565764158964157,
"std_dimension_correlation": 0.1587071816624074,
"linear_cka": 0.57421875
},
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
"mse": 0.74609375,
"mean_cosine_similarity": 0.63671875,
"std_cosine_similarity": 0.302734375,
"mean_l2_distance": 38.25,
"std_l2_distance": 20.25,
"mean_dimension_correlation": 0.6037120819091797,
"std_dimension_correlation": 0.11639985412027169,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.70703125,
"mean_cosine_similarity": 0.67578125,
"std_cosine_similarity": 0.27734375,
"mean_l2_distance": 36.0,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.6382188320159912,
"std_dimension_correlation": 0.10972459865917429,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
"mse": 0.71875,
"mean_cosine_similarity": 0.6640625,
"std_cosine_similarity": 0.28125,
"mean_l2_distance": 36.75,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.6273346900939941,
"std_dimension_correlation": 0.1124933006393999,
"linear_cka": 0.98046875
},
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.75390625,
"mean_cosine_similarity": 0.62890625,
"std_cosine_similarity": 0.3046875,
"mean_l2_distance": 38.5,
"std_l2_distance": 20.625,
"mean_dimension_correlation": 0.5955384254455567,
"std_dimension_correlation": 0.11905433194805992,
"linear_cka": 0.98828125
},
"avg_mse": 0.9674479166666666,
"std_mse": 0.32276016873074226,
"avg_mean_cosine_similarity": 0.4210286458333333,
"std_mean_cosine_similarity": 0.31965592389258923,
"avg_std_cosine_similarity": 0.23173828125,
"std_std_cosine_similarity": 0.08757208624967457,
"avg_mean_l2_distance": 49.28333333333333,
"std_mean_l2_distance": 16.43189648890907,
"avg_std_l2_distance": 14.598958333333334,
"std_std_l2_distance": 7.594291533045653,
"avg_mean_dimension_correlation": 0.49188637080291897,
"std_mean_dimension_correlation": 0.16857047305704442,
"avg_std_dimension_correlation": 0.1300404178186724,
"std_std_dimension_correlation": 0.021172306029780062,
"avg_linear_cka": 0.8479166666666667,
"std_linear_cka": 0.19363585907609904
}
}
}