universal-saes_2025-10-01_19-16-33 / evaluation /metrics_tokens_4001792.json
hanspeterlyngsoeraaschoujensen's picture
Upload folder using huggingface_hub
8061c39 verified
{
"epoch": 1,
"n_tokens": 4001792,
"global_step": 3908,
"training_metrics": {
"train/loss": 2.5,
"train/contrastive": 2.40625,
"train/recons_loss": 0.671875,
"train/balance_loss": 3.828125,
"train/balance_loss_contrastive": 2.78125,
"train/balance_loss_recons": 1.046875,
"train/contrastive_std": 3.265625,
"train/recons_std": 0.1513671875,
"train/contrastive_min": 0.1162109375,
"train/contrastive_max": 6.9375,
"train/recons_min": 0.55859375,
"train/recons_max": 0.96484375,
"train/Qwen3_0.6B_layer_2": 0.96484375,
"train/Qwen3_0.6B_layer_4": 0.6015625,
"train/Qwen3_1.7B_layer_2": 0.58203125,
"train/Qwen3_1.7B_layer_4": 0.69140625,
"train/Qwen3_4B_layer_2": 0.55859375,
"train/Qwen3_4B_layer_4": 0.625,
"train/contrastives": null,
"train/epoch": 1,
"train/n_tokens": 4001792,
"train/step": 3908
},
"eval_metrics": {
"global_step": 3908,
"n_tokens": 4001792,
"kl_divergence": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 10.677733421325684,
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 10.070417404174805,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 10.500988960266113,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 10.254755973815918,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 10.141581535339355,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 10.209218978881836,
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 6.944426536560059,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.526094675064087,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.448215961456299,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.5273706912994385,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.518568515777588,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.5949084758758545,
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.105497360229492,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.14721155166626,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.3550543785095215,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.340244293212891,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.490333557128906,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.536875247955322,
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 9.303935050964355,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.7055323123931885,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6092498302459717,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6337990760803223,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.66951322555542,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.742098569869995,
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 8.335909843444824,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.7109298706054688,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.387141704559326,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.432076930999756,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.466850519180298,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3831467628479004,
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 8.078448295593262,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.5085513591766357,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.5161335468292236,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7921173572540283,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.47990345954895,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.2069830894470215,
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
},
"mae_hidden_states": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.193017959594727,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.069667816162109,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.365467071533203,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.159329414367676,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 5.38785982131958,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 5.0614728927612305,
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 5.8135833740234375,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0534567832946777,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0686990022659302,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.093401312828064,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.055011510848999,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0553429126739502,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.750565528869629,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.014384150505066,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0321710109710693,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.048608422279358,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0492010116577148,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.053753137588501,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 5.545437812805176,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.3332059383392334,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.340571641921997,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3302825689315796,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3458808660507202,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3539550304412842,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.418992042541504,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.1341036558151245,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.0986826419830322,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1034446954727173,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.0604543685913086,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.0890880823135376,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.221832275390625,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2409595251083374,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2512269020080566,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2450522184371948,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2315609455108643,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2183749675750732
},
"alignment": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 1.3515625,
"mean_cosine_similarity": 0.052001953125,
"std_cosine_similarity": 0.19140625,
"mean_l2_distance": 69.5,
"std_l2_distance": 7.3125,
"mean_dimension_correlation": 0.46465563774108887,
"std_dimension_correlation": 0.1362067287833464,
"linear_cka": 0.5859375
},
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
"mse": 1.34375,
"mean_cosine_similarity": 0.056396484375,
"std_cosine_similarity": 0.19140625,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.34375,
"mean_dimension_correlation": 0.46726187616586684,
"std_dimension_correlation": 0.13268670396178475,
"linear_cka": 0.578125
},
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 1.34375,
"mean_cosine_similarity": 0.055908203125,
"std_cosine_similarity": 0.1904296875,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.34375,
"mean_dimension_correlation": 0.4647917509078979,
"std_dimension_correlation": 0.13471787446655337,
"linear_cka": 0.578125
},
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
"mse": 1.3515625,
"mean_cosine_similarity": 0.056640625,
"std_cosine_similarity": 0.1884765625,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.21875,
"mean_dimension_correlation": 0.4657045602798462,
"std_dimension_correlation": 0.13361017606712636,
"linear_cka": 0.57421875
},
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 1.34375,
"mean_cosine_similarity": 0.05615234375,
"std_cosine_similarity": 0.1904296875,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.3125,
"mean_dimension_correlation": 0.4670211374759674,
"std_dimension_correlation": 0.13379253598505308,
"linear_cka": 0.57421875
},
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 1.359375,
"mean_cosine_similarity": 0.052001953125,
"std_cosine_similarity": 0.19140625,
"mean_l2_distance": 69.5,
"std_l2_distance": 7.3125,
"mean_dimension_correlation": 0.4646653652191162,
"std_dimension_correlation": 0.13617385784126732,
"linear_cka": 0.5859375
},
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.5,
"mean_cosine_similarity": 0.8046875,
"std_cosine_similarity": 0.255859375,
"mean_l2_distance": 25.5,
"std_l2_distance": 19.375,
"mean_dimension_correlation": 0.7786048889160156,
"std_dimension_correlation": 0.079747066045607,
"linear_cka": 0.984375
},
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
"mse": 0.498046875,
"mean_cosine_similarity": 0.8046875,
"std_cosine_similarity": 0.25390625,
"mean_l2_distance": 25.375,
"std_l2_distance": 19.25,
"mean_dimension_correlation": 0.7792343139648438,
"std_dimension_correlation": 0.07860444177664169,
"linear_cka": 0.98828125
},
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.51953125,
"mean_cosine_similarity": 0.78515625,
"std_cosine_similarity": 0.275390625,
"mean_l2_distance": 26.5,
"std_l2_distance": 20.25,
"mean_dimension_correlation": 0.7631843566894532,
"std_dimension_correlation": 0.08458475161101357,
"linear_cka": 0.98828125
},
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
"mse": 0.5078125,
"mean_cosine_similarity": 0.79296875,
"std_cosine_similarity": 0.2734375,
"mean_l2_distance": 26.0,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.7681190490722656,
"std_dimension_correlation": 0.08387350384855204,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
"mse": 1.3515625,
"mean_cosine_similarity": 0.056396484375,
"std_cosine_similarity": 0.19140625,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.34375,
"mean_dimension_correlation": 0.46729940325021746,
"std_dimension_correlation": 0.13270665905666312,
"linear_cka": 0.578125
},
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.5,
"mean_cosine_similarity": 0.8046875,
"std_cosine_similarity": 0.255859375,
"mean_l2_distance": 25.5,
"std_l2_distance": 19.375,
"mean_dimension_correlation": 0.7785774230957031,
"std_dimension_correlation": 0.07977299719796638,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.49609375,
"mean_cosine_similarity": 0.796875,
"std_cosine_similarity": 0.2734375,
"mean_l2_distance": 25.25,
"std_l2_distance": 20.375,
"mean_dimension_correlation": 0.7738082885742188,
"std_dimension_correlation": 0.08244148252527034,
"linear_cka": 0.98828125
},
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
"mse": 0.50390625,
"mean_cosine_similarity": 0.79296875,
"std_cosine_similarity": 0.275390625,
"mean_l2_distance": 25.75,
"std_l2_distance": 20.5,
"mean_dimension_correlation": 0.7685455322265625,
"std_dimension_correlation": 0.08636337823761847,
"linear_cka": 0.98828125
},
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.48046875,
"mean_cosine_similarity": 0.8125,
"std_cosine_similarity": 0.251953125,
"mean_l2_distance": 24.5,
"std_l2_distance": 19.375,
"mean_dimension_correlation": 0.789703369140625,
"std_dimension_correlation": 0.07704774466213117,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 1.34375,
"mean_cosine_similarity": 0.055908203125,
"std_cosine_similarity": 0.1904296875,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.34375,
"mean_dimension_correlation": 0.4647957801818848,
"std_dimension_correlation": 0.1347461643666133,
"linear_cka": 0.578125
},
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
"mse": 0.49609375,
"mean_cosine_similarity": 0.8046875,
"std_cosine_similarity": 0.25390625,
"mean_l2_distance": 25.375,
"std_l2_distance": 19.25,
"mean_dimension_correlation": 0.779193115234375,
"std_dimension_correlation": 0.07862977772942846,
"linear_cka": 0.98828125
},
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.4921875,
"mean_cosine_similarity": 0.796875,
"std_cosine_similarity": 0.2734375,
"mean_l2_distance": 25.25,
"std_l2_distance": 20.375,
"mean_dimension_correlation": 0.773846435546875,
"std_dimension_correlation": 0.08246401911605972,
"linear_cka": 0.98828125
},
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.5078125,
"mean_cosine_similarity": 0.79296875,
"std_cosine_similarity": 0.271484375,
"mean_l2_distance": 26.0,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.7682723999023438,
"std_dimension_correlation": 0.08173679476643078,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
"mse": 0.486328125,
"mean_cosine_similarity": 0.80859375,
"std_cosine_similarity": 0.255859375,
"mean_l2_distance": 24.875,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.7830284118652344,
"std_dimension_correlation": 0.07756386958443834,
"linear_cka": 0.98046875
},
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
"mse": 1.359375,
"mean_cosine_similarity": 0.056640625,
"std_cosine_similarity": 0.1884765625,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.21875,
"mean_dimension_correlation": 0.46567630767822266,
"std_dimension_correlation": 0.13364195702919346,
"linear_cka": 0.57421875
},
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.51953125,
"mean_cosine_similarity": 0.78515625,
"std_cosine_similarity": 0.275390625,
"mean_l2_distance": 26.5,
"std_l2_distance": 20.25,
"mean_dimension_correlation": 0.7631195068359375,
"std_dimension_correlation": 0.08451723229099471,
"linear_cka": 0.98828125
},
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
"mse": 0.50390625,
"mean_cosine_similarity": 0.79296875,
"std_cosine_similarity": 0.275390625,
"mean_l2_distance": 25.75,
"std_l2_distance": 20.5,
"mean_dimension_correlation": 0.7685020446777344,
"std_dimension_correlation": 0.08637723380547804,
"linear_cka": 0.98828125
},
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.5078125,
"mean_cosine_similarity": 0.79296875,
"std_cosine_similarity": 0.271484375,
"mean_l2_distance": 26.0,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.7681938171386719,
"std_dimension_correlation": 0.08170402844520411,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.5,
"mean_cosine_similarity": 0.7890625,
"std_cosine_similarity": 0.27734375,
"mean_l2_distance": 25.5,
"std_l2_distance": 20.75,
"mean_dimension_correlation": 0.7680191040039063,
"std_dimension_correlation": 0.08532466419571123,
"linear_cka": 0.98828125
},
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 1.3515625,
"mean_cosine_similarity": 0.05615234375,
"std_cosine_similarity": 0.1904296875,
"mean_l2_distance": 69.0,
"std_l2_distance": 7.3125,
"mean_dimension_correlation": 0.4670826017856598,
"std_dimension_correlation": 0.13384197305399426,
"linear_cka": 0.57421875
},
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
"mse": 0.5078125,
"mean_cosine_similarity": 0.79296875,
"std_cosine_similarity": 0.2734375,
"mean_l2_distance": 26.0,
"std_l2_distance": 20.125,
"mean_dimension_correlation": 0.7681541442871094,
"std_dimension_correlation": 0.08380469982339137,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.48046875,
"mean_cosine_similarity": 0.8125,
"std_cosine_similarity": 0.251953125,
"mean_l2_distance": 24.5,
"std_l2_distance": 19.375,
"mean_dimension_correlation": 0.7896865844726563,
"std_dimension_correlation": 0.07704049416139637,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
"mse": 0.48828125,
"mean_cosine_similarity": 0.80859375,
"std_cosine_similarity": 0.255859375,
"mean_l2_distance": 24.875,
"std_l2_distance": 19.5,
"mean_dimension_correlation": 0.7829521179199219,
"std_dimension_correlation": 0.07745501980282286,
"linear_cka": 0.98046875
},
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.5,
"mean_cosine_similarity": 0.7890625,
"std_cosine_similarity": 0.27734375,
"mean_l2_distance": 25.5,
"std_l2_distance": 20.75,
"mean_dimension_correlation": 0.7680793762207031,
"std_dimension_correlation": 0.08533230341332358,
"linear_cka": 0.98828125
},
"avg_mse": 0.783203125,
"std_mse": 0.4008924066126565,
"avg_mean_cosine_similarity": 0.5505045572916667,
"std_mean_cosine_similarity": 0.3501489988188365,
"avg_std_cosine_similarity": 0.24108072916666667,
"std_std_cosine_similarity": 0.036733753214407784,
"avg_mean_l2_distance": 40.05,
"std_mean_l2_distance": 20.54668464740723,
"avg_std_l2_distance": 15.74375,
"std_std_l2_distance": 5.980928892460323,
"avg_mean_dimension_correlation": 0.6713259566823642,
"std_mean_dimension_correlation": 0.14540630815784578,
"avg_std_dimension_correlation": 0.09921700445503585,
"std_std_dimension_correlation": 0.024890230217464338,
"avg_linear_cka": 0.85,
"std_linear_cka": 0.19227216969593736
}
}
}