universal-saes_2025-10-01_19-16-33 / evaluation /metrics_tokens_8003584.json
hanspeterlyngsoeraaschoujensen's picture
Upload folder using huggingface_hub
8061c39 verified
{
"epoch": 1,
"n_tokens": 8003584,
"global_step": 7816,
"training_metrics": {
"train/loss": 2.546875,
"train/contrastive": 2.453125,
"train/recons_loss": 0.5703125,
"train/balance_loss": 3.84375,
"train/balance_loss_contrastive": 2.84375,
"train/balance_loss_recons": 1.0078125,
"train/contrastive_std": 3.359375,
"train/recons_std": 0.0703125,
"train/contrastive_min": 0.083984375,
"train/contrastive_max": 7.125,
"train/recons_min": 0.48828125,
"train/recons_max": 0.671875,
"train/Qwen3_0.6B_layer_2": 0.671875,
"train/Qwen3_0.6B_layer_4": 0.54296875,
"train/Qwen3_1.7B_layer_2": 0.52734375,
"train/Qwen3_1.7B_layer_4": 0.640625,
"train/Qwen3_4B_layer_2": 0.48828125,
"train/Qwen3_4B_layer_4": 0.5625,
"train/contrastives": null,
"train/epoch": 1,
"train/n_tokens": 8003584,
"train/step": 7816
},
"eval_metrics": {
"global_step": 7816,
"n_tokens": 8003584,
"kl_divergence": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.801623344421387,
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.516300201416016,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 6.550345420837402,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.498440742492676,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.312735080718994,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.4551262855529785,
"Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.2260851860046387,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.1856892108917236,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.254146099090576,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.229769468307495,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2037243843078613,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.2896828651428223,
"Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.400465965270996,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.9340386390686035,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.794930458068848,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.900982856750488,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.348906517028809,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.4423675537109375,
"Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.5666661262512207,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.535998821258545,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.4926912784576416,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.476747989654541,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.408336877822876,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.6492466926574707,
"Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4851021766662598,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.303314685821533,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.0016140937805176,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.184553384780884,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.121729850769043,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.000966787338257,
"Qwen3_4B_layer_2_to_uniform": 10.104096412658691,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.442514419555664,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.1136765480041504,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.937788486480713,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0111327171325684,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.0196948051452637,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7799510955810547,
"Qwen3_4B_layer_4_to_uniform": 10.104096412658691
},
"mae_hidden_states": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.2630091905593872,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2069993019104004,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.2386506795883179,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.2585456371307373,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.212580919265747,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.2229262590408325,
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.0233924388885498,
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9251772165298462,
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9622151255607605,
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9760592579841614,
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9428697824478149,
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9486178159713745,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.0079174041748047,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9031265377998352,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9057611227035522,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9231780767440796,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9179145097732544,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9312993884086609,
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2595539093017578,
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.169715166091919,
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.1957802772521973,
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1877433061599731,
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.179739236831665,
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.1788952350616455,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.0426846742630005,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9591526985168457,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9619539380073547,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9698508977890015,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9279893636703491,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9385145902633667,
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.1462980508804321,
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0632051229476929,
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0799243450164795,
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0858067274093628,
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0611412525177002,
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0519263744354248
},
"alignment": {
"Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.388671875,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.15625,
"mean_l2_distance": 19.875,
"std_l2_distance": 12.375,
"mean_dimension_correlation": 0.890447998046875,
"std_dimension_correlation": 0.03419125987740356,
"linear_cka": 0.96484375
},
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": {
"mse": 0.39453125,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.162109375,
"mean_l2_distance": 20.125,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.8867477416992188,
"std_dimension_correlation": 0.035491939390515204,
"linear_cka": 0.96484375
},
"Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.39453125,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.154296875,
"mean_l2_distance": 20.125,
"std_l2_distance": 12.1875,
"mean_dimension_correlation": 0.889697265625,
"std_dimension_correlation": 0.03374281347550432,
"linear_cka": 0.96484375
},
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": {
"mse": 0.390625,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.1591796875,
"mean_l2_distance": 20.0,
"std_l2_distance": 12.5,
"mean_dimension_correlation": 0.8883514404296875,
"std_dimension_correlation": 0.035164283126066044,
"linear_cka": 0.96484375
},
"Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.388671875,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.1591796875,
"mean_l2_distance": 20.0,
"std_l2_distance": 12.4375,
"mean_dimension_correlation": 0.8896194458007812,
"std_dimension_correlation": 0.03421083254072828,
"linear_cka": 0.96484375
},
"Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 0.388671875,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.15625,
"mean_l2_distance": 19.875,
"std_l2_distance": 12.375,
"mean_dimension_correlation": 0.8904556274414063,
"std_dimension_correlation": 0.034210556225841876,
"linear_cka": 0.96484375
},
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.26953125,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.1513671875,
"mean_l2_distance": 13.8125,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.923016357421875,
"std_dimension_correlation": 0.029236331051580345,
"linear_cka": 0.984375
},
"Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": {
"mse": 0.263671875,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.150390625,
"mean_l2_distance": 13.5625,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.9244888305664063,
"std_dimension_correlation": 0.02919239611161659,
"linear_cka": 0.984375
},
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.271484375,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.16015625,
"mean_l2_distance": 13.875,
"std_l2_distance": 13.0625,
"mean_dimension_correlation": 0.9205032348632812,
"std_dimension_correlation": 0.029844860484086543,
"linear_cka": 0.984375
},
"Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": {
"mse": 0.267578125,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.158203125,
"mean_l2_distance": 13.6875,
"std_l2_distance": 13.0,
"mean_dimension_correlation": 0.9218185424804688,
"std_dimension_correlation": 0.030954341854338954,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": {
"mse": 0.39453125,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.162109375,
"mean_l2_distance": 20.125,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.8868682861328125,
"std_dimension_correlation": 0.03559183889902671,
"linear_cka": 0.96484375
},
"Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.26953125,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.1513671875,
"mean_l2_distance": 13.8125,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.9229568481445313,
"std_dimension_correlation": 0.029229316660619842,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.2578125,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.16015625,
"mean_l2_distance": 13.25,
"std_l2_distance": 13.25,
"mean_dimension_correlation": 0.923333740234375,
"std_dimension_correlation": 0.030134410337098863,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": {
"mse": 0.26171875,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.16015625,
"mean_l2_distance": 13.4375,
"std_l2_distance": 13.25,
"mean_dimension_correlation": 0.9219314575195312,
"std_dimension_correlation": 0.03136389625872561,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.25390625,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.1484375,
"mean_l2_distance": 13.0625,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.92755126953125,
"std_dimension_correlation": 0.02898992593261031,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 0.39453125,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.154296875,
"mean_l2_distance": 20.125,
"std_l2_distance": 12.1875,
"mean_dimension_correlation": 0.8896469116210938,
"std_dimension_correlation": 0.03377379140546021,
"linear_cka": 0.96484375
},
"Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": {
"mse": 0.263671875,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.150390625,
"mean_l2_distance": 13.5625,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.9245574951171875,
"std_dimension_correlation": 0.029099754782990043,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.2578125,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.16015625,
"mean_l2_distance": 13.25,
"std_l2_distance": 13.25,
"mean_dimension_correlation": 0.9233123779296875,
"std_dimension_correlation": 0.030156395218800952,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.263671875,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.1572265625,
"mean_l2_distance": 13.5625,
"std_l2_distance": 13.0625,
"mean_dimension_correlation": 0.9226715087890625,
"std_dimension_correlation": 0.02929662688468137,
"linear_cka": 0.984375
},
"Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": {
"mse": 0.255859375,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.1533203125,
"mean_l2_distance": 13.125,
"std_l2_distance": 12.8125,
"mean_dimension_correlation": 0.9262313842773438,
"std_dimension_correlation": 0.029011160291782537,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": {
"mse": 0.390625,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.1591796875,
"mean_l2_distance": 20.0,
"std_l2_distance": 12.5,
"mean_dimension_correlation": 0.8883377075195312,
"std_dimension_correlation": 0.03512599620173197,
"linear_cka": 0.96484375
},
"Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": {
"mse": 0.271484375,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.16015625,
"mean_l2_distance": 13.875,
"std_l2_distance": 13.0625,
"mean_dimension_correlation": 0.9205001831054688,
"std_dimension_correlation": 0.02990616928878693,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": {
"mse": 0.26171875,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.16015625,
"mean_l2_distance": 13.4375,
"std_l2_distance": 13.25,
"mean_dimension_correlation": 0.922039794921875,
"std_dimension_correlation": 0.03143896607512693,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": {
"mse": 0.263671875,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.1572265625,
"mean_l2_distance": 13.5625,
"std_l2_distance": 13.0625,
"mean_dimension_correlation": 0.9226806640625,
"std_dimension_correlation": 0.029339070768690877,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": {
"mse": 0.24609375,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.1591796875,
"mean_l2_distance": 12.625,
"std_l2_distance": 13.375,
"mean_dimension_correlation": 0.9257278442382812,
"std_dimension_correlation": 0.030489491126206747,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": {
"mse": 0.388671875,
"mean_cosine_similarity": 0.89453125,
"std_cosine_similarity": 0.1591796875,
"mean_l2_distance": 20.0,
"std_l2_distance": 12.4375,
"mean_dimension_correlation": 0.8896011352539063,
"std_dimension_correlation": 0.034245117741804325,
"linear_cka": 0.96484375
},
"Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": {
"mse": 0.267578125,
"mean_cosine_similarity": 0.93359375,
"std_cosine_similarity": 0.158203125,
"mean_l2_distance": 13.6875,
"std_l2_distance": 13.0,
"mean_dimension_correlation": 0.9218338012695313,
"std_dimension_correlation": 0.03096110466803191,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": {
"mse": 0.25390625,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.1484375,
"mean_l2_distance": 13.0625,
"std_l2_distance": 12.625,
"mean_dimension_correlation": 0.9275863647460938,
"std_dimension_correlation": 0.029019101935420444,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": {
"mse": 0.255859375,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.1533203125,
"mean_l2_distance": 13.125,
"std_l2_distance": 12.8125,
"mean_dimension_correlation": 0.9262100219726562,
"std_dimension_correlation": 0.029023808376502022,
"linear_cka": 0.984375
},
"Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": {
"mse": 0.24609375,
"mean_cosine_similarity": 0.9375,
"std_cosine_similarity": 0.1591796875,
"mean_l2_distance": 12.625,
"std_l2_distance": 13.375,
"mean_dimension_correlation": 0.9256805419921875,
"std_dimension_correlation": 0.030472261122601613,
"linear_cka": 0.984375
},
"avg_mse": 0.3045572916666667,
"std_mse": 0.061728089131668586,
"avg_mean_cosine_similarity": 0.9216145833333333,
"std_mean_cosine_similarity": 0.01921444452676741,
"avg_std_cosine_similarity": 0.156640625,
"std_std_cosine_similarity": 0.003999537123283247,
"avg_mean_l2_distance": 15.608333333333333,
"std_mean_l2_distance": 3.137845819808374,
"avg_std_l2_distance": 12.7875,
"std_std_l2_distance": 0.34746102898982306,
"avg_mean_dimension_correlation": 0.9121468607584635,
"std_mean_dimension_correlation": 0.016489903679962933,
"avg_std_dimension_correlation": 0.03143026060381273,
"std_std_dimension_correlation": 0.0023529554482308356,
"avg_linear_cka": 0.9778645833333334,
"std_linear_cka": 0.009207119546699838
}
}
}