{ "epoch": 1, "n_tokens": 8003584, "global_step": 7816, "training_metrics": { "train/loss": 2.546875, "train/contrastive": 2.453125, "train/recons_loss": 0.5703125, "train/balance_loss": 3.84375, "train/balance_loss_contrastive": 2.84375, "train/balance_loss_recons": 1.0078125, "train/contrastive_std": 3.359375, "train/recons_std": 0.0703125, "train/contrastive_min": 0.083984375, "train/contrastive_max": 7.125, "train/recons_min": 0.48828125, "train/recons_max": 0.671875, "train/Qwen3_0.6B_layer_2": 0.671875, "train/Qwen3_0.6B_layer_4": 0.54296875, "train/Qwen3_1.7B_layer_2": 0.52734375, "train/Qwen3_1.7B_layer_4": 0.640625, "train/Qwen3_4B_layer_2": 0.48828125, "train/Qwen3_4B_layer_4": 0.5625, "train/contrastives": null, "train/epoch": 1, "train/n_tokens": 8003584, "train/step": 7816 }, "eval_metrics": { "global_step": 7816, "n_tokens": 8003584, "kl_divergence": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 6.801623344421387, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 6.516300201416016, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 6.550345420837402, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 6.498440742492676, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 6.312735080718994, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.4551262855529785, "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 2.2260851860046387, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.1856892108917236, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.254146099090576, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.229769468307495, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.2037243843078613, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.2896828651428223, "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.400465965270996, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 5.9340386390686035, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 5.794930458068848, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 5.900982856750488, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.348906517028809, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.4423675537109375, "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 2.5666661262512207, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.535998821258545, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.4926912784576416, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.476747989654541, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.408336877822876, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.6492466926574707, "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 2.4851021766662598, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.303314685821533, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.0016140937805176, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.184553384780884, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.121729850769043, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.000966787338257, "Qwen3_4B_layer_2_to_uniform": 10.104096412658691, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 3.442514419555664, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.1136765480041504, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 2.937788486480713, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.0111327171325684, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.0196948051452637, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 2.7799510955810547, "Qwen3_4B_layer_4_to_uniform": 10.104096412658691 }, "mae_hidden_states": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 1.2630091905593872, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 1.2069993019104004, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 1.2386506795883179, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 1.2585456371307373, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 1.212580919265747, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 1.2229262590408325, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 1.0233924388885498, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 0.9251772165298462, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 0.9622151255607605, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 0.9760592579841614, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 0.9428697824478149, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 0.9486178159713745, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 1.0079174041748047, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 0.9031265377998352, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 0.9057611227035522, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 0.9231780767440796, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 0.9179145097732544, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 0.9312993884086609, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 1.2595539093017578, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.169715166091919, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.1957802772521973, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.1877433061599731, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.179739236831665, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.1788952350616455, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 1.0426846742630005, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 0.9591526985168457, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 0.9619539380073547, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 0.9698508977890015, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 0.9279893636703491, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 0.9385145902633667, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 1.1462980508804321, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.0632051229476929, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.0799243450164795, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.0858067274093628, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.0611412525177002, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.0519263744354248 }, "alignment": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.388671875, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.15625, "mean_l2_distance": 19.875, "std_l2_distance": 12.375, "mean_dimension_correlation": 0.890447998046875, "std_dimension_correlation": 0.03419125987740356, "linear_cka": 0.96484375 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.39453125, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.162109375, "mean_l2_distance": 20.125, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.8867477416992188, "std_dimension_correlation": 0.035491939390515204, "linear_cka": 0.96484375 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.39453125, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.154296875, "mean_l2_distance": 20.125, "std_l2_distance": 12.1875, "mean_dimension_correlation": 0.889697265625, "std_dimension_correlation": 0.03374281347550432, "linear_cka": 0.96484375 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.390625, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1591796875, "mean_l2_distance": 20.0, "std_l2_distance": 12.5, "mean_dimension_correlation": 0.8883514404296875, "std_dimension_correlation": 0.035164283126066044, "linear_cka": 0.96484375 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.388671875, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1591796875, "mean_l2_distance": 20.0, "std_l2_distance": 12.4375, "mean_dimension_correlation": 0.8896194458007812, "std_dimension_correlation": 0.03421083254072828, "linear_cka": 0.96484375 }, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.388671875, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.15625, "mean_l2_distance": 19.875, "std_l2_distance": 12.375, "mean_dimension_correlation": 0.8904556274414063, "std_dimension_correlation": 0.034210556225841876, "linear_cka": 0.96484375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.26953125, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.1513671875, "mean_l2_distance": 13.8125, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.923016357421875, "std_dimension_correlation": 0.029236331051580345, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.263671875, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.150390625, "mean_l2_distance": 13.5625, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.9244888305664063, "std_dimension_correlation": 0.02919239611161659, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.271484375, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.16015625, "mean_l2_distance": 13.875, "std_l2_distance": 13.0625, "mean_dimension_correlation": 0.9205032348632812, "std_dimension_correlation": 0.029844860484086543, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.267578125, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.158203125, "mean_l2_distance": 13.6875, "std_l2_distance": 13.0, "mean_dimension_correlation": 0.9218185424804688, "std_dimension_correlation": 0.030954341854338954, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 0.39453125, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.162109375, "mean_l2_distance": 20.125, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.8868682861328125, "std_dimension_correlation": 0.03559183889902671, "linear_cka": 0.96484375 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.26953125, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.1513671875, "mean_l2_distance": 13.8125, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.9229568481445313, "std_dimension_correlation": 0.029229316660619842, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.2578125, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.16015625, "mean_l2_distance": 13.25, "std_l2_distance": 13.25, "mean_dimension_correlation": 0.923333740234375, "std_dimension_correlation": 0.030134410337098863, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.26171875, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.16015625, "mean_l2_distance": 13.4375, "std_l2_distance": 13.25, "mean_dimension_correlation": 0.9219314575195312, "std_dimension_correlation": 0.03136389625872561, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.25390625, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.1484375, "mean_l2_distance": 13.0625, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.92755126953125, "std_dimension_correlation": 0.02898992593261031, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.39453125, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.154296875, "mean_l2_distance": 20.125, "std_l2_distance": 12.1875, "mean_dimension_correlation": 0.8896469116210938, "std_dimension_correlation": 0.03377379140546021, "linear_cka": 0.96484375 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.263671875, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.150390625, "mean_l2_distance": 13.5625, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.9245574951171875, "std_dimension_correlation": 0.029099754782990043, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.2578125, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.16015625, "mean_l2_distance": 13.25, "std_l2_distance": 13.25, "mean_dimension_correlation": 0.9233123779296875, "std_dimension_correlation": 0.030156395218800952, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.263671875, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.1572265625, "mean_l2_distance": 13.5625, "std_l2_distance": 13.0625, "mean_dimension_correlation": 0.9226715087890625, "std_dimension_correlation": 0.02929662688468137, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.255859375, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.1533203125, "mean_l2_distance": 13.125, "std_l2_distance": 12.8125, "mean_dimension_correlation": 0.9262313842773438, "std_dimension_correlation": 0.029011160291782537, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 0.390625, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1591796875, "mean_l2_distance": 20.0, "std_l2_distance": 12.5, "mean_dimension_correlation": 0.8883377075195312, "std_dimension_correlation": 0.03512599620173197, "linear_cka": 0.96484375 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.271484375, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.16015625, "mean_l2_distance": 13.875, "std_l2_distance": 13.0625, "mean_dimension_correlation": 0.9205001831054688, "std_dimension_correlation": 0.02990616928878693, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.26171875, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.16015625, "mean_l2_distance": 13.4375, "std_l2_distance": 13.25, "mean_dimension_correlation": 0.922039794921875, "std_dimension_correlation": 0.03143896607512693, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.263671875, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.1572265625, "mean_l2_distance": 13.5625, "std_l2_distance": 13.0625, "mean_dimension_correlation": 0.9226806640625, "std_dimension_correlation": 0.029339070768690877, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.24609375, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.1591796875, "mean_l2_distance": 12.625, "std_l2_distance": 13.375, "mean_dimension_correlation": 0.9257278442382812, "std_dimension_correlation": 0.030489491126206747, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 0.388671875, "mean_cosine_similarity": 0.89453125, "std_cosine_similarity": 0.1591796875, "mean_l2_distance": 20.0, "std_l2_distance": 12.4375, "mean_dimension_correlation": 0.8896011352539063, "std_dimension_correlation": 0.034245117741804325, "linear_cka": 0.96484375 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.267578125, "mean_cosine_similarity": 0.93359375, "std_cosine_similarity": 0.158203125, "mean_l2_distance": 13.6875, "std_l2_distance": 13.0, "mean_dimension_correlation": 0.9218338012695313, "std_dimension_correlation": 0.03096110466803191, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.25390625, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.1484375, "mean_l2_distance": 13.0625, "std_l2_distance": 12.625, "mean_dimension_correlation": 0.9275863647460938, "std_dimension_correlation": 0.029019101935420444, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.255859375, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.1533203125, "mean_l2_distance": 13.125, "std_l2_distance": 12.8125, "mean_dimension_correlation": 0.9262100219726562, "std_dimension_correlation": 0.029023808376502022, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.24609375, "mean_cosine_similarity": 0.9375, "std_cosine_similarity": 0.1591796875, "mean_l2_distance": 12.625, "std_l2_distance": 13.375, "mean_dimension_correlation": 0.9256805419921875, "std_dimension_correlation": 0.030472261122601613, "linear_cka": 0.984375 }, "avg_mse": 0.3045572916666667, "std_mse": 0.061728089131668586, "avg_mean_cosine_similarity": 0.9216145833333333, "std_mean_cosine_similarity": 0.01921444452676741, "avg_std_cosine_similarity": 0.156640625, "std_std_cosine_similarity": 0.003999537123283247, "avg_mean_l2_distance": 15.608333333333333, "std_mean_l2_distance": 3.137845819808374, "avg_std_l2_distance": 12.7875, "std_std_l2_distance": 0.34746102898982306, "avg_mean_dimension_correlation": 0.9121468607584635, "std_mean_dimension_correlation": 0.016489903679962933, "avg_std_dimension_correlation": 0.03143026060381273, "std_std_dimension_correlation": 0.0023529554482308356, "avg_linear_cka": 0.9778645833333334, "std_linear_cka": 0.009207119546699838 } } }