{ "epoch": 1, "n_tokens": 4001792, "global_step": 3908, "training_metrics": { "train/loss": 2.5, "train/contrastive": 2.40625, "train/recons_loss": 0.671875, "train/balance_loss": 3.828125, "train/balance_loss_contrastive": 2.78125, "train/balance_loss_recons": 1.046875, "train/contrastive_std": 3.265625, "train/recons_std": 0.1513671875, "train/contrastive_min": 0.1162109375, "train/contrastive_max": 6.9375, "train/recons_min": 0.55859375, "train/recons_max": 0.96484375, "train/Qwen3_0.6B_layer_2": 0.96484375, "train/Qwen3_0.6B_layer_4": 0.6015625, "train/Qwen3_1.7B_layer_2": 0.58203125, "train/Qwen3_1.7B_layer_4": 0.69140625, "train/Qwen3_4B_layer_2": 0.55859375, "train/Qwen3_4B_layer_4": 0.625, "train/contrastives": null, "train/epoch": 1, "train/n_tokens": 4001792, "train/step": 3908 }, "eval_metrics": { "global_step": 3908, "n_tokens": 4001792, "kl_divergence": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 10.677733421325684, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 10.070417404174805, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 10.500988960266113, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 10.254755973815918, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 10.141581535339355, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 10.209218978881836, "Qwen3_0.6B_layer_2_to_uniform": 9.070773124694824, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 6.944426536560059, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 2.526094675064087, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 2.448215961456299, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 2.5273706912994385, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 2.518568515777588, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 2.5949084758758545, "Qwen3_0.6B_layer_4_to_uniform": 9.070773124694824, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 10.105497360229492, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 6.14721155166626, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 6.3550543785095215, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 6.340244293212891, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 6.490333557128906, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 6.536875247955322, "Qwen3_1.7B_layer_2_to_uniform": 9.88111686706543, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 9.303935050964355, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 2.7055323123931885, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 2.6092498302459717, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 2.6337990760803223, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 2.66951322555542, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 2.742098569869995, "Qwen3_1.7B_layer_4_to_uniform": 9.88111686706543, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 8.335909843444824, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 2.7109298706054688, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 2.387141704559326, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 2.432076930999756, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 2.466850519180298, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 2.3831467628479004, "Qwen3_4B_layer_2_to_uniform": 10.104096412658691, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 8.078448295593262, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 3.5085513591766357, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 3.5161335468292236, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 3.7921173572540283, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 3.47990345954895, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 3.2069830894470215, "Qwen3_4B_layer_4_to_uniform": 10.104096412658691 }, "mae_hidden_states": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_2": 9.193017959594727, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": 5.069667816162109, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": 5.365467071533203, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": 5.159329414367676, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": 5.38785982131958, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": 5.0614728927612305, "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": 5.8135833740234375, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_4": 1.0534567832946777, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": 1.0686990022659302, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": 1.093401312828064, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": 1.055011510848999, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": 1.0553429126739502, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": 8.750565528869629, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": 1.014384150505066, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_2": 1.0321710109710693, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": 1.048608422279358, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": 1.0492010116577148, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": 1.053753137588501, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": 5.545437812805176, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": 1.3332059383392334, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": 1.340571641921997, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_4": 1.3302825689315796, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": 1.3458808660507202, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": 1.3539550304412842, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": 7.418992042541504, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": 1.1341036558151245, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": 1.0986826419830322, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": 1.1034446954727173, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_2": 1.0604543685913086, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": 1.0890880823135376, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": 6.221832275390625, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": 1.2409595251083374, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": 1.2512269020080566, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": 1.2450522184371948, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": 1.2315609455108643, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_4": 1.2183749675750732 }, "alignment": { "Qwen3_0.6B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 1.3515625, "mean_cosine_similarity": 0.052001953125, "std_cosine_similarity": 0.19140625, "mean_l2_distance": 69.5, "std_l2_distance": 7.3125, "mean_dimension_correlation": 0.46465563774108887, "std_dimension_correlation": 0.1362067287833464, "linear_cka": 0.5859375 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 1.34375, "mean_cosine_similarity": 0.056396484375, "std_cosine_similarity": 0.19140625, "mean_l2_distance": 69.0, "std_l2_distance": 7.34375, "mean_dimension_correlation": 0.46726187616586684, "std_dimension_correlation": 0.13268670396178475, "linear_cka": 0.578125 }, "Qwen3_0.6B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 1.34375, "mean_cosine_similarity": 0.055908203125, "std_cosine_similarity": 0.1904296875, "mean_l2_distance": 69.0, "std_l2_distance": 7.34375, "mean_dimension_correlation": 0.4647917509078979, "std_dimension_correlation": 0.13471787446655337, "linear_cka": 0.578125 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.056640625, "std_cosine_similarity": 0.1884765625, "mean_l2_distance": 69.0, "std_l2_distance": 7.21875, "mean_dimension_correlation": 0.4657045602798462, "std_dimension_correlation": 0.13361017606712636, "linear_cka": 0.57421875 }, "Qwen3_0.6B_layer_2_to_Qwen3_4B_layer_4": { "mse": 1.34375, "mean_cosine_similarity": 0.05615234375, "std_cosine_similarity": 0.1904296875, "mean_l2_distance": 69.0, "std_l2_distance": 7.3125, "mean_dimension_correlation": 0.4670211374759674, "std_dimension_correlation": 0.13379253598505308, "linear_cka": 0.57421875 }, "Qwen3_0.6B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.359375, "mean_cosine_similarity": 0.052001953125, "std_cosine_similarity": 0.19140625, "mean_l2_distance": 69.5, "std_l2_distance": 7.3125, "mean_dimension_correlation": 0.4646653652191162, "std_dimension_correlation": 0.13617385784126732, "linear_cka": 0.5859375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.5, "mean_cosine_similarity": 0.8046875, "std_cosine_similarity": 0.255859375, "mean_l2_distance": 25.5, "std_l2_distance": 19.375, "mean_dimension_correlation": 0.7786048889160156, "std_dimension_correlation": 0.079747066045607, "linear_cka": 0.984375 }, "Qwen3_0.6B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.498046875, "mean_cosine_similarity": 0.8046875, "std_cosine_similarity": 0.25390625, "mean_l2_distance": 25.375, "std_l2_distance": 19.25, "mean_dimension_correlation": 0.7792343139648438, "std_dimension_correlation": 0.07860444177664169, "linear_cka": 0.98828125 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.51953125, "mean_cosine_similarity": 0.78515625, "std_cosine_similarity": 0.275390625, "mean_l2_distance": 26.5, "std_l2_distance": 20.25, "mean_dimension_correlation": 0.7631843566894532, "std_dimension_correlation": 0.08458475161101357, "linear_cka": 0.98828125 }, "Qwen3_0.6B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.5078125, "mean_cosine_similarity": 0.79296875, "std_cosine_similarity": 0.2734375, "mean_l2_distance": 26.0, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.7681190490722656, "std_dimension_correlation": 0.08387350384855204, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.056396484375, "std_cosine_similarity": 0.19140625, "mean_l2_distance": 69.0, "std_l2_distance": 7.34375, "mean_dimension_correlation": 0.46729940325021746, "std_dimension_correlation": 0.13270665905666312, "linear_cka": 0.578125 }, "Qwen3_1.7B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.5, "mean_cosine_similarity": 0.8046875, "std_cosine_similarity": 0.255859375, "mean_l2_distance": 25.5, "std_l2_distance": 19.375, "mean_dimension_correlation": 0.7785774230957031, "std_dimension_correlation": 0.07977299719796638, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.49609375, "mean_cosine_similarity": 0.796875, "std_cosine_similarity": 0.2734375, "mean_l2_distance": 25.25, "std_l2_distance": 20.375, "mean_dimension_correlation": 0.7738082885742188, "std_dimension_correlation": 0.08244148252527034, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_2": { "mse": 0.50390625, "mean_cosine_similarity": 0.79296875, "std_cosine_similarity": 0.275390625, "mean_l2_distance": 25.75, "std_l2_distance": 20.5, "mean_dimension_correlation": 0.7685455322265625, "std_dimension_correlation": 0.08636337823761847, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.48046875, "mean_cosine_similarity": 0.8125, "std_cosine_similarity": 0.251953125, "mean_l2_distance": 24.5, "std_l2_distance": 19.375, "mean_dimension_correlation": 0.789703369140625, "std_dimension_correlation": 0.07704774466213117, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.34375, "mean_cosine_similarity": 0.055908203125, "std_cosine_similarity": 0.1904296875, "mean_l2_distance": 69.0, "std_l2_distance": 7.34375, "mean_dimension_correlation": 0.4647957801818848, "std_dimension_correlation": 0.1347461643666133, "linear_cka": 0.578125 }, "Qwen3_1.7B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.49609375, "mean_cosine_similarity": 0.8046875, "std_cosine_similarity": 0.25390625, "mean_l2_distance": 25.375, "std_l2_distance": 19.25, "mean_dimension_correlation": 0.779193115234375, "std_dimension_correlation": 0.07862977772942846, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.4921875, "mean_cosine_similarity": 0.796875, "std_cosine_similarity": 0.2734375, "mean_l2_distance": 25.25, "std_l2_distance": 20.375, "mean_dimension_correlation": 0.773846435546875, "std_dimension_correlation": 0.08246401911605972, "linear_cka": 0.98828125 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.5078125, "mean_cosine_similarity": 0.79296875, "std_cosine_similarity": 0.271484375, "mean_l2_distance": 26.0, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.7682723999023438, "std_dimension_correlation": 0.08173679476643078, "linear_cka": 0.984375 }, "Qwen3_1.7B_layer_4_to_Qwen3_4B_layer_4": { "mse": 0.486328125, "mean_cosine_similarity": 0.80859375, "std_cosine_similarity": 0.255859375, "mean_l2_distance": 24.875, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.7830284118652344, "std_dimension_correlation": 0.07756386958443834, "linear_cka": 0.98046875 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_2": { "mse": 1.359375, "mean_cosine_similarity": 0.056640625, "std_cosine_similarity": 0.1884765625, "mean_l2_distance": 69.0, "std_l2_distance": 7.21875, "mean_dimension_correlation": 0.46567630767822266, "std_dimension_correlation": 0.13364195702919346, "linear_cka": 0.57421875 }, "Qwen3_4B_layer_2_to_Qwen3_0.6B_layer_4": { "mse": 0.51953125, "mean_cosine_similarity": 0.78515625, "std_cosine_similarity": 0.275390625, "mean_l2_distance": 26.5, "std_l2_distance": 20.25, "mean_dimension_correlation": 0.7631195068359375, "std_dimension_correlation": 0.08451723229099471, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_2": { "mse": 0.50390625, "mean_cosine_similarity": 0.79296875, "std_cosine_similarity": 0.275390625, "mean_l2_distance": 25.75, "std_l2_distance": 20.5, "mean_dimension_correlation": 0.7685020446777344, "std_dimension_correlation": 0.08637723380547804, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_2_to_Qwen3_1.7B_layer_4": { "mse": 0.5078125, "mean_cosine_similarity": 0.79296875, "std_cosine_similarity": 0.271484375, "mean_l2_distance": 26.0, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.7681938171386719, "std_dimension_correlation": 0.08170402844520411, "linear_cka": 0.984375 }, "Qwen3_4B_layer_2_to_Qwen3_4B_layer_4": { "mse": 0.5, "mean_cosine_similarity": 0.7890625, "std_cosine_similarity": 0.27734375, "mean_l2_distance": 25.5, "std_l2_distance": 20.75, "mean_dimension_correlation": 0.7680191040039063, "std_dimension_correlation": 0.08532466419571123, "linear_cka": 0.98828125 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_2": { "mse": 1.3515625, "mean_cosine_similarity": 0.05615234375, "std_cosine_similarity": 0.1904296875, "mean_l2_distance": 69.0, "std_l2_distance": 7.3125, "mean_dimension_correlation": 0.4670826017856598, "std_dimension_correlation": 0.13384197305399426, "linear_cka": 0.57421875 }, "Qwen3_4B_layer_4_to_Qwen3_0.6B_layer_4": { "mse": 0.5078125, "mean_cosine_similarity": 0.79296875, "std_cosine_similarity": 0.2734375, "mean_l2_distance": 26.0, "std_l2_distance": 20.125, "mean_dimension_correlation": 0.7681541442871094, "std_dimension_correlation": 0.08380469982339137, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_2": { "mse": 0.48046875, "mean_cosine_similarity": 0.8125, "std_cosine_similarity": 0.251953125, "mean_l2_distance": 24.5, "std_l2_distance": 19.375, "mean_dimension_correlation": 0.7896865844726563, "std_dimension_correlation": 0.07704049416139637, "linear_cka": 0.984375 }, "Qwen3_4B_layer_4_to_Qwen3_1.7B_layer_4": { "mse": 0.48828125, "mean_cosine_similarity": 0.80859375, "std_cosine_similarity": 0.255859375, "mean_l2_distance": 24.875, "std_l2_distance": 19.5, "mean_dimension_correlation": 0.7829521179199219, "std_dimension_correlation": 0.07745501980282286, "linear_cka": 0.98046875 }, "Qwen3_4B_layer_4_to_Qwen3_4B_layer_2": { "mse": 0.5, "mean_cosine_similarity": 0.7890625, "std_cosine_similarity": 0.27734375, "mean_l2_distance": 25.5, "std_l2_distance": 20.75, "mean_dimension_correlation": 0.7680793762207031, "std_dimension_correlation": 0.08533230341332358, "linear_cka": 0.98828125 }, "avg_mse": 0.783203125, "std_mse": 0.4008924066126565, "avg_mean_cosine_similarity": 0.5505045572916667, "std_mean_cosine_similarity": 0.3501489988188365, "avg_std_cosine_similarity": 0.24108072916666667, "std_std_cosine_similarity": 0.036733753214407784, "avg_mean_l2_distance": 40.05, "std_mean_l2_distance": 20.54668464740723, "avg_std_l2_distance": 15.74375, "std_std_l2_distance": 5.980928892460323, "avg_mean_dimension_correlation": 0.6713259566823642, "std_mean_dimension_correlation": 0.14540630815784578, "avg_std_dimension_correlation": 0.09921700445503585, "std_std_dimension_correlation": 0.024890230217464338, "avg_linear_cka": 0.85, "std_linear_cka": 0.19227216969593736 } } }