<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="RecA: Reconstruction Alignment Improves Unified Multimodal Models">
<meta property="og:title" content="RecA: Reconstruction Alignment Improves Unified Multimodal Models">
<meta property="og:description" content="Introducing RecA, a self-supervised training framework that aligns understanding and generation through image reconstruction at the semantic level.">
<meta property="og:url" content="https://horizonwind2004.github.io/RecA-Page">
<meta name="twitter:title" content="RecA: Reconstruction Alignment Improves Unified Multimodal Models">
<meta name="twitter:description" content="Introducing RecA, a self-supervised training framework that aligns understanding and generation through image reconstruction at the semantic level.">
<meta name="keywords" content="RecA, Vision-Language, Multimodal, Image Generation, AI, Machine Learning">
<title>RecA: Reconstruction Alignment Improves Unified Multimodal Models</title>
<link rel="icon" type="image/png" href="static/files/teaser/logo.png">
<link rel="shortcut icon" href="static/files/teaser/logo.png" type="image/png">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<script src="https://cdn.tailwindcss.com"></script>
<script>
tailwind.config = {
theme: {
extend: {
fontFamily: {
sans: ['Inter', 'sans-serif'],
},
colors: {
primary: {
50: '#f0f9ff',
100: '#e0f2fe',
200: '#bae6fd',
300: '#7dd3fc',
400: '#38bdf8',
500: '#0ea5e9',
600: '#0284c7',
700: '#0369a1',
800: '#075985',
900: '#0c4a6e',
},
secondary: {
50: '#f8fafc',
100: '#f1f5f9',
200: '#e2e8f0',
300: '#cbd5e1',
400: '#94a3b8',
500: '#64748b',
600: '#475569',
700: '#334155',
800: '#1e293b',
900: '#0f172a',
},
},
}
}
}
</script>
<style>
.gradient-text {
background: linear-gradient(90deg, #3b82f6, #8b5cf6);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
}
.hero-pattern {
background-image: url('static/files/teaser/DEMO.jpg');
background-size: cover;
background-position: center;
background-repeat: no-repeat;
position: relative;
}
.hero-pattern::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-color: rgba(0, 0, 0, 0.6);
}
.hero-content {
position: relative;
z-index: 10;
}
.card-hover {
transition: all 0.3s ease;
}
.card-hover:hover {
transform: translateY(-5px);
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);
}
.animate-float {
animation: float 6s ease-in-out infinite;
}
@keyframes float {
0% { transform: translateY(0px); }
50% { transform: translateY(-15px); }
100% { transform: translateY(0px); }
}
.text-shadow {
text-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
}
</style>
</head>
<body class="font-sans bg-gray-50 text-gray-800">
<!-- Navigation -->
<nav class="bg-white/80 backdrop-blur-md shadow-sm sticky top-0 z-50">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="flex justify-between h-16">
<div class="flex items-center">
<img src="static/files/teaser/logo.png" alt="RecA Logo" class="h-8 w-8 mr-2">
<span class="text-xl font-semibold text-gray-900">RecA</span>
</div>
<div class="hidden md:flex items-center space-x-8">
<a href="#abstract" class="text-gray-700 hover:text-primary-600 transition">Abstract</a>
<a href="#approach" class="text-gray-700 hover:text-primary-600 transition">Approach</a>
<a href="#results" class="text-gray-700 hover:text-primary-600 transition">Results</a>
<a href="#demo" class="text-gray-700 hover:text-primary-600 transition">Demo</a>
</div>
<div class="flex items-center md:hidden">
<!-- Mobile menu button -->
<button class="text-gray-500 hover:text-gray-900 focus:outline-none">
<svg class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M4 6h16M4 12h16M4 18h16"></path>
</svg>
</button>
</div>
</div>
</div>
</nav>
<!-- Hero Section with Demo Background -->
<section class="hero-pattern py-20">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 hero-content">
<div class="text-center">
<div class="flex justify-center items-center mb-6">
<img src="static/files/teaser/logo.png" alt="RecA Logo" class="h-20 w-20 mr-4 animate-float">
<h1 class="text-4xl md:text-5xl font-bold text-white text-shadow">
<span class="gradient-text">RecA</span>: Reconstruction Alignment Improves Unified Multimodal Models
</h1>
</div>
<h2 class="mt-4 text-xl md:text-2xl text-gray-200 max-w-4xl mx-auto text-shadow">
Unlocking the Massive Zero-shot Potential in Unified Multimodal Models through Self-supervised Learning
</h2>
<!-- Authors -->
<div class="mt-8 flex flex-wrap justify-center gap-4">
<div class="bg-white/90 rounded-lg shadow-sm px-6 py-3 flex items-center backdrop-blur-sm">
<div class="text-center">
<p class="text-sm font-medium text-gray-700">Ji Xie<sup>1</sup></p>
<p class="text-xs text-gray-500">UC Berkeley</p>
</div>
</div>
<div class="bg-white/90 rounded-lg shadow-sm px-6 py-3 flex items-center backdrop-blur-sm">
<div class="text-center">
<p class="text-sm font-medium text-gray-700">Trevor Darrell<sup>1</sup></p>
<p class="text-xs text-gray-500">UC Berkeley</p>
</div>
</div>
<div class="bg-white/90 rounded-lg shadow-sm px-6 py-3 flex items-center backdrop-blur-sm">
<div class="text-center">
<p class="text-sm font-medium text-gray-700">Luke Zettlemoyer<sup>2</sup></p>
<p class="text-xs text-gray-500">University of Washington</p>
</div>
</div>
<div class="bg-white/90 rounded-lg shadow-sm px-6 py-3 flex items-center backdrop-blur-sm">
<div class="text-center">
<p class="text-sm font-medium text-gray-700">XuDong Wang<sup>1*</sup></p>
<p class="text-xs text-gray-500">UC Berkeley</p>
</div>
</div>
</div>
<!-- Links -->
<div class="mt-10 flex flex-wrap justify-center gap-4">
<a href="https://arxiv.org/pdf/2412.17910" target="_blank" class="inline-flex items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-primary-600 hover:bg-primary-700 transition">
<i class="fas fa-file-pdf mr-2"></i> Paper
</a>
<a href="https://arxiv.org/abs/2412.17910" target="_blank" class="inline-flex items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-gray-800 hover:bg-gray-900 transition">
<i class="fas fa-book-open mr-2"></i> ArXiv
</a>
<a href="https://github.com/HorizonWind2004/RecA" target="_blank" class="inline-flex items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-gray-800 hover:bg-gray-900 transition">
<i class="fab fa-github mr-2"></i> Code/Models
</a>
<a href="https://huggingface.co/collections/sanaka87/reca-68ad2176380355a3dcedc068" target="_blank" class="inline-flex items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-yellow-600 hover:bg-yellow-700 transition">
<i class="fas fa-robot mr-2"></i> HF Models
</a>
<a href="https://huggingface.co/spaces/sanaka87/BAGEL-RecA" target="_blank" class="inline-flex items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-red-600 hover:bg-red-700 transition">
<i class="fas fa-play-circle mr-2"></i> Demo (BAGEL)
</a>
</div>
</div>
</div>
</section>
<!-- Demo Section (now empty since background is the demo) -->
<section id="demo" class="py-16 bg-white">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">Interactive Demo</h2>
<p class="mt-4 text-lg text-gray-600 max-w-3xl mx-auto">
Experience RecA's capabilities firsthand with our interactive demonstration
</p>
</div>
<div class="rounded-xl overflow-hidden shadow-xl max-w-5xl mx-auto bg-gray-100 p-8 text-center">
<p class="text-gray-600 mb-4">The background image above is from our demo showcasing RecA's capabilities.</p>
<a href="https://huggingface.co/spaces/sanaka87/BAGEL-RecA" target="_blank" class="inline-flex items-center px-6 py-3 border border-transparent text-base font-medium rounded-md shadow-sm text-white bg-red-600 hover:bg-red-700 transition">
<i class="fas fa-play-circle mr-2"></i> Try the Live Demo
</a>
</div>
</div>
</section>
<!-- Abstract Section -->
<section id="abstract" class="py-20 bg-gray-50">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">Abstract</h2>
<div class="mt-4 h-1 w-20 bg-primary-600 mx-auto"></div>
</div>
<div class="bg-white rounded-xl shadow-md p-8 max-w-5xl mx-auto">
<div class="prose-lg text-gray-700">
<p class="mb-6">
Unified multimodal models (UMMs) are designed to perform both vision understanding and image generation within a single architecture. While they have achieved strong performance on image-text understanding tasks, their generation capabilities often lag behind, revealing a misalignment between what the model understands and what it can produce. We identify this disconnect as a consequence of sparse and biased text supervision in conventional training.
</p>
<p class="mb-6">
We propose <span class="font-bold text-primary-600">RecA</span>, a self-supervised training framework that aligns understanding and generation through image reconstruction at the semantic level. By reconstructing images from their own vision encoder embeddings, UMMs receive dense, semantically grounded supervision—free of captions or paired image-text data. This alignment mechanism effectively bridges the modality gap and improves generation fidelity.
</p>
<p>
Despite its simplicity, our approach delivers strong gains for unified multimodal models across generation and editing tasks. Applied to a 1.5B parameter UMM, RecA achieves state-of-the-art results on GenEval (<span class="font-bold text-primary-600">0.90</span>) and DPGBench (<span class="font-bold text-primary-600">88.15</span>), outperforming models with significantly larger scale. More impressively, RecA achieves this with modest compute, requiring just 8,000 unlabeled images and <span class="font-bold text-primary-600">6×A100 GPUs for 4.5 hours (27 GPU-hours)</span>.
</p>
</div>
</div>
</div>
</section>
<!-- Teaser Section -->
<section class="py-20 bg-white">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">Teaser</h2>
<p class="mt-4 text-lg text-gray-600 max-w-3xl mx-auto">
RecA demonstrates remarkable improvements in multimodal understanding and generation capabilities
</p>
</div>
<div class="bg-gray-50 rounded-xl p-6 max-w-5xl mx-auto">
<img src="static/files/teaser/teaser.jpg" alt="RecA Teaser" class="w-full h-auto rounded-lg shadow-md">
</div>
</div>
</section>
<!-- Approach Section -->
<section id="approach" class="py-20 bg-gray-50">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">RecA: Semantic-Level Image Reconstruction</h2>
<div class="mt-4 h-1 w-20 bg-primary-600 mx-auto"></div>
</div>
<div class="grid md:grid-cols-2 gap-8 max-w-5xl mx-auto">
<div class="bg-white rounded-xl shadow-md p-6 card-hover">
<h3 class="text-xl font-semibold text-gray-800 mb-4">The Challenge</h3>
<p class="text-gray-700 mb-4">
As shown in the figure, longer captions capture more details but still cannot fully represent the original image, missing the detailed <span class="font-medium text-primary-600">overall layouts, object shapes, instance attributes, etc</span>.
</p>
<p class="text-gray-700">
Vision embeddings from the understanding vision encoder are already mapped into the UMM's space while retaining richer visual information. <span class="font-medium italic">Can we prompt the UMMs with embeddings from visual understanding models to close this information gap?</span>
</p>
</div>
<div class="bg-white rounded-xl shadow-md p-6 card-hover">
<h3 class="text-xl font-semibold text-gray-800 mb-4">Our Solution</h3>
<p class="text-gray-700">
RecA implements a self-supervised training paradigm where a <span class="font-medium">understanding</span> vision encoder extracts features from the input image; these features are fused with template text embeddings and fed into a Unified Multimodal Model (UMM) to regenerate the image.
</p>
<p class="text-gray-700 mt-4">
We use the self-supervised loss between the original and generated images to optimize the UMM, providing dense supervision that preserves almost all fine-grained details that captions omit.
</p>
</div>
</div>
<div class="mt-12 bg-white rounded-xl shadow-md p-6 max-w-5xl mx-auto">
<img src="static/files/teaser/pipeline.jpg" alt="Figure 1: RecA Pipeline Overview" class="w-full h-auto rounded-lg">
</div>
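<!-- Hedged code sketch: illustrative pseudocode, not the released implementation -->
<div class="mt-12 bg-white rounded-xl shadow-md p-6 max-w-5xl mx-auto">
<h3 class="text-xl font-semibold text-gray-800 mb-4">A Minimal Training-Step Sketch</h3>
<p class="text-gray-700 mb-4">
The sketch below condenses the pipeline figure into PyTorch-style pseudocode. It is an illustration under our own simplifying assumptions, not the released implementation: <code>vision_encoder</code>, <code>umm.embed_text</code>, and <code>umm.generation_loss</code> are hypothetical placeholder APIs standing in for the understanding encoder, the text embedder, and whatever generation objective the UMM already trains with.
</p>
<pre class="bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-sm"><code># Illustrative sketch of one RecA training step (hypothetical APIs, not official code).
import torch

def reca_step(umm, vision_encoder, images, optimizer):
    # 1. Extract semantic embeddings with the *understanding* vision encoder.
    with torch.no_grad():
        vis_emb = vision_encoder(images)  # (B, N, D) patch-level features

    # 2. Fuse the visual embeddings with a generic template prompt;
    #    no caption or paired image-text data is needed.
    prompt_emb = umm.embed_text("Describe the image in detail.")  # (1, T, D)
    cond = torch.cat([prompt_emb.expand(images.size(0), -1, -1), vis_emb], dim=1)

    # 3. Ask the UMM to regenerate the image from this condition, scored by
    #    the model's own generation objective (diffusion, AR token loss, ...).
    loss = umm.generation_loss(condition=cond, target=images)

    # 4. Dense, caption-free supervision: update the UMM on the reconstruction loss.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
</code></pre>
<p class="text-gray-700 mt-4">
The point of the sketch is step 3: because the target is the input image itself, every spatial detail supervises the generator, rather than only the few attributes a caption happens to mention.
</p>
</div>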
</div>
</section>
<!-- Results Section -->
<section id="results" class="py-20 bg-white">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">State-of-the-Art Performance</h2>
<div class="mt-4 h-1 w-20 bg-primary-600 mx-auto"></div>
</div>
<div class="bg-gray-50 rounded-xl shadow-md p-8 max-w-5xl mx-auto">
<div class="prose-lg text-gray-700">
<p class="mb-6">
After only a few training steps, all models post large <span class="font-bold text-primary-600">zero-shot</span> gains in generation capability with <span class="italic">no loss in vision-understanding accuracy</span>. Our fine-tuned Harmon model, even with just 1.5B parameters, achieves a high score of <span class="font-bold text-primary-600">0.86</span> on GenEval and <span class="font-bold text-primary-600">87.21</span> on DPGBench, significantly outperforming the previous state-of-the-art models <span class="font-bold">without any GPT-4o-Image distillation data or reinforcement learning</span>.
</p>
<p>
The most effective approach is a <span class="italic">two-stage strategy</span>: first applying SFT followed by reconstruction tuning, which achieves <span class="font-bold text-primary-600">0.90</span> on GenEval and <span class="font-bold text-primary-600">88.15</span> on DPGBench.
</p>
</div>
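<div class="mt-8">
<p class="text-gray-700 mb-4">
For concreteness, the two-stage schedule can be outlined as follows. This is a hedged sketch with hypothetical names (<code>paired_loader</code>, <code>unlabeled_loader</code>, and the <code>reca_step</code> helper from the Approach section), not the official training script.
</p>
<pre class="bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-sm"><code># Illustrative outline of the two-stage recipe (hypothetical names, not official code).
def train_two_stage(umm, vision_encoder, paired_loader, unlabeled_loader, optimizer):
    # Stage 1: supervised fine-tuning (SFT) on (caption, image) pairs.
    for captions, images in paired_loader:
        loss = umm.generation_loss(condition=umm.embed_text(captions), target=images)
        optimizer.zero_grad(); loss.backward(); optimizer.step()

    # Stage 2: RecA reconstruction tuning on unlabeled images only.
    for images in unlabeled_loader:
        reca_step(umm, vision_encoder, images, optimizer)
</code></pre>
</div>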
<div class="mt-10">
<img src="static/files/teaser/main.jpg" alt="Table 1: Benchmark Comparison" class="w-full h-auto rounded-lg shadow-sm">
</div>
</div>
</div>
</section>
<!-- Enhanced Editing Section -->
<section class="py-20 bg-gray-50">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">Enhanced Editing Capabilities</h2>
<div class="mt-4 h-1 w-20 bg-primary-600 mx-auto"></div>
</div>
<div class="bg-white rounded-xl shadow-md p-8 max-w-5xl mx-auto">
<div class="prose-lg text-gray-700">
<p class="mb-6">
We surprisingly discover that, for models with <span class="italic">image editing capabilities</span>, our method also significantly improves their editing performance. RecA demonstrates consistent improvements across all editing categories, increasing the ImgEdit scores from 3.38 to <span class="font-bold text-primary-600">3.75</span> and GEdit from 6.94 to <span class="font-bold text-primary-600">7.25</span>, using only <span class="italic">1,000 training steps and 8,000 unlabeled images</span>.
</p>
<p>
Our method unlocks the model's inherent editing potential without expensive annotation across various tasks like addition, replacement, stylization and color modification.
</p>
</div>
<div class="mt-10">
<img src="static/files/teaser/edit_result.jpg" alt="Image Editing Results" class="w-full h-auto rounded-lg shadow-sm">
</div>
</div>
</div>
</section>
<!-- Generalizability Section -->
<section class="py-20 bg-white">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="text-center mb-12">
<h2 class="text-3xl font-bold text-gray-900">Enhanced Generalizability</h2>
<p class="mt-4 text-lg text-gray-600 max-w-3xl mx-auto">
Across Different Architectures and Tasks
</p>
</div>
<div class="bg-gray-50 rounded-xl shadow-md p-8 max-w-5xl mx-auto">
<div class="prose-lg text-gray-700">
<p class="mb-6">
RecA achieves consistent performance gains across different UMM frameworks, showcasing its generalizability. We apply RecA to various unified multimodal models including Show-o (AR), Harmon (AR+MAR), OpenUni (AR+Diffusion), and BAGEL (AR+Diffusion).
</p>
<p class="mb-6">
All models demonstrate significant improvements through RecA: the most notable improvement is achieved by Harmon-1.5B with 85.7 GenEval score (+12.8). Our method exhibits the most significant gains in <span class="font-medium">Position</span> and <span class="font-medium">Color Attribution</span> tasks, while maintaining correct subjects, bindings, and positions across cases with <span class="italic">multiple objects</span>, <span class="italic">complex attributions</span>, and explicit <span class="italic">spatial layouts</span>.
</p>
</div>
<div class="mt-10">
<img src="static/files/teaser/t2i_result.jpg" alt="Text-to-Image Generation Results" class="w-full h-auto rounded-lg shadow-sm">
</div>
</div>
</div>
</section>
<!-- Footer -->
<footer class="bg-gray-900 text-white py-12">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="flex flex-col md:flex-row justify-between items-center">
<div class="flex items-center mb-6 md:mb-0">
<img src="static/files/teaser/logo.png" alt="RecA Logo" class="h-10 w-10 mr-3">
<span class="text-xl font-semibold">RecA</span>
</div>
<div class="flex space-x-6">
<a href="https://arxiv.org/pdf/2412.17910" target="_blank" class="text-gray-300 hover:text-white transition">
<i class="fas fa-file-pdf text-xl"></i>
</a>
<a href="https://arxiv.org/abs/2412.17910" target="_blank" class="text-gray-300 hover:text-white transition">
<i class="fas fa-book-open text-xl"></i>
</a>
<a href="https://github.com/HorizonWind2004/RecA" target="_blank" class="text-gray-300 hover:text-white transition">
<i class="fab fa-github text-xl"></i>
</a>
<a href="https://huggingface.co/collections/sanaka87/reca-68ad2176380355a3dcedc068" target="_blank" class="text-gray-300 hover:text-white transition">
<i class="fas fa-robot text-xl"></i>
</a>
</div>
</div>
<div class="mt-8 pt-8 border-t border-gray-800 text-center text-gray-400 text-sm">
<p>© 2024 RecA Research Team. All rights reserved.</p>
</div>
</div>
</footer>
<!-- Back to top button -->
<button id="back-to-top" class="fixed bottom-8 right-8 bg-primary-600 text-white p-3 rounded-full shadow-lg opacity-0 invisible transition-all duration-300">
<i class="fas fa-arrow-up"></i>
</button>
<script>
// Back to top button
const backToTopButton = document.getElementById('back-to-top');
window.addEventListener('scroll', () => {
if (window.pageYOffset > 300) {
backToTopButton.classList.remove('opacity-0', 'invisible');
backToTopButton.classList.add('opacity-100', 'visible');
} else {
backToTopButton.classList.remove('opacity-100', 'visible');
backToTopButton.classList.add('opacity-0', 'invisible');
}
});
backToTopButton.addEventListener('click', () => {
window.scrollTo({
top: 0,
behavior: 'smooth'
});
});
// Smooth scrolling for anchor links
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
    anchor.addEventListener('click', function (e) {
        e.preventDefault();
        // Guard against missing targets so a stale anchor cannot throw
        const target = document.querySelector(this.getAttribute('href'));
        if (target) {
            target.scrollIntoView({
                behavior: 'smooth'
            });
        }
    });
});
</script>
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=sanaka87/reca-page" style="color: #fff;text-decoration: underline;" target="_blank" >Remix</a></p></body>
</html> |