Spaces:

Yuxihenry
/

SpatialTrackerV2

Running on Zero

App Files Files Community

xiaoyuxi committed on Jul 2

Commit

2a1f271

1 Parent(s): 7e0fb4c

vggt_da

Browse files

Files changed (2) hide show

app.py +153 -229
app_backup.py +1268 -0

app.py CHANGED Viewed

@@ -684,11 +684,12 @@ def clear_all():
             gr.update(value=3))
 def clear_all_with_download():
-    """Clear all buffers including download component"""
     return (None, None, [],
             gr.update(value=50),
             gr.update(value=756),
             gr.update(value=3),
             None)  # HTML download component
 def update_tracker_model(model_name):
@@ -964,174 +965,84 @@ with gr.Blocks(
     Welcome to SpatialTracker V2! This interface allows you to track any pixels in 3D using our model.
-    **Instructions:**
     1. Upload a video file or select from examples below
-    2. Click on the object you want to track in the first frame
-    3. Adjust tracking parameters if needed
-    4. Click "Launch Visualization" to start tracking
     """)
-    # Status indicator with more detailed information
-    if BACKEND_AVAILABLE:
-        status_text = "🟢 Backend Connected"
-        status_details = f"Connected to: {BACKEND_SPACE_URL}"
-    else:
-        status_text = "🟡 Running in Standalone Mode"
-        status_details = f"Backend unavailable: {BACKEND_SPACE_URL}"
-    gr.Markdown(f"**Status:** {status_text}")
-    gr.Markdown(f"<small style='color: #666;'>{status_details}</small>", elem_id="status-details")
-    # Example videos section - moved to top
-    with gr.Group(elem_classes=["example-videos"]):
-        gr.Markdown("### 📂 Example Videos")
-        gr.Markdown("Try these example videos to get started quickly:")
-        # Custom horizontal scrolling video gallery
-        gr.HTML("""
-        <div style='background-color: #f8f9ff; border-radius: 8px; padding: 10px; margin: 10px 0; border-left: 4px solid #667eea;'>
-            <p style='margin: 0; font-size: 13px; color: #666; display: flex; align-items: center; gap: 8px;'>
-                <span style='font-size: 16px;'>💡</span>
-                <strong>Tip:</strong> Scroll horizontally below to see all example videos
-            </p>
-        </div>
-        """)
-        # Define video_input here so it can be referenced in examples
-        video_input = gr.Video(
-            label="Upload Video or Select Example",
-            format="mp4",
-            height=300
-        )
-        # Create a horizontal scrolling container for the examples
-        with gr.Group(elem_classes=["horizontal-examples"]):
-            gr.Examples(
-                examples=[
-                    ["examples/kiss.mp4"],
-                    ["examples/backpack.mp4"],
-                    ["examples/pillow.mp4"],
-                    ["examples/handwave.mp4"],
-                    ["examples/hockey.mp4"],
-                    ["examples/drifting.mp4"],
-                    ["examples/ken_block_0.mp4"],
-                    ["examples/kitchen.mp4"],
-                    ["examples/basketball.mp4"],
-                    ["examples/ego_kc1.mp4"],
-                    ["examples/vertical_place.mp4"],
-                    ["examples/ego_teaser.mp4"],
-                    ["examples/robot_unitree.mp4"],
-                    ["examples/robot_3.mp4"],
-                    ["examples/teleop2.mp4"],
-                    ["examples/pusht.mp4"],
-                    ["examples/cinema_0.mp4"],
-                    ["examples/cinema_1.mp4"],
-                ],
-                inputs=video_input,
-                label="🎬 Click on any example to load it",
-                examples_per_page=16
-            )
     with gr.Row():
         with gr.Column(scale=1):
-            # Interactive frame display
             with gr.Group():
-                gr.Markdown("### 🎯 Point Selection")
-                gr.Markdown("Click on the object you want to track in the frame below:")
-                interactive_frame = gr.Image(
-                    label="Click to select tracking points",
-                    type="numpy",
-                    interactive=True
                 )
-                with gr.Row():
-                    point_type = gr.Radio(
-                        choices=["positive_point", "negative_point"],
-                        value="positive_point",
-                        label="Point Type",
-                        info="Positive points indicate the object to track, negative points indicate areas to avoid"
                     )
-                with gr.Row():
-                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
-                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
         with gr.Column(scale=1):
-            # Tracking results
-            with gr.Group():
-                gr.Markdown("### 🎬 Tracking Results")
-                tracking_result_video = gr.Video(
-                    label="Tracking Result Video",
-                    interactive=False,
-                    height=300
-                )
-                # HTML文件下载组件
-                html_download = gr.File(
-                    label="📥 Download 3D Visualization HTML",
-                    interactive=False,
-                    visible=True
-                )
-    # Advanced settings section - changed to open=True
-    with gr.Accordion("⚙️ Advanced Settings", open=True):
-        gr.Markdown("Adjust these parameters to optimize tracking performance:")
-        with gr.Row():
-            grid_size = gr.Slider(
-                minimum=10,
-                maximum=100,
-                step=10,
-                value=50,
-                label="Grid Size",
-                info="Size of the tracking grid (larger = more detailed)"
-            )
-            vo_points = gr.Slider(
-                minimum=100,
-                maximum=2000,
-                step=50,
-                value=756,
-                label="VO Points",
-                info="Number of visual odometry points (more = better accuracy)"
-            )
-            fps = gr.Slider(
-                minimum=1,
-                maximum=30,
-                step=1,
-                value=3,
-                label="FPS",
-                info="Frames per second for processing (higher = smoother but slower)"
-            )
-    # Launch button
-    with gr.Row():
-        launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg")
-    # 3D Visualization - Make it larger and more prominent
-    with gr.Row():
-        with gr.Column():
             with gr.Group():
                 gr.Markdown("### 🌐 3D Trajectory Visualization")
-                gr.Markdown("Interactive 3D visualization of 3D point tracking and camera motion:")
                 viz_html = gr.HTML(
                     label="3D Trajectory Visualization",
                     value="""
-                    <div style='border: 3px solid #667eea; border-radius: 15px; padding: 40px;
                                 background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
-                                text-align: center; min-height: 600px; display: flex;
                                 flex-direction: column; justify-content: center; align-items: center;
-                                box-shadow: 0 8px 32px rgba(102, 126, 234, 0.2);'>
-                        <div style='font-size: 48px; margin-bottom: 20px;'>🌐</div>
-                        <h2 style='color: #667eea; margin-bottom: 15px; font-size: 28px; font-weight: 600;'>
                             3D Trajectory Visualization
-                        </h2>
-                        <p style='color: #666; font-size: 16px; line-height: 1.6; max-width: 500px; margin-bottom: 25px;'>
-                            Perceive the world with Pixel-wise 3D Motions!
                         </p>
-                        <div style='background: rgba(102, 126, 234, 0.1); border-radius: 25px;
-                                    padding: 12px 24px; border: 2px solid rgba(102, 126, 234, 0.2);'>
-                            <span style='color: #667eea; font-weight: 600; font-size: 14px;'>
                                 ⚡ Powered by SpatialTracker V2
                             </span>
                         </div>
@@ -1139,7 +1050,100 @@ with gr.Blocks(
                     """,
                     elem_id="viz_container"
                 )
     # Hidden state variables
     original_image_state = gr.State(None)
     selected_points = gr.State([])
@@ -1165,95 +1169,15 @@ with gr.Blocks(
     clear_all_btn.click(
         fn=clear_all_with_download,
-        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps, html_download]
     )
     launch_btn.click(
         fn=launch_viz,
         inputs=[grid_size, vo_points, fps, original_image_state],
-        outputs=[viz_html, tracking_result_video, html_download]
     )
-    # GitHub Star Reminder - Added back!
-    gr.HTML("""
-    <div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
-                border-radius: 10px;
-                padding: 15px;
-                margin: 15px 0;
-                box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
-                border: 1px solid rgba(102, 126, 234, 0.15);'>
-        <div style='text-align: center; color: #4a5568;'>
-            <h3 style='margin: 0 0 10px 0; font-size: 18px; text-shadow: none; color: #2d3748;'>
-                ⭐ Love SpatialTracker? Give us a Star! ⭐
-            </h3>
-            <p style='margin: 0 0 12px 0; font-size: 14px; opacity: 0.8; color: #4a5568;'>
-                Help us grow by starring our repository on GitHub! 🚀
-            </p>
-            <div style='display: flex; justify-content: center;'>
-                <a href="https://github.com/henry123-boy/SpaTrackerV2"
-                   target="_blank"
-                   style='display: inline-flex;
-                          align-items: center;
-                          gap: 6px;
-                          background: rgba(102, 126, 234, 0.1);
-                          color: #4a5568;
-                          padding: 8px 16px;
-                          border-radius: 20px;
-                          text-decoration: none;
-                          font-weight: bold;
-                          font-size: 14px;
-                          backdrop-filter: blur(5px);
-                          border: 1px solid rgba(102, 126, 234, 0.2);
-                          transition: all 0.3s ease;'
-                   onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-1px)'"
-                   onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
-                    <span style='font-size: 16px;'>⭐</span>
-                    Star on GitHub
-                </a>
-            </div>
-        </div>
-    </div>
-    """)
-    # Acknowledgment section for TAPIR3D - moved to the end
-    gr.HTML("""
-    <div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%);
-                border-radius: 8px;
-                padding: 12px;
-                margin: 15px 0;
-                box-shadow: 0 1px 4px rgba(255, 193, 7, 0.1);
-                border: 1px solid rgba(255, 193, 7, 0.2);'>
-        <div style='text-align: center; color: #5d4037;'>
-            <h5 style='margin: 0 0 6px 0; font-size: 14px; color: #5d4037;'>
-                Acknowledgments
-            </h5>
-            <p style='margin: 0; font-size: 12px; opacity: 0.9; color: #5d4037; line-height: 1.3;'>
-                Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work!
-            </p>
-            <div style='margin-top: 6px;'>
-                <a href="https://github.com/zbw001/TAPIP3D"
-                   target="_blank"
-                   style='display: inline-flex;
-                          align-items: center;
-                          gap: 3px;
-                          background: rgba(255, 193, 7, 0.15);
-                          color: #5d4037;
-                          padding: 3px 10px;
-                          border-radius: 12px;
-                          text-decoration: none;
-                          font-weight: 500;
-                          font-size: 11px;
-                          border: 1px solid rgba(255, 193, 7, 0.3);
-                          transition: all 0.3s ease;'
-                   onmouseover="this.style.background='rgba(255, 193, 7, 0.2)'"
-                   onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'">
-                    📚 TAPIP3D Repository
-                </a>
-            </div>
-        </div>
-    </div>
-    """)
 # Launch the interface
 if __name__ == "__main__":
     print("🌟 Launching SpatialTracker V2 Frontend...")

             gr.update(value=3))
 def clear_all_with_download():
+    """Clear all buffers including both download components"""
     return (None, None, [],
             gr.update(value=50),
             gr.update(value=756),
             gr.update(value=3),
+            None,  # tracking_video_download
             None)  # HTML download component
 def update_tracker_model(model_name):
     Welcome to SpatialTracker V2! This interface allows you to track any pixels in 3D using our model.
+    **⚡ Quick Start:** Upload video → Click "Start Tracking Now!"
+    **🔬 Advanced Usage with SAM:**
     1. Upload a video file or select from examples below
+    2. Expand "Manual Point Selection" to click on specific objects for SAM-guided tracking
+    3. Adjust tracking parameters for optimal performance
+    4. Click "Start Tracking Now!" to begin 3D tracking with SAM guidance
     """)
+    # Status indicator - more compact
+    status_info = "🟢 Backend Connected" if BACKEND_AVAILABLE else "🟡 Standalone Mode"
+    gr.Markdown(f"**Status:** {status_info} | Backend: {BACKEND_SPACE_URL}")
+    # Main content area - video upload left, 3D visualization right
     with gr.Row():
         with gr.Column(scale=1):
+            # Video upload section
             with gr.Group():
+                gr.Markdown("### 📂 Select Video")
+                video_input = gr.Video(
+                    label="Upload Video or Select Example",
+                    format="mp4",
+                    height=250  # Matched height with 3D viz
                 )
+                # Compact horizontal examples
+                gr.Markdown("**Examples:** (scroll horizontally)")
+                with gr.Group(elem_classes=["horizontal-examples"]):
+                    gr.Examples(
+                        examples=[
+                            ["examples/kiss.mp4"],
+                            ["examples/backpack.mp4"],
+                            ["examples/pillow.mp4"],
+                            ["examples/handwave.mp4"],
+                            ["examples/hockey.mp4"],
+                            ["examples/drifting.mp4"],
+                            ["examples/ken_block_0.mp4"],
+                            ["examples/kitchen.mp4"],
+                            ["examples/basketball.mp4"],
+                            ["examples/ego_kc1.mp4"],
+                            ["examples/vertical_place.mp4"],
+                            ["examples/ego_teaser.mp4"],
+                            ["examples/robot_unitree.mp4"],
+                            ["examples/robot_3.mp4"],
+                            ["examples/teleop2.mp4"],
+                            ["examples/pusht.mp4"],
+                            ["examples/cinema_0.mp4"],
+                            ["examples/cinema_1.mp4"],
+                        ],
+                        inputs=video_input,
+                        label="",
+                        examples_per_page=18
                     )
         with gr.Column(scale=1):
+            # 3D Visualization - moved to top right for immediate visibility
             with gr.Group():
                 gr.Markdown("### 🌐 3D Trajectory Visualization")
                 viz_html = gr.HTML(
                     label="3D Trajectory Visualization",
                     value="""
+                    <div style='border: 3px solid #667eea; border-radius: 10px; padding: 20px;
                                 background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
+                                text-align: center; min-height: 250px; display: flex;
                                 flex-direction: column; justify-content: center; align-items: center;
+                                box-shadow: 0 4px 16px rgba(102, 126, 234, 0.15);'>
+                        <div style='font-size: 32px; margin-bottom: 10px;'>🌐</div>
+                        <h3 style='color: #667eea; margin-bottom: 8px; font-size: 18px; font-weight: 600;'>
                             3D Trajectory Visualization
+                        </h3>
+                        <p style='color: #666; font-size: 13px; line-height: 1.4; max-width: 400px; margin-bottom: 15px;'>
+                            Track any pixels in 3D space with camera motion
                         </p>
+                        <div style='background: rgba(102, 126, 234, 0.1); border-radius: 15px;
+                                    padding: 6px 12px; border: 1px solid rgba(102, 126, 234, 0.2);'>
+                            <span style='color: #667eea; font-weight: 500; font-size: 11px;'>
                                 ⚡ Powered by SpatialTracker V2
                             </span>
                         </div>
                     """,
                     elem_id="viz_container"
                 )
+    # Start button section - below video area
+    with gr.Row():
+        with gr.Column(scale=3):
+            launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            clear_all_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
+    # Tracking parameters section
+    with gr.Row():
+        gr.Markdown("### ⚙️ Tracking Parameters")
+    with gr.Row():
+        grid_size = gr.Slider(
+            minimum=10, maximum=100, step=10, value=50,
+            label="Grid Size", info="Tracking detail level"
+        )
+        vo_points = gr.Slider(
+            minimum=100, maximum=2000, step=50, value=756,
+            label="VO Points", info="Motion accuracy"
+        )
+        fps = gr.Slider(
+            minimum=1, maximum=30, step=1, value=3,
+            label="FPS", info="Processing speed"
+        )
+    # Advanced Point Selection with SAM - Collapsed by default
+    with gr.Accordion("🎯 Advanced: Manual Point Selection with SAM", open=False):
+        gr.Markdown("""
+        **Use SAM (Segment Anything Model) for precise object selection:**
+        - Click on target objects in the image for SAM-guided segmentation
+        - Positive points: include these areas | Negative points: exclude these areas
+        - Get more accurate 3D tracking results with SAM's powerful segmentation
+        """)
+        with gr.Row():
+            with gr.Column():
+                interactive_frame = gr.Image(
+                    label="Click to select tracking points with SAM guidance",
+                    type="numpy",
+                    interactive=True,
+                    height=300
+                )
+                with gr.Row():
+                    point_type = gr.Radio(
+                        choices=["positive_point", "negative_point"],
+                        value="positive_point",
+                        label="Point Type",
+                        info="Positive: track these areas | Negative: avoid these areas"
+                    )
+                with gr.Row():
+                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm")
+    # Downloads section - both as download buttons
+    with gr.Row():
+        gr.Markdown("### 📥 Downloads")
+    with gr.Row():
+        with gr.Column(scale=1):
+            tracking_video_download = gr.File(
+                label="📹 Download 2D Tracking Video",
+                interactive=False,
+                visible=True
+            )
+        with gr.Column(scale=1):
+            html_download = gr.File(
+                label="📄 Download 3D Visualization HTML",
+                interactive=False,
+                visible=True
+            )
+    # Compact footer
+    gr.HTML("""
+    <div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
+                border-radius: 8px; padding: 12px; margin: 10px 0;
+                box-shadow: 0 1px 4px rgba(102, 126, 234, 0.1);
+                border: 1px solid rgba(102, 126, 234, 0.15);'>
+        <div style='display: flex; justify-content: space-between; align-items: center; color: #4a5568;'>
+            <div style='display: flex; align-items: center; gap: 15px;'>
+                <a href="https://github.com/henry123-boy/SpaTrackerV2" target="_blank"
+                   style='display: flex; align-items: center; gap: 5px; text-decoration: none; color: #4a5568; font-weight: 500;'>
+                    ⭐ Star on GitHub
+                </a>
+                <span style='color: #ccc;'>|</span>
+                <a href="https://github.com/zbw001/TAPIP3D" target="_blank"
+                   style='display: flex; align-items: center; gap: 5px; text-decoration: none; color: #4a5568; font-size: 12px;'>
+                    📚 TAPIP3D Acknowledgments
+                </a>
+            </div>
+            <span style='font-size: 11px; color: #666;'>Powered by SpatialTracker V2</span>
+        </div>
+    </div>
+    """)
     # Hidden state variables
     original_image_state = gr.State(None)
     selected_points = gr.State([])
     clear_all_btn.click(
         fn=clear_all_with_download,
+        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps, tracking_video_download, html_download]
     )
     launch_btn.click(
         fn=launch_viz,
         inputs=[grid_size, vo_points, fps, original_image_state],
+        outputs=[viz_html, tracking_video_download, html_download]
     )
 # Launch the interface
 if __name__ == "__main__":
     print("🌟 Launching SpatialTracker V2 Frontend...")

app_backup.py ADDED Viewed

	@@ -0,0 +1,1268 @@

+import gradio as gr
+import os
+import json
+import numpy as np
+import cv2
+import base64
+import requests
+import time
+from typing import List, Tuple
+from gradio_client.utils import handle_file
+from pathlib import Path
+# Backend Space identifier ("owner/space"); used for the HTTP status checks and for the gradio_client connection
+BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"  # Replace with your own backend Space if it differs
+hf_token = os.getenv("HF_TOKEN")  # Read from the HF_TOKEN environment variable; None when it is unset
+# Debug information printed at import time (reports the token's presence and length, not its value)
+print(f"🔧 Environment Debug Info:")
+print(f"   - Backend URL: {BACKEND_SPACE_URL}")
+print(f"   - HF Token available: {'Yes' if hf_token else 'No'}")
+print(f"   - HF Token length: {len(hf_token) if hf_token else 0}")
+# Module-level connection state; both names are rebound by initialize_backend()
+BACKEND_AVAILABLE = False
+backend_client = None
+def check_user_permissions():
+    """Check if user has necessary permissions"""
+    print("🔐 Checking user permissions...")
+    if not hf_token:  # No token configured: print setup instructions and report failure
+        print("❌ No HF Token found")
+        print("🔧 To get a token:")
+        print("   1. Go to https://huggingface.co/settings/tokens")
+        print("   2. Create a new token with 'read' permissions")
+        print("   3. Set it as environment variable: export HF_TOKEN='your_token'")
+        return False
+    # Step 1: validate the token via the whoami endpoint; step 2: probe the backend Space's API entry
+    try:
+        headers = {'Authorization': f'Bearer {hf_token}'}
+        response = requests.get('https://huggingface.co/api/whoami', headers=headers, timeout=5)
+        if response.status_code == 200:
+            user_info = response.json()
+            username = user_info.get('name', 'Unknown')  # Used only for the log line below
+            print(f"✅ Authenticated as: {username}")
+            # Check if user has access to the specific space
+            space_url = f"https://huggingface.co/api/spaces/{BACKEND_SPACE_URL}"
+            space_response = requests.get(space_url, headers=headers, timeout=5)
+            if space_response.status_code == 200:
+                print("✅ You have access to the backend Space")
+                return True  # Only path that returns True: token accepted and Space API answered 200
+            elif space_response.status_code == 401:
+                print("❌ You don't have access to the backend Space")
+                print("🔧 Solutions:")
+                print("   1. Contact the Space owner to add you as collaborator")
+                print("   2. Ask the owner to make the Space public")
+                return False
+            elif space_response.status_code == 404:
+                print("❌ Backend Space not found")
+                print("🔧 Please check if the Space URL is correct")
+                return False
+            else:  # Any other status (e.g. 403, 5xx) is logged and treated as no access
+                print(f"⚠️  Unexpected response checking Space access: {space_response.status_code}")
+                return False
+        else:  # whoami did not return 200, so the Space check is skipped entirely
+            print(f"❌ Token validation failed: {response.status_code}")
+            print("🔧 Your token might be invalid or expired")
+            return False
+    except Exception as e:  # Any error raised above (request failure, JSON decoding, ...) is logged and reported as False
+        print(f"❌ Error checking permissions: {e}")
+        return False
+def check_backend_space_status():
+    """Check if backend space is running via HTTP request"""
+    try:
+        backend_url = f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}"  # Public Space page, not the API endpoint
+        print(f"🔍 Checking backend space status: {backend_url}")
+        # Prepare headers with authentication if token is available
+        headers = {}
+        if hf_token:
+            headers['Authorization'] = f'Bearer {hf_token}'
+            print(f"🔐 Using HF Token for authentication")
+        # Try to access the space page
+        response = requests.get(backend_url, headers=headers, timeout=10)
+        if response.status_code == 200:
+            print("✅ Backend space page is accessible")
+            # Heuristic: scan the lowercased page HTML for status keywords; the first match means "not running"
+            page_content = response.text.lower()
+            if "runtime error" in page_content:
+                print("❌ Backend space has runtime error")
+                return False
+            elif "building" in page_content:  # NOTE(review): plain substring match; may also hit unrelated page text
+                print("🔄 Backend space is building...")
+                return False
+            elif "sleeping" in page_content:
+                print("😴 Backend space is sleeping")
+                return False
+            else:  # No keyword found: assume the Space is up
+                print("✅ Backend space appears to be running")
+                return True
+        elif response.status_code == 401:
+            print("❌ Authentication failed (HTTP 401)")
+            print("🔧 This means:")
+            print("   - The backend Space is private")
+            print("   - Your HF Token doesn't have access to this Space")
+            print("   - You need to be added as a collaborator to the Space")
+            print("   - Or the Space owner needs to make it public")
+            return False
+        elif response.status_code == 404:
+            print("❌ Backend space not found (HTTP 404)")
+            print("🔧 Please check if the Space URL is correct:")
+            print(f"   Current URL: {BACKEND_SPACE_URL}")
+            return False
+        else:  # Any other status: log the code and the first 200 characters of the body
+            print(f"❌ Backend space not accessible (HTTP {response.status_code})")
+            print(f"🔧 Response: {response.text[:200]}...")
+            return False
+    except requests.RequestException as e:  # Errors raised by the requests library itself (e.g. timeouts, connection failures)
+        print(f"❌ Failed to check backend space status: {e}")
+        return False
+    except Exception as e:  # Catch-all so that this status check never raises to its caller
+        print(f"❌ Unexpected error checking backend: {e}")
+        return False
+def initialize_backend():
+    """Initialize backend connection using gradio_client"""
+    global backend_client, BACKEND_AVAILABLE  # Both module-level names are rebound here
+    try:
+        from gradio_client import Client  # Imported inside the try, so an import failure is handled like a connection failure
+        # Connect to HF Space
+        if hf_token:
+            backend_client = Client(BACKEND_SPACE_URL, hf_token=hf_token)
+        else:  # No token: connect anonymously
+            backend_client = Client(BACKEND_SPACE_URL)
+        # Test the connection: an exception from view_api() is treated as a failed connection
+        backend_client.view_api()
+        BACKEND_AVAILABLE = True
+        return True
+    except Exception as e:  # backend_client may already be assigned at this point; only the flag is reset
+        print(f"❌ Backend connection failed: {e}")
+        BACKEND_AVAILABLE = False
+        return False
+def numpy_to_base64(arr):
+    """Convert numpy array to base64 string"""
+    return base64.b64encode(arr.tobytes()).decode('utf-8')  # Encodes the raw bytes only; shape and dtype are not stored in the result
+def base64_to_numpy(b64_str, shape, dtype):
+    """Convert base64 string back to numpy array"""
+    return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)  # Caller supplies shape/dtype; NOTE(review): frombuffer over bytes presumably yields a read-only array - confirm before writing to it
+def base64_to_image(b64_str):
+    """Convert base64 string to numpy image array"""
+    if not b64_str:  # None or empty string: nothing to decode
+        return None
+    try:
+        # Decode base64 to bytes
+        img_bytes = base64.b64decode(b64_str)
+        # Convert bytes to numpy array
+        nparr = np.frombuffer(img_bytes, np.uint8)
+        # Decode the encoded image bytes into pixels
+        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)  # NOTE(review): presumably None for undecodable data, so cvtColor below would raise and the except returns None - confirm
+        # Convert BGR to RGB
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        return img
+    except Exception as e:  # Any decoding error is logged and reported as None rather than raised
+        print(f"Error converting base64 to image: {e}")
+        return None
+def get_video_name(video_path):
+    """Extract video name without extension"""
+    return os.path.splitext(os.path.basename(video_path))[0]  # basename drops the directory part, splitext drops the final extension
+def extract_first_frame(video_path):
+    """Extract first frame from video file"""
+    try:
+        cap = cv2.VideoCapture(video_path)
+        ret, frame = cap.read()  # Single read: only the first frame is needed
+        cap.release()  # Not in a finally block, so it is skipped if read() raises
+        if ret:  # ret is falsy when no frame could be read
+            # Convert BGR to RGB
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            return frame_rgb
+        else:
+            return None
+    except Exception as e:  # Any error is logged and reported as None rather than raised
+        print(f"Error extracting first frame: {e}")
+        return None
def handle_video_upload(video):
    """Handle a video upload and prepare its first frame for point selection.

    The backend Space is tried first when it is connected. If the backend is
    unavailable, reports failure, or raises, the first frame is extracted
    locally and a JSON state describing a per-session copy of the video is
    built instead.

    Args:
        video: Path of the uploaded video file, or None when the video
            component was cleared.

    Returns:
        A 6-tuple ``(original_image_state, display_image, selected_points,
        grid_size_update, vo_points_update, fps_update)``. The last three
        items are ``gr.update`` objects holding the preset returned by
        ``get_video_settings`` for this video, or the defaults 50 / 756 / 3
        when there is no video or an unexpected error occurs.
    """
    def _outputs(state, image, points, settings=(50, 756, 3)):
        # Assemble the 6-tuple in the order the Gradio outputs expect.
        grid_size_val, vo_points_val, fps_val = settings
        return (state, image, points,
                gr.update(value=grid_size_val),
                gr.update(value=vo_points_val),
                gr.update(value=fps_val))

    if video is None:
        return _outputs(None, None, [])
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Best effort: any backend problem is logged and we fall through
            # to local processing below.
            try:
                print("🔧 Calling backend API for video upload...")
                result = backend_client.predict(
                    "upload_video",   # function_type
                    handle_file(video),  # video file - wrapped with handle_file
                    "",               # original_image_state (not used for upload)
                    [],               # selected_points (not used for upload)
                    "positive_point",  # point_type (not used for upload)
                    0,                # point_x (not used for upload)
                    0,                # point_y (not used for upload)
                    50,               # grid_size (not used for upload)
                    756,              # vo_points (not used for upload)
                    3,                # fps (not used for upload)
                    api_name="/unified_api"
                )
                print("✅ Backend video upload API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # A successful response is a dict with a truthy "success" flag.
                if isinstance(result, dict) and result.get("success"):
                    original_image_state = result.get("original_image_state", "")
                    display_image = result.get("display_image", None)
                    selected_points = result.get("selected_points", [])
                    # The image may arrive as a nested list; turn it back into
                    # a uint8 array before handing it to the image component.
                    if isinstance(display_image, list):
                        display_image = np.array(display_image, dtype=np.uint8)
                        print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
                    video_name = get_video_name(video)
                    print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
                    grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
                    print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
                    return _outputs(original_image_state, display_image, selected_points,
                                    (grid_size_val, vo_points_val, fps_val))
                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

        # Fallback: local processing
        print("Using local video processing...")
        display_image = extract_first_frame(video)
        video_name = get_video_name(video)
        if display_image is not None:
            import shutil
            import uuid
            # A timestamp alone can collide when two uploads land in the same
            # millisecond, which would make both sessions share (and
            # overwrite) one directory; the random suffix keeps them apart.
            session_id = f"{int(time.time() * 1000)}_{uuid.uuid4().hex[:8]}"
            temp_dir = os.path.join("temp_frontend", f"session_{session_id}")
            os.makedirs(temp_dir, exist_ok=True)
            # Copy the video into the session directory under a standardized name.
            temp_video_path = os.path.join(temp_dir, f"{video_name}.mp4")
            shutil.copy(video, temp_video_path)
            # State layout intended to be compatible with the backend's format.
            frame_data = {
                'data': numpy_to_base64(display_image),
                'shape': display_image.shape,
                'dtype': str(display_image.dtype),
                'temp_dir': temp_dir,
                'video_name': video_name,
                'video_path': temp_video_path  # Keep for backward compatibility
            }
            original_image_state = json.dumps(frame_data)
        else:
            # Frame extraction failed: keep a minimal state that still records
            # which video was uploaded.
            original_image_state = json.dumps({
                "video_path": video,
                "frame": "local_processing_failed"
            })
        print(f"🎬 Local fallback - Video path: '{video}' -> Video name: '{video_name}'")
        grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
        print(f"🎬 Local fallback - Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
        return _outputs(original_image_state, display_image, [],
                        (grid_size_val, vo_points_val, fps_val))
    except Exception as e:
        # Top-level UI boundary: log and reset the outputs instead of crashing.
        print(f"Error in handle_video_upload: {e}")
        return _outputs(None, None, [])
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
    """Handle a click on the first frame and record it as a tracking point.

    The backend Space is tried first when it is connected. If it is
    unavailable, reports failure, or raises, the frame is re-extracted
    locally from the ``video_path`` stored in the state and the points are
    drawn on it.

    Args:
        original_img: JSON state string produced at upload time, or None.
        sel_pix: Points selected so far. The local path stores each point as
            ``[x, y, point_type]``.
        point_type: ``'positive_point'`` is drawn green; any other value is
            drawn red.
        evt: Gradio select event; ``evt.index[0]`` / ``evt.index[1]`` are
            used as the x / y pixel coordinates.

    Returns:
        ``(display_image, selected_points)``, or ``(None, [])`` when there is
        no state, the frame cannot be recovered, or an unexpected error
        occurs.
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Best effort: any backend problem is logged and we fall through
            # to local processing below.
            try:
                print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
                result = backend_client.predict(
                    "select_point",  # function_type
                    None,            # video file (not used for select_point)
                    original_img,    # original_image_state
                    sel_pix,         # selected_points
                    point_type,      # point_type
                    evt.index[0],    # point_x
                    evt.index[1],    # point_y
                    50,              # grid_size (not used for select_point)
                    756,             # vo_points (not used for select_point)
                    3,               # fps (not used for select_point)
                    api_name="/unified_api"
                )
                print("✅ Backend select point API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # A successful response is a dict with a truthy "success" flag.
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", sel_pix)
                    # The image may arrive as a nested list; turn it back into
                    # a uint8 array before handing it to the image component.
                    if isinstance(display_image, list):
                        display_image = np.array(display_image, dtype=np.uint8)
                        print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
                    return display_image, new_sel_pix
                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")
                # Classify the failure by exception type name / message text
                # to print a more helpful hint.
                if "AppError" in str(type(e)):
                    print("🔧 Backend Space has internal errors (AppError)")
                    print("🔧 The backend Space code has bugs or configuration issues")
                    print("🔧 Contact the Space owner to fix the backend implementation")
                elif "Could not fetch config" in str(e):
                    print("🔧 Config fetch failed - possible Gradio version mismatch")
                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
                elif "timeout" in str(e).lower():
                    print("🔧 Backend request timed out - Space might be overloaded")
                else:
                    print(f"🔧 Unexpected error type: {type(e).__name__}")
                print("🔄 Falling back to local point selection...")

        # Fallback: local processing with improved visualization
        print("Using local point selection with enhanced visualization...")
        try:
            state_data = json.loads(original_img)
            video_path = state_data.get("video_path")
        except (TypeError, ValueError, AttributeError):
            # Not JSON (JSONDecodeError is a ValueError), not a string, or
            # not a JSON object: there is no video to recover the frame from.
            video_path = None
        if video_path:
            display_image = extract_first_frame(video_path)
            if display_image is not None:
                x, y = evt.index[0], evt.index[1]
                new_sel_pix = list(sel_pix) if sel_pix else []
                new_sel_pix.append([x, y, point_type])
                # The frame was re-extracted clean, so every point has to be
                # drawn again; drawing only the newest one would make earlier
                # selections disappear from the image while they are still
                # present in the returned list.
                for entry in new_sel_pix:
                    try:
                        px, py, kind = entry
                        center = (int(px), int(py))
                    except (TypeError, ValueError):
                        # Entry is not in the local [x, y, point_type] layout
                        # (e.g. it came from the backend); leave it undrawn.
                        continue
                    color = (0, 255, 0) if kind == 'positive_point' else (255, 0, 0)
                    # Filled dot plus a white ring so the point stays visible.
                    cv2.circle(display_image, center, 8, color, -1)
                    cv2.circle(display_image, center, 12, (255, 255, 255), 2)
                return display_image, new_sel_pix
        return None, []
    except Exception as e:
        # Top-level UI boundary: log and reset the outputs instead of crashing.
        print(f"Error in select_point: {e}")
        return None, []
def reset_points(original_img: str, sel_pix):
    """Discard all selected points and restore the unmarked first frame.

    The backend Space is tried first when it is connected. If it is
    unavailable, reports failure, or raises, the frame is re-extracted
    locally from the ``video_path`` stored in the state.

    Args:
        original_img: JSON state string produced at upload time, or None.
        sel_pix: Currently selected points; only forwarded to the backend.

    Returns:
        ``(display_image, selected_points)``. The local path always returns
        an empty point list; ``(None, [])`` is returned when there is no
        state, no usable ``video_path``, or an unexpected error occurs.
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Best effort: any backend problem is logged and we fall through
            # to local processing below.
            try:
                print("🔧 Calling backend reset points API...")
                result = backend_client.predict(
                    "reset_points",  # function_type
                    None,            # video file (not used for reset_points)
                    original_img,    # original_image_state
                    sel_pix,         # selected_points
                    "positive_point",  # point_type (not used for reset_points)
                    0,               # point_x (not used for reset_points)
                    0,               # point_y (not used for reset_points)
                    50,              # grid_size (not used for reset_points)
                    756,             # vo_points (not used for reset_points)
                    3,               # fps (not used for reset_points)
                    api_name="/unified_api"
                )
                print("✅ Backend reset points API call successful!")
                print(f"🔧 Result: {result}")
                # A successful response is a dict with a truthy "success" flag.
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", [])
                    # The image may arrive as a nested list; turn it back into
                    # a uint8 array before handing it to the image component.
                    if isinstance(display_image, list):
                        display_image = np.array(display_image, dtype=np.uint8)
                        print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
                    return display_image, new_sel_pix
                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

        # Fallback: local processing
        print("Using local reset points...")
        try:
            state_data = json.loads(original_img)
            video_path = state_data.get("video_path")
        except (TypeError, ValueError, AttributeError):
            # Not JSON (JSONDecodeError is a ValueError), not a string, or
            # not a JSON object: there is no video to recover the frame from.
            video_path = None
        if video_path:
            # Re-extract the original frame, which carries no point markers.
            return extract_first_frame(video_path), []
        return None, []
    except Exception as e:
        # Top-level UI boundary: log and reset the outputs instead of crashing.
        print(f"Error in reset_points: {e}")
        return None, []
# Register the "_viz" folder under the current working directory as a Gradio
# static path; launch_viz writes its visualization HTML pages into ./_viz.
gr.set_static_paths(paths=[Path.cwd().absolute().joinpath("_viz")])
def launch_viz(grid_size, vo_points, fps, original_image_state):
    """Run tracking on the backend and build the visualization outputs.

    Args:
        grid_size: Grid size forwarded to the backend tracker.
        vo_points: Number of VO points forwarded to the backend tracker.
        fps: FPS value forwarded to the backend tracker.
        original_image_state: JSON state string produced at upload time, or
            None when no video has been loaded.

    Returns:
        A 3-tuple ``(html, video_path, html_file_path)``:

        * backend success - iframe HTML embedding the visualization saved
          under ``./_viz``, the path of the locally saved tracking video
          (None when the backend sent none or saving failed), and the path
          of the saved HTML file (used for download);
        * backend unusable - an explanatory error HTML block, None, None;
        * ``(None, None, None)`` when there is no state or an unexpected
          error occurs.
    """
    # Local import: only needed to escape the debug text in the error panel.
    from html import escape as html_escape

    if original_image_state is None:
        return None, None, None
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Best effort: any backend problem is logged and we fall through
            # to the "backend required" message below.
            try:
                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                print(f"🔧 Original image state type: {type(original_image_state)}")
                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
                # A state holding both "video_path" and "frame" is the minimal
                # local-fallback state written when frame extraction failed;
                # the backend cannot work with it, so bail out to the handler
                # below instead of sending it.
                try:
                    if isinstance(original_image_state, str):
                        parsed_state = json.loads(original_image_state)
                        if "video_path" in parsed_state and "frame" in parsed_state:
                            print("🔧 Detected local processing state, cannot use backend for tracking")
                            print("🔧 Backend requires proper video upload state from backend API")
                            raise ValueError("Local state cannot be processed by backend")
                except json.JSONDecodeError:
                    print("🔧 Invalid JSON state, cannot send to backend")
                    raise ValueError("Invalid state format")
                # Call the unified API with run_tracker function type
                result = backend_client.predict(
                    "run_tracker",        # function_type
                    None,                 # video file (not used for run_tracker)
                    original_image_state,  # original_image_state
                    [],                   # selected_points (not used for run_tracker)
                    "positive_point",     # point_type (not used for run_tracker)
                    0,                    # point_x (not used for run_tracker)
                    0,                    # point_y (not used for run_tracker)
                    grid_size,            # grid_size
                    vo_points,            # vo_points
                    fps,                  # fps
                    api_name="/unified_api"
                )
                print("✅ Backend API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # A successful response is a dict with a truthy "success" flag.
                if isinstance(result, dict) and result.get("success"):
                    viz_html = result.get("viz_html", "")
                    track_video_content = result.get("track_video_content", None)
                    track_video_filename = result.get("track_video_filename", "tracked_video.mp4")
                    # Save the HTML into the _viz directory so it can be
                    # referenced from an iframe.
                    viz_dir = './_viz'
                    os.makedirs(viz_dir, exist_ok=True)
                    random_path = f'{viz_dir}/_{time.time()}.html'
                    with open(random_path, 'w', encoding='utf-8') as f:
                        f.write(viz_html)
                    # Create iframe HTML to display the saved file
                    iframe_html = f"""
                    <div style='border: 3px solid #667eea; border-radius: 10px; overflow: hidden; box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);'>
                        <iframe id="viz_iframe" src="/gradio_api/file={random_path}" width="100%" height="950px" style="border:none;"></iframe>
                    </div>
                    """
                    print(f"💾 HTML saved to: {random_path}")
                    print(f"📊 HTML content preview: {viz_html[:200]}...")
                    # If we have base64 encoded video content, save it as a temporary file
                    local_video_path = None
                    if track_video_content:
                        try:
                            temp_video_dir = "temp_frontend_videos"
                            os.makedirs(temp_video_dir, exist_ok=True)
                            # The filename comes from the backend response, so
                            # keep only its last path component: a value such
                            # as "../../x" must not escape temp_video_dir.
                            safe_filename = os.path.basename(str(track_video_filename)) or "tracked_video.mp4"
                            # Timestamp prefix avoids clashes between runs.
                            timestamp = str(int(time.time() * 1000))
                            local_video_path = os.path.join(temp_video_dir, f"{timestamp}_{safe_filename}")
                            # Decode base64 and save as video file
                            video_bytes = base64.b64decode(track_video_content)
                            with open(local_video_path, 'wb') as f:
                                f.write(video_bytes)
                            print(f"✅ Successfully saved tracking video to: {local_video_path}")
                            print(f"🔧 Video file size: {len(video_bytes)} bytes")
                        except Exception as e:
                            print(f"❌ Failed to process tracking video: {e}")
                            local_video_path = None
                    else:
                        print("⚠️ No tracking video content received from backend")
                    # Return the iframe HTML, the video path, and the HTML
                    # file path (used for download).
                    return iframe_html, local_video_path, random_path
                error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "Backend processing failed"
                print(f"❌ Backend processing failed: {error_msg}")
            except Exception as e:
                print(f"❌ Backend API call failed: {e}")
                print(f"🔧 Error type: {type(e)}")
                print(f"🔧 Error details: {str(e)}")
                # Classify the failure by exception type name / message text
                # to print a more helpful hint.
                if "AppError" in str(type(e)):
                    print("🔧 Backend Space has internal errors (AppError)")
                    print("🔧 The backend Space code has bugs or configuration issues")
                    print("🔧 Contact the Space owner to fix the backend implementation")
                elif "Could not fetch config" in str(e):
                    print("🔧 Config fetch failed - possible Gradio version mismatch")
                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
                elif "timeout" in str(e).lower():
                    print("🔧 Backend request timed out - Space might be overloaded")
                elif "Expecting value" in str(e):
                    print("🔧 JSON parsing error in backend - state format mismatch")
                    print("🔧 This happens when using local processing state with backend API")
                    print("🔧 Please upload video again to use backend processing")
                else:
                    print(f"🔧 Unexpected error type: {type(e).__name__}")
                print("🔄 Showing error message instead of visualization...")

        # Create an informative error message based on the state
        state_info = ""
        try:
            if isinstance(original_image_state, str):
                parsed_state = json.loads(original_image_state)
                if "video_path" in parsed_state:
                    video_name = os.path.basename(parsed_state["video_path"])
                    state_info = f"Video: {video_name}"
        except (TypeError, ValueError, AttributeError):
            # Not JSON (JSONDecodeError is a ValueError) or not shaped like
            # the expected state object.
            state_info = "State format unknown"
        # The video name originates from the uploaded file's path, so escape
        # it before placing it inside the HTML below.
        state_info = html_escape(state_info)
        # Fallback: show message that backend is required
        error_message = f"""
        <div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
            <h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Processing Required</h3>
            <p style='color: #2d3436; line-height: 1.6;'>
                The tracking and visualization features require backend processing. The current setup is using local processing which is incompatible with the backend API.
            </p>
            <h4 style='color: #d63031; margin: 15px 0 10px 0;'>Solutions:</h4>
            <ul style='color: #2d3436; line-height: 1.6;'>
                <li><strong>Upload video again:</strong> This will properly initialize the backend state</li>
                <li><strong>Select points on the frame:</strong> Ensure you've clicked on the object to track</li>
                <li><strong>Check backend connection:</strong> Ensure the backend Space is running</li>
                <li><strong>Use compatible state:</strong> Avoid local processing mode</li>
            </ul>
            <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 15px;'>
                <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>State Info: {state_info}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Processing Mode: {"Backend" if BACKEND_AVAILABLE else "Local (Limited)"}</p>
            </div>
            <div style='background-color: #e3f2fd; border-radius: 5px; padding: 10px; margin-top: 10px; border-left: 4px solid #2196f3;'>
                <p style='color: #1976d2; font-weight: bold; margin: 0 0 5px 0;'>💡 Quick Fix:</p>
                <p style='color: #1976d2; font-size: 13px; margin: 0;'>
                    Try uploading your video again - this should properly initialize the backend state for tracking.
                </p>
            </div>
        </div>
        """
        return error_message, None, None
    except Exception as e:
        # Top-level UI boundary: log and reset the outputs instead of crashing.
        print(f"Error in launch_viz: {e}")
        return None, None, None
def clear_all():
    """Return the values that reset the UI to its initial state.

    Returns:
        A 6-tuple: None for the image state, None for the displayed image,
        an empty point list, then ``gr.update`` objects restoring the
        grid size (50), VO points (756) and FPS (3) controls. Nothing on
        disk is touched here.
    """
    slider_defaults = (50, 756, 3)
    cleared_state = (None, None, [])
    return cleared_state + tuple(gr.update(value=v) for v in slider_defaults)
def clear_all_with_download():
    """Return the values that reset the UI, including the download slot.

    Returns:
        A 7-tuple: None for the image state, None for the displayed image,
        an empty point list, ``gr.update`` objects restoring the grid size
        (50), VO points (756) and FPS (3) controls, and a trailing None that
        clears the HTML download component.
    """
    slider_defaults = (50, 756, 3)
    slider_resets = tuple(gr.update(value=v) for v in slider_defaults)
    # Final None clears the HTML download component.
    return (None, None, []) + slider_resets + (None,)
def update_tracker_model(model_name):
    """Placeholder hook for switching the tracker model.

    Args:
        model_name: Name of the requested model; currently ignored.

    Returns:
        None. The function has no effect.
    """
    return None
def get_video_settings(video_name):
    """Look up the tuned settings preset for a known example video.

    Args:
        video_name: Key identifying the video (e.g. ``"kiss"``).

    Returns:
        A ``(grid_size, vo_points, fps)`` tuple. Names without a preset get
        the generic default ``(50, 756, 3)``.
    """
    fallback = (50, 756, 3)
    presets = dict(
        kiss=(45, 700, 10),
        backpack=(40, 600, 2),
        kitchen=(60, 800, 3),
        pillow=(35, 500, 2),
        handwave=(35, 500, 8),
        hockey=(45, 700, 2),
        drifting=(35, 1000, 6),
        basketball=(45, 1500, 5),
        ken_block_0=(45, 700, 2),
        ego_kc1=(45, 500, 4),
        vertical_place=(45, 500, 3),
        ego_teaser=(45, 1200, 10),
        robot_unitree=(45, 500, 4),
        robot_3=(35, 400, 5),
        teleop2=(45, 256, 7),
        pusht=(45, 256, 10),
        cinema_0=(45, 356, 5),
        cinema_1=(45, 756, 3),
    )
    try:
        return presets[video_name]
    except KeyError:
        return fallback
def test_backend_connection():
    """Check whether the backend client exposes any API functions.

    Returns:
        True when ``backend_client`` exists and has a non-empty ``fns``
        attribute; False when there is no client, ``fns`` is missing or
        empty, or inspecting it raises.
    """
    if not backend_client:
        return False
    try:
        print("Testing backend connection with a simple call...")
        # Missing attribute and empty collection are both treated as "no API".
        api_functions = getattr(backend_client, 'fns', None)
        if not api_functions:
            print("❌ Backend API functions not found")
            return False
        print("✅ Backend API functions are available")
        print(f"🔧 Available function indices: {list(api_functions.keys())}")
        return True
    except Exception as e:
        print(f"❌ Backend connection test failed: {e}")
        return False
def test_backend_api():
    """List the API functions exposed by the backend client.

    Returns:
        True when the backend is marked available and the client has a
        non-empty ``fns`` attribute (each key is printed); False otherwise,
        including when inspecting the client raises.
    """
    if not (BACKEND_AVAILABLE and backend_client):
        print("❌ Backend not available for testing")
        return False
    try:
        print("🧪 Testing backend API functions...")
        # Missing attribute and empty collection are both treated as "no API".
        api_functions = getattr(backend_client, 'fns', None)
        if not api_functions:
            print("❌ No functions found in backend API")
            return False
        print(f"✅ Backend has {len(api_functions)} functions available")
        for idx in api_functions.keys():
            print(f"✅ Function {idx} is available")
        return True
    except Exception as e:
        print(f"❌ Backend API test failed: {e}")
        return False
# Initialize the backend connection once, at import time.
print("🚀 Initializing frontend application...")
result = initialize_backend()
# Report the outcome: success requires both a truthy initialize_backend()
# result and the BACKEND_AVAILABLE flag.
print(
    "✅ Backend connection successful!"
    if result and BACKEND_AVAILABLE
    else "❌ Backend connection failed!"
)
# Create the Gradio interface
print("🎨 Creating Gradio interface...")
+with gr.Blocks(
+    theme=gr.themes.Soft(),
+    title="SpatialTracker V2 - Frontend",
+    css="""
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
+    }
+    .gr-button {
+        margin: 5px;
+    }
+    .gr-form {
+        background: white;
+        border-radius: 10px;
+        padding: 20px;
+        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+    }
+    /* 固定视频上传组件高度 */
+    .gr-video {
+        height: 300px !important;
+        min-height: 300px !important;
+        max-height: 300px !important;
+    }
+    .gr-video video {
+        height: 260px !important;
+        max-height: 260px !important;
+        object-fit: contain !important;
+        background: #f8f9fa;
+    }
+    .gr-video .gr-video-player {
+        height: 260px !important;
+        max-height: 260px !important;
+    }
+    /* 水平滚动的示例视频样式 */
+    .example-videos .gr-examples {
+        overflow: visible !important;
+    }
+    .example-videos .gr-examples .gr-table-wrapper {
+        overflow-x: auto !important;
+        overflow-y: hidden !important;
+        scrollbar-width: thin;
+        scrollbar-color: #667eea #f1f1f1;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar {
+        height: 8px;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
+        background: #f1f1f1;
+        border-radius: 4px;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 4px;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
+        background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
+    }
+    .example-videos .gr-examples .gr-table {
+        display: flex !important;
+        flex-wrap: nowrap !important;
+        min-width: max-content !important;
+        gap: 10px !important;
+    }
+    .example-videos .gr-examples .gr-table tbody {
+        display: flex !important;
+        flex-direction: row !important;
+        flex-wrap: nowrap !important;
+        gap: 10px !important;
+    }
+    .example-videos .gr-examples .gr-table tbody tr {
+        display: flex !important;
+        flex-direction: column !important;
+        min-width: 120px !important;
+        max-width: 120px !important;
+        margin: 0 !important;
+        background: white;
+        border-radius: 8px;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+        transition: all 0.3s ease;
+        cursor: pointer;
+    }
+    .example-videos .gr-examples .gr-table tbody tr:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2);
+    }
+    .example-videos .gr-examples .gr-table tbody tr td {
+        text-align: center !important;
+        padding: 8px !important;
+        border: none !important;
+    }
+    .example-videos .gr-examples .gr-table tbody tr td video {
+        border-radius: 6px !important;
+        width: 100% !important;
+        height: auto !important;
+    }
+    .example-videos .gr-examples .gr-table tbody tr td:last-child {
+        font-size: 12px !important;
+        font-weight: 500 !important;
+        color: #333 !important;
+        padding-top: 4px !important;
+    }
+    /* 新的水平滚动示例视频样式 */
+    .horizontal-examples .gr-examples {
+        overflow: visible !important;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper {
+        overflow-x: auto !important;
+        overflow-y: hidden !important;
+        scrollbar-width: thin;
+        scrollbar-color: #667eea #f1f1f1;
+        padding: 10px 0;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar {
+        height: 8px;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
+        background: #f1f1f1;
+        border-radius: 4px;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 4px;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
+        background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
+    }
+    .horizontal-examples .gr-examples .gr-table {
+        display: flex !important;
+        flex-wrap: nowrap !important;
+        min-width: max-content !important;
+        gap: 15px !important;
+        padding-bottom: 10px;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody {
+        display: flex !important;
+        flex-direction: row !important;
+        flex-wrap: nowrap !important;
+        gap: 15px !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr {
+        display: flex !important;
+        flex-direction: column !important;
+        min-width: 160px !important;
+        max-width: 160px !important;
+        margin: 0 !important;
+        background: white;
+        border-radius: 12px;
+        box-shadow: 0 3px 12px rgba(0,0,0,0.12);
+        transition: all 0.3s ease;
+        cursor: pointer;
+        overflow: hidden;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr:hover {
+        transform: translateY(-4px);
+        box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25);
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td {
+        text-align: center !important;
+        padding: 0 !important;
+        border: none !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td:first-child {
+        padding: 0 !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td video {
+        border-radius: 8px 8px 0 0 !important;
+        width: 100% !important;
+        height: 90px !important;
+        object-fit: cover !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td:last-child {
+        font-size: 11px !important;
+        font-weight: 600 !important;
+        color: #333 !important;
+        padding: 8px 12px !important;
+        background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
+        border-radius: 0 0 8px 8px;
+    }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🎯 SpatialTracker V2 - Frontend Interface
+    Welcome to SpatialTracker V2! This interface allows you to track any pixels in 3D using our model.
+    **Instructions:**
+    1. Upload a video file or select from examples below
+    2. Click on the object you want to track in the first frame
+    3. Adjust tracking parameters if needed
+    4. Click "Launch Visualization" to start tracking
+    """)
+    # Status indicator with more detailed information
+    if BACKEND_AVAILABLE:
+        status_text = "🟢 Backend Connected"
+        status_details = f"Connected to: {BACKEND_SPACE_URL}"
+    else:
+        status_text = "🟡 Running in Standalone Mode"
+        status_details = f"Backend unavailable: {BACKEND_SPACE_URL}"
+    gr.Markdown(f"**Status:** {status_text}")
+    gr.Markdown(f"<small style='color: #666;'>{status_details}</small>", elem_id="status-details")
+    # Example videos section - moved to top
+    # Groups the section heading, a scrolling tip, the video upload widget and
+    # the clickable example gallery under the "example-videos" CSS class.
+    with gr.Group(elem_classes=["example-videos"]):
+        gr.Markdown("### 📂 Example Videos")
+        gr.Markdown("Try these example videos to get started quickly:")
+        # Custom horizontal scrolling video gallery
+        # (static tip banner; the gallery itself is the gr.Examples block below)
+        gr.HTML("""
+        <div style='background-color: #f8f9ff; border-radius: 8px; padding: 10px; margin: 10px 0; border-left: 4px solid #667eea;'>
+            <p style='margin: 0; font-size: 13px; color: #666; display: flex; align-items: center; gap: 8px;'>
+                <span style='font-size: 16px;'>💡</span>
+                <strong>Tip:</strong> Scroll horizontally below to see all example videos
+            </p>
+        </div>
+        """)
+        # Define video_input here so it can be referenced in examples
+        # (it is also the source of the video_input.change event wired further down)
+        video_input = gr.Video(
+            label="Upload Video or Select Example",
+            format="mp4",
+            height=300
+        )
+        # Create a horizontal scrolling container for the examples
+        # The "horizontal-examples" class is the hook targeted by the
+        # `.horizontal-examples .gr-examples ...` rules in the CSS passed to gr.Blocks.
+        with gr.Group(elem_classes=["horizontal-examples"]):
+            # Each example row holds a single value (a video path) that is loaded
+            # into video_input when the example is clicked.
+            # NOTE(review): 18 examples are listed but examples_per_page is 16, so the
+            # last two presumably end up on a second page even though the tip above
+            # promises that scrolling shows all of them - confirm this is intended.
+            gr.Examples(
+                examples=[
+                    ["examples/kiss.mp4"],
+                    ["examples/backpack.mp4"],
+                    ["examples/pillow.mp4"],
+                    ["examples/handwave.mp4"],
+                    ["examples/hockey.mp4"],
+                    ["examples/drifting.mp4"],
+                    ["examples/ken_block_0.mp4"],
+                    ["examples/kitchen.mp4"],
+                    ["examples/basketball.mp4"],
+                    ["examples/ego_kc1.mp4"],
+                    ["examples/vertical_place.mp4"],
+                    ["examples/ego_teaser.mp4"],
+                    ["examples/robot_unitree.mp4"],
+                    ["examples/robot_3.mp4"],
+                    ["examples/teleop2.mp4"],
+                    ["examples/pusht.mp4"],
+                    ["examples/cinema_0.mp4"],
+                    ["examples/cinema_1.mp4"],
+                ],
+                inputs=video_input,
+                label="🎬 Click on any example to load it",
+                examples_per_page=16
+            )
+    # Main two-column row: point selection on the left, tracking output on the right.
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Interactive frame display
+            with gr.Group():
+                gr.Markdown("### 🎯 Point Selection")
+                gr.Markdown("Click on the object you want to track in the frame below:")
+                # Image whose select event is wired to select_point in the
+                # event-handler section; values are exchanged as numpy arrays.
+                interactive_frame = gr.Image(
+                    label="Click to select tracking points",
+                    type="numpy",
+                    interactive=True
+                )
+                with gr.Row():
+                    # Passed to select_point together with each click
+                    point_type = gr.Radio(
+                        choices=["positive_point", "negative_point"],
+                        value="positive_point",
+                        label="Point Type",
+                        info="Positive points indicate the object to track, negative points indicate areas to avoid"
+                    )
+                with gr.Row():
+                    # reset_points_btn -> reset_points, clear_all_btn -> clear_all_with_download
+                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
+                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
+        with gr.Column(scale=1):
+            # Tracking results
+            with gr.Group():
+                gr.Markdown("### 🎬 Tracking Results")
+                # Read-only player; filled by launch_viz as its second output
+                tracking_result_video = gr.Video(
+                    label="Tracking Result Video",
+                    interactive=False,
+                    height=300
+                )
+                # HTML file download component (third output of launch_viz,
+                # also reset by the "Clear All" button)
+                html_download = gr.File(
+                    label="📥 Download 3D Visualization HTML",
+                    interactive=False,
+                    visible=True
+                )
+    # Advanced settings section - changed to open=True
+    # Three sliders whose values are the first three inputs of launch_viz; they are
+    # also outputs of the video-upload and "Clear All" handlers, which can reset them.
+    with gr.Accordion("⚙️ Advanced Settings", open=True):
+        gr.Markdown("Adjust these parameters to optimize tracking performance:")
+        with gr.Row():
+            # Range 10..100 in steps of 10, initial value 50
+            grid_size = gr.Slider(
+                minimum=10,
+                maximum=100,
+                step=10,
+                value=50,
+                label="Grid Size",
+                info="Size of the tracking grid (larger = more detailed)"
+            )
+            # Range 100..2000 in steps of 50, initial value 756.
+            # NOTE(review): 756 is not a multiple of the 50-step grid starting at 100;
+            # confirm the slider keeps this initial value as-is.
+            vo_points = gr.Slider(
+                minimum=100,
+                maximum=2000,
+                step=50,
+                value=756,
+                label="VO Points",
+                info="Number of visual odometry points (more = better accuracy)"
+            )
+            # Range 1..30 in steps of 1, initial value 3
+            fps = gr.Slider(
+                minimum=1,
+                maximum=30,
+                step=1,
+                value=3,
+                label="FPS",
+                info="Frames per second for processing (higher = smoother but slower)"
+            )
+    # Launch button
+    # Its click event runs launch_viz (wired in the event-handler section).
+    with gr.Row():
+        launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg")
+    # 3D Visualization - Make it larger and more prominent
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                gr.Markdown("### 🌐 3D Trajectory Visualization")
+                gr.Markdown("Interactive 3D visualization of 3D point tracking and camera motion:")
+                # HTML slot that launch_viz overwrites as its first output; the initial
+                # value below is a static placeholder card shown before any run.
+                viz_html = gr.HTML(
+                    label="3D Trajectory Visualization",
+                    value="""
+                    <div style='border: 3px solid #667eea; border-radius: 15px; padding: 40px;
+                                background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
+                                text-align: center; min-height: 600px; display: flex;
+                                flex-direction: column; justify-content: center; align-items: center;
+                                box-shadow: 0 8px 32px rgba(102, 126, 234, 0.2);'>
+                        <div style='font-size: 48px; margin-bottom: 20px;'>🌐</div>
+                        <h2 style='color: #667eea; margin-bottom: 15px; font-size: 28px; font-weight: 600;'>
+                            3D Trajectory Visualization
+                        </h2>
+                        <p style='color: #666; font-size: 16px; line-height: 1.6; max-width: 500px; margin-bottom: 25px;'>
+                            Perceive the world with Pixel-wise 3D Motions!
+                        </p>
+                        <div style='background: rgba(102, 126, 234, 0.1); border-radius: 25px;
+                                    padding: 12px 24px; border: 2px solid rgba(102, 126, 234, 0.2);'>
+                            <span style='color: #667eea; font-weight: 600; font-size: 14px;'>
+                                ⚡ Powered by SpatialTracker V2
+                            </span>
+                        </div>
+                    </div>
+                    """,
+                    elem_id="viz_container"
+                )
+    # Hidden state variables
+    # original_image_state starts as None and selected_points as an empty list;
+    # both are passed between the handlers below rather than shown in the UI.
+    original_image_state = gr.State(None)
+    selected_points = gr.State([])
+    # Event handlers
+    # A change of the video (upload, example click, or clear) runs
+    # handle_video_upload, which writes six outputs: the stored state, the
+    # clickable frame, the point list and the three sliders.
+    video_input.change(
+        fn=handle_video_upload,
+        inputs=[video_input],
+        outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
+    )
+    # A click on the frame runs select_point with the stored state, the points
+    # so far and the chosen point type; it refreshes the frame and the point list.
+    interactive_frame.select(
+        fn=select_point,
+        inputs=[original_image_state, selected_points, point_type],
+        outputs=[interactive_frame, selected_points]
+    )
+    # "Reset Points" refreshes the frame and the point list via reset_points.
+    reset_points_btn.click(
+        fn=reset_points,
+        inputs=[original_image_state, selected_points],
+        outputs=[interactive_frame, selected_points]
+    )
+    # "Clear All" takes no inputs; clear_all_with_download is expected to return
+    # one value per output listed here (seven).
+    # NOTE(review): original_image_state, tracking_result_video and viz_html are not
+    # among these outputs - presumably the state is refreshed indirectly through
+    # video_input.change when the video is cleared; confirm, and confirm that the
+    # previous result video / 3D view are meant to stay on screen.
+    clear_all_btn.click(
+        fn=clear_all_with_download,
+        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps, html_download]
+    )
+    # "Start Tracking" feeds the three slider values plus the stored state into
+    # launch_viz and fills the 3D view, the result video and the download slot.
+    # NOTE(review): selected_points is not passed to launch_viz - presumably the
+    # selection reaches it another way (e.g. via the stored state); confirm.
+    launch_btn.click(
+        fn=launch_viz,
+        inputs=[grid_size, vo_points, fps, original_image_state],
+        outputs=[viz_html, tracking_result_video, html_download]
+    )
+    # GitHub star reminder: static HTML banner linking to the project repository
+    gr.HTML("""
+    <div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
+                border-radius: 10px;
+                padding: 15px;
+                margin: 15px 0;
+                box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
+                border: 1px solid rgba(102, 126, 234, 0.15);'>
+        <div style='text-align: center; color: #4a5568;'>
+            <h3 style='margin: 0 0 10px 0; font-size: 18px; text-shadow: none; color: #2d3748;'>
+                ⭐ Love SpatialTracker? Give us a Star! ⭐
+            </h3>
+            <p style='margin: 0 0 12px 0; font-size: 14px; opacity: 0.8; color: #4a5568;'>
+                Help us grow by starring our repository on GitHub! 🚀
+            </p>
+            <div style='display: flex; justify-content: center;'>
+                <a href="https://github.com/henry123-boy/SpaTrackerV2"
+                   target="_blank"
+                   style='display: inline-flex;
+                          align-items: center;
+                          gap: 6px;
+                          background: rgba(102, 126, 234, 0.1);
+                          color: #4a5568;
+                          padding: 8px 16px;
+                          border-radius: 20px;
+                          text-decoration: none;
+                          font-weight: bold;
+                          font-size: 14px;
+                          backdrop-filter: blur(5px);
+                          border: 1px solid rgba(102, 126, 234, 0.2);
+                          transition: all 0.3s ease;'
+                   onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-1px)'"
+                   onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
+                    <span style='font-size: 16px;'>⭐</span>
+                    Star on GitHub
+                </a>
+            </div>
+        </div>
+    </div>
+    """)
+    # Acknowledgment section for TAPIP3D - moved to the end
+    gr.HTML("""
+    <div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%);
+                border-radius: 8px;
+                padding: 12px;
+                margin: 15px 0;
+                box-shadow: 0 1px 4px rgba(255, 193, 7, 0.1);
+                border: 1px solid rgba(255, 193, 7, 0.2);'>
+        <div style='text-align: center; color: #5d4037;'>
+            <h5 style='margin: 0 0 6px 0; font-size: 14px; color: #5d4037;'>
+                Acknowledgments
+            </h5>
+            <p style='margin: 0; font-size: 12px; opacity: 0.9; color: #5d4037; line-height: 1.3;'>
+                Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work!
+            </p>
+            <div style='margin-top: 6px;'>
+                <a href="https://github.com/zbw001/TAPIP3D"
+                   target="_blank"
+                   style='display: inline-flex;
+                          align-items: center;
+                          gap: 3px;
+                          background: rgba(255, 193, 7, 0.15);
+                          color: #5d4037;
+                          padding: 3px 10px;
+                          border-radius: 12px;
+                          text-decoration: none;
+                          font-weight: 500;
+                          font-size: 11px;
+                          border: 1px solid rgba(255, 193, 7, 0.3);
+                          transition: all 0.3s ease;'
+                   onmouseover="this.style.background='rgba(255, 193, 7, 0.2)'"
+                   onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'">
+                    📚 TAPIP3D Repository
+                </a>
+            </div>
+        </div>
+    </div>
+    """)
+# Launch the interface
+if __name__ == "__main__":
+    print("🌟 Launching SpatialTracker V2 Frontend...")
+    print(f"🔗 Backend Status: {'Connected' if BACKEND_AVAILABLE else 'Disconnected'}")
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True,
+        debug=True,
+        show_error=True
+    )