import os
import time
import json
import base64

import numpy as np
import cv2
import onnxruntime as ort
import gradio as gr
from PIL import Image

# -----------------------------
# Config
# -----------------------------
CLASS_NAMES = [
    "Rat", "Ox", "Tiger", "Hare", "Dragon", "Snake",
    "Horse", "Ram", "Monkey", "Bird", "Dog", "Boar",
]

# Keep the model under a writable/cache folder.
# HF Spaces provides a persistent cache mount in many runtimes; at worst the
# model is re-downloaded on restart.
CACHE_DIR = os.environ.get("HF_HOME", ".")
MODEL_DIR = os.path.join(CACHE_DIR, "models", "yolox_nano")
MODEL_NAME = "yolox_nano.onnx"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

# Google Drive direct-download link for the ONNX weights.
MODEL_DOWNLOAD_URL = "https://drive.google.com/uc?id=1xeh3rrIhSqH0BAv7jLCGPZf1waQ4tlXY"

# A cached file at or below this size is treated as a truncated/corrupt download.
_MIN_MODEL_BYTES = 1024 * 1024


def ensure_model():
    """Return the local path of the ONNX model, downloading it if needed.

    A file at ``MODEL_PATH`` larger than ``_MIN_MODEL_BYTES`` is accepted as a
    valid cached copy. Anything else is removed (best effort) and fetched again
    from ``MODEL_DOWNLOAD_URL`` with ``gdown``.

    Returns:
        ``MODEL_PATH``.

    Raises:
        RuntimeError: If the download did not leave a file at ``MODEL_PATH``.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)
    if os.path.exists(MODEL_PATH) and os.path.getsize(MODEL_PATH) > _MIN_MODEL_BYTES:
        return MODEL_PATH

    print(f"[Model] Downloading to: {MODEL_PATH}")
    import gdown  # lazy import: only needed when the model is not cached yet

    # A file that failed the size check is suspected to be corrupt: delete it
    # before re-downloading. Removal is best effort, but no longer silent.
    if os.path.exists(MODEL_PATH):
        try:
            os.remove(MODEL_PATH)
        except OSError as err:
            print(f"[Model] Could not remove stale file {MODEL_PATH}: {err}")

    downloaded = gdown.download(MODEL_DOWNLOAD_URL, MODEL_PATH, quiet=False)
    if downloaded is None or not os.path.exists(MODEL_PATH):
        # Fail here with a clear message instead of letting ONNX Runtime fail
        # later on a missing file.
        raise RuntimeError(
            f"Model download failed: nothing was written to {MODEL_PATH} "
            f"(source: {MODEL_DOWNLOAD_URL})"
        )
    return MODEL_PATH


# ==========================================
# YOLOX INFERENCE CLASS
# ==========================================
class YoloxONNX(object):
    """YOLOX detector backed by an ONNX Runtime CPU session.

    Attributes set in ``__init__``:
        score_thr: Minimum per-class score for a candidate box to be kept.
        nms_thr: IoU threshold above which overlapping boxes are suppressed.
        class_names: Class labels; stored but not read by this class.
        input_shape: ``(height, width)`` the image is letterboxed to.
    """

    def __init__(self, model_path, class_names, score_thr=0.6, nms_thr=0.45, input_shape=(416, 416)):
        """Load the ONNX model and store the detection settings.

        Args:
            model_path: Path of the ``.onnx`` file.
            class_names: Sequence of class labels.
            score_thr: Score threshold used by ``multiclass_nms``.
            nms_thr: IoU threshold used by ``nms_cpu``.
            input_shape: Requested ``(height, width)`` of the network input.
                Ignored when the model declares a static input size.
        """
        self.score_thr = score_thr
        self.nms_thr = nms_thr
        self.class_names = class_names

        print(f"[Model] Loading ONNX from {model_path}...")
        self.session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
        model_input = self.session.get_inputs()[0]
        self.input_name = model_input.name

        # A model exported with a static input size cannot be fed any other
        # size, so remember the declared H/W (None when the dims are dynamic).
        self._static_hw = self._static_input_hw(model_input.shape)
        if self._static_hw is not None:
            print(
                f"[Model] Model input is fixed at {self._static_hw[0]}x{self._static_hw[1]}; "
                "requested input sizes are ignored."
            )
        self.input_shape = input_shape
        self.set_input_shape(input_shape)

    @staticmethod
    def _static_input_hw(shape):
        """Return ``(H, W)`` if ``shape`` declares a static 4-D input, else ``None``.

        Only positive ``int`` entries at indices 2 and 3 count as static; any
        other entry (e.g. a symbolic name or ``None``) is treated as dynamic.
        Assumes the input layout is NCHW, matching what ``preprocess`` builds.
        """
        if shape is None or len(shape) != 4:
            return None
        height, width = shape[2], shape[3]
        if isinstance(height, int) and isinstance(width, int) and height > 0 and width > 0:
            return (height, width)
        return None

    def set_input_shape(self, input_shape):
        """Set the letterbox size, keeping the model's static size if it has one.

        Args:
            input_shape: Requested ``(height, width)``.
        """
        requested = (int(input_shape[0]), int(input_shape[1]))
        self.input_shape = self._static_hw or requested

    def inference(self, image_bgr):
        """Run detection on one image.

        Args:
            image_bgr: Image array as produced by OpenCV (H x W x 3), or a 2-D
                grayscale array.

        Returns:
            ``(boxes_xyxy, scores, class_ids)`` as returned by ``postprocess``.
        """
        input_image, ratio = self.preprocess(image_bgr, self.input_shape)
        ort_inputs = {self.input_name: input_image[None, :, :, :]}  # add batch dim
        outputs = self.session.run(None, ort_inputs)
        predictions = outputs[0][0]  # batch=1
        return self.postprocess(predictions, ratio)

    def preprocess(self, image, input_size):
        """Letterbox ``image`` into ``input_size`` and convert it to CHW float32.

        The image is resized with its aspect ratio preserved, pasted into the
        top-left corner of a canvas filled with the value 114, and transposed
        to channel-first order. Pixel values are not normalized.

        Args:
            image: H x W x 3 array, or a 2-D grayscale array (expanded to three
                channels).
            input_size: Target ``(height, width)``.

        Returns:
            ``(chw_image, r)`` where ``r`` is the resize ratio that was applied.
        """
        if image.ndim == 2:
            # The original grayscale branch built a 2-D canvas and then failed
            # in transpose(2, 0, 1); give the network three channels instead.
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

        target_h, target_w = input_size[0], input_size[1]
        padded_img = np.full((target_h, target_w, 3), 114, dtype=np.uint8)

        r = min(target_h / image.shape[0], target_w / image.shape[1])
        # Clamp to 1 px so extreme aspect ratios do not ask cv2 for a 0-size image.
        new_h = max(1, int(image.shape[0] * r))
        new_w = max(1, int(image.shape[1] * r))
        resized_img = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        padded_img[:new_h, :new_w] = resized_img

        chw = padded_img.transpose(2, 0, 1)  # HWC -> CHW
        return np.ascontiguousarray(chw, dtype=np.float32), r

    def postprocess(self, predictions, ratio):
        """Convert raw predictions into final detections in original-image pixels.

        Each row of ``predictions`` is read as ``[cx, cy, w, h, s, c0, c1, ...]``:
        the first four columns are converted from centre/size to corner form,
        and column 4 is multiplied into the per-class columns (5 onward) to get
        the class scores.

        NOTE(review): no grid/stride decoding is applied here, so this assumes
        the model output is already in input-pixel coordinates - TODO confirm
        against how the ONNX file was exported.

        Args:
            predictions: 2-D array of raw predictions for one image.
            ratio: Resize ratio returned by ``preprocess``; boxes are divided
                by it to map back to the original image.

        Returns:
            ``(boxes, scores, class_ids)`` arrays, or three empty lists when
            nothing passes the thresholds.
        """
        boxes = predictions[:, :4]
        scores = predictions[:, 4:5] * predictions[:, 5:]

        half_wh = boxes[:, 2:4] / 2.0
        boxes_xyxy = np.concatenate([boxes[:, :2] - half_wh, boxes[:, :2] + half_wh], axis=1)
        boxes_xyxy /= ratio

        dets = self.multiclass_nms(boxes_xyxy, scores, nms_thr=self.nms_thr, score_thr=self.score_thr)
        if dets is None:
            return [], [], []

        return dets[:, :4], dets[:, 4], dets[:, 5]

    def multiclass_nms(self, boxes, scores, nms_thr, score_thr):
        """Apply score filtering and NMS independently for every class.

        Args:
            boxes: N x 4 array of ``[x1, y1, x2, y2]`` boxes.
            scores: N x C array of per-class scores.
            nms_thr: IoU threshold passed to ``nms_cpu``.
            score_thr: Boxes with a class score not above this are dropped.

        Returns:
            Array with rows ``[x1, y1, x2, y2, score, class_index]``, or
            ``None`` when no box survives.
        """
        final_dets = []
        num_classes = scores.shape[1]
        for cls_ind in range(num_classes):
            cls_scores = scores[:, cls_ind]
            valid_score_mask = cls_scores > score_thr
            if not valid_score_mask.any():
                continue

            valid_boxes = boxes[valid_score_mask]
            valid_scores = cls_scores[valid_score_mask]
            for i in self.nms_cpu(valid_boxes, valid_scores, nms_thr):
                final_dets.append([*valid_boxes[i], valid_scores[i], cls_ind])

        if not final_dets:
            return None
        return np.array(final_dets)

    def nms_cpu(self, boxes, scores, nms_thr):
        """Greedy non-maximum suppression for a single class.

        Repeatedly keeps the highest-scoring remaining box and discards every
        other box whose IoU with it exceeds ``nms_thr``. Widths and heights use
        the ``+ 1`` pixel-inclusive convention.

        Args:
            boxes: N x 4 array of ``[x1, y1, x2, y2]`` boxes.
            scores: Length-N array of scores.
            nms_thr: IoU threshold.

        Returns:
            List of indices into ``boxes`` that are kept, best score first.
        """
        x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            rest = order[1:]

            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (areas[i] + areas[rest] - inter)

            inds = np.where(ovr <= nms_thr)[0]
            order = order[inds + 1]  # +1: `inds` indexes `rest`, which skips order[0]

        return keep


# -----------------------------
# Load model once, on first use
# -----------------------------
_detector = None


def get_detector(score_thr=0.6, nms_thr=0.45, input_size=416):
    """Return the shared detector, configured with the given settings.

    The ONNX session is created on the first call and reused afterwards. The
    thresholds and input size are applied on every call; previously they were
    only honoured when the detector was first created, so later changes to the
    UI controls had no effect.

    NOTE(review): this mutates the shared instance, which assumes requests are
    handled one at a time - confirm against the Gradio queue's concurrency
    settings.

    Args:
        score_thr: Score threshold for detections.
        nms_thr: IoU threshold for NMS.
        input_size: Side length of the square network input. Ignored by the
            detector when the model declares a static input size.

    Returns:
        The shared ``YoloxONNX`` instance.
    """
    global _detector
    size = int(input_size)
    if _detector is None:
        _detector = YoloxONNX(
            ensure_model(),
            CLASS_NAMES,
            score_thr=float(score_thr),
            nms_thr=float(nms_thr),
            input_shape=(size, size),
        )
    else:
        _detector.score_thr = float(score_thr)
        _detector.nms_thr = float(nms_thr)
        _detector.set_input_shape((size, size))
    return _detector


def draw_detections(img_bgr, results):
    """Return a copy of ``img_bgr`` with a box and caption drawn per detection.

    Args:
        img_bgr: Image array to annotate; it is not modified.
        results: Iterable of dicts with ``"box"`` (``[x1, y1, x2, y2]``),
            ``"label"`` and ``"score"`` keys.

    Returns:
        The annotated copy.
    """
    out = img_bgr.copy()
    for r in results:
        x1, y1, x2, y2 = map(int, r["box"])
        label = r["label"]
        score = r["score"]
        cv2.rectangle(out, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(
            out,
            f"{label} {score:.2f}",
            (x1, max(0, y1 - 8)),  # keep the caption inside the top edge
            cv2.FONT_HERSHEY_SIMPLEX,
            0.6,
            (0, 255, 0),
            2,
            cv2.LINE_AA,
        )
    return out


def predict_ui(image_pil, score_thr, nms_thr, input_size):
    """Gradio handler: detect hand seals in an uploaded image.

    Args:
        image_pil: PIL image from the input component, or ``None`` when no
            image was provided.
        score_thr: Score threshold from the UI.
        nms_thr: NMS IoU threshold from the UI.
        input_size: Network input size from the UI.

    Returns:
        ``(annotated_pil, table, meta_json)``: the annotated image, one row
        ``[label, score, x1, y1, x2, y2]`` per detection, and a JSON string
        holding the inference time and the detections. Returns
        ``(None, [], "{}")`` when ``image_pil`` is ``None``.
    """
    if image_pil is None:
        return None, [], "{}"

    # Force 3-channel RGB (an alpha channel would break the letterbox paste),
    # then convert to BGR for the OpenCV-based pipeline.
    img_rgb = np.array(image_pil.convert("RGB"))
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

    detector = get_detector(score_thr=score_thr, nms_thr=nms_thr, input_size=input_size)

    t0 = time.perf_counter()  # monotonic clock: safe for measuring durations
    boxes, scores, class_ids = detector.inference(img_bgr)
    dt_ms = (time.perf_counter() - t0) * 1000

    results = []
    for box, score, class_id in zip(boxes, scores, class_ids):
        label_id = int(class_id)
        # Fall back to a generic label rather than raising IndexError if the
        # model emits a class id outside CLASS_NAMES.
        if 0 <= label_id < len(CLASS_NAMES):
            label = CLASS_NAMES[label_id]
        else:
            label = f"class_{label_id}"
        results.append({
            "label": label,
            "score": float(score),
            "box": [float(box[0]), float(box[1]), float(box[2]), float(box[3])],
        })

    annotated_bgr = draw_detections(img_bgr, results)
    annotated_rgb = cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB)
    annotated_pil = Image.fromarray(annotated_rgb)

    table = [[r["label"], r["score"], *r["box"]] for r in results]

    meta = {"inference_ms": round(dt_ms, 2), "num_detections": len(results), "results": results}
    return annotated_pil, table, json.dumps(meta, indent=2)


# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="Shinobi Hand Seal Detector (YOLOX Nano ONNX)") as demo:
    gr.Markdown(
        "Upload an image. The Space runs YOLOX Nano (ONNXRuntime CPU) and returns detected hand seals."
    )

    with gr.Row():
        inp = gr.Image(type="pil", label="Input Image")
        out_img = gr.Image(type="pil", label="Annotated Output")

    with gr.Row():
        score_thr = gr.Slider(0.1, 0.95, value=0.6, step=0.05, label="Score threshold")
        nms_thr = gr.Slider(0.1, 0.95, value=0.45, step=0.05, label="NMS threshold")
        input_size = gr.Radio([320, 416, 512], value=416, label="Input size")

    btn = gr.Button("Detect")

    out_table = gr.Dataframe(
        headers=["label", "score", "x1", "y1", "x2", "y2"],
        datatype=["str", "number", "number", "number", "number", "number"],
        label="Detections",
        interactive=False,
    )
    out_json = gr.Code(label="Raw JSON", language="json")

    btn.click(
        predict_ui,
        inputs=[inp, score_thr, nms_thr, input_size],
        outputs=[out_img, out_table, out_json],
        api_name="detect",  # endpoint name for this event in the Gradio API
    )

demo.queue().launch(show_error=True)