import gradio as gr
import supervision as sv
import torch
import spaces

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME

MARKDOWN = """
# Better Florence-2 Playground 🔥
Florence-2 is a lightweight vision-language model open-sourced by Microsoft under
the MIT license. The model demonstrates strong zero-shot and fine-tuning
capabilities across tasks such as captioning, object detection, grounding, and
segmentation. It takes images and task prompts as input and generates the desired
results in text format. Florence-2 uses a DaViT vision encoder to convert images
into visual token embeddings. These are then concatenated with BERT-generated text
embeddings and processed by a transformer-based multi-modal encoder-decoder to
generate the response.
"""
OBJECT_DETECTION_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
]
CAPTION_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
]
OCR_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
]
OCR_WITH_REGION_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg"]
]

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)


@spaces.GPU
def process(checkpoint_dropdown, task_dropdown, image_input):
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]
    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        # detection-style tasks render boxes on the image; the text output stays empty
        return annotate_with_boxes(image_input, detections), None
    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        # caption and OCR tasks return plain text; the image output stays empty
        return None, response[task]


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model",
            info="Select a Florence-2 model to use.")
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task",
            info="Select a task to perform with the model.")
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Image Input')
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)

    def on_dropdown_input(text):
        # show the text box for caption / OCR tasks, the image for detection tasks
        if text in CAPTION_TASK_NAMES + [OCR_TASK_NAME]:
            return [gr.Image(visible=False), gr.Textbox(visible=True)]
        else:
            return [gr.Image(visible=True), gr.Textbox(visible=False)]

    task_dropdown_component.input(
        on_dropdown_input,
        inputs=[task_dropdown_component],
        outputs=[image_output_component, text_output_component])
    # wire the Submit button to the inference function so the app actually runs
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component
        ],
        outputs=[image_output_component, text_output_component])

demo.launch(debug=False, show_error=True, max_threads=1)
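

# ---------------------------------------------------------------------------
# For reference: `run_inference` is imported from utils.models, which is not part
# of this file. The function below is a hedged sketch of what such a helper
# typically looks like for Florence-2, following the public model card usage
# (processor call -> model.generate -> processor.post_process_generation). The
# name `_run_inference_sketch` and the generation parameters are assumptions,
# not the actual utils.models implementation, and nothing in this app calls it.
# ---------------------------------------------------------------------------
def _run_inference_sketch(model, processor, device, image, task_prompt):
    inputs = processor(
        text=task_prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3)
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    # post_process_generation parses the raw text into the task-specific format,
    # e.g. {"<OD>": {"bboxes": [...], "labels": [...]}} for object detection
    response = processor.post_process_generation(
        generated_text, task=task_prompt, image_size=image.size)
    return generated_text, response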