import base64
from io import BytesIO
from typing import Any, Dict

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

MODEL_ID = "IDEA-Research/grounding-dino-base"


class EndpointHandler:
    """Custom Inference Endpoints handler for zero-shot object detection
    with Grounding DINO. Expects a payload of the form
    {"inputs": {"image_b64": "<base64 image>", "prompt": "<text query>"}}
    and returns the first detected bounding box in pixel coordinates."""

    def __init__(self, path: str = ""):
        # Load the processor and model once at startup; use GPU when available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoProcessor.from_pretrained(MODEL_ID)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            MODEL_ID
        ).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        inputs = data.get("inputs")
        if inputs is None:
            return {"error": "No inputs provided"}

        image_b64 = inputs.get("image_b64")
        prompt = inputs.get("prompt")
        if image_b64 is None or prompt is None:
            return {"error": "No image_b64 or prompt provided"}

        # Decode the base64 image and normalise to RGB so RGBA or grayscale
        # inputs do not break the processor.
        image_bytes = BytesIO(base64.b64decode(image_b64))
        image = Image.open(image_bytes).convert("RGB")

        model_inputs = self.processor(
            images=image, text=prompt, return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**model_inputs)

        # target_sizes expects (height, width); PIL's image.size is (width, height).
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            model_inputs.input_ids,
            text_threshold=0.3,
            target_sizes=[image.size[::-1]],
        )

        if len(results) == 0 or len(results[0]["boxes"]) == 0:
            return {"error": "No bounding boxes found"}

        # Post-processed boxes are in (x_min, y_min, x_max, y_max) order;
        # return the first box.
        bbox = results[0]["boxes"][0]
        return {
            "x1": bbox[0].item(),
            "y1": bbox[1].item(),
            "x2": bbox[2].item(),
            "y2": bbox[3].item(),
        }
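

# ---------------------------------------------------------------------------
# Local smoke test: a minimal sketch of how the handler can be exercised
# outside Inference Endpoints. The file name "sample.jpg" and the prompt
# "a cat." are placeholder assumptions for illustration only; they are not
# part of the handler contract above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Read a local image and encode it the same way a client would.
    with open("sample.jpg", "rb") as f:
        sample_b64 = base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler()
    payload = {"inputs": {"image_b64": sample_b64, "prompt": "a cat."}}

    # Prints either the box coordinates or an error dict.
    print(handler(payload))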