Skip to content

Object Detection Agent

ObjectDetectionAgent

Bases: SensoryAgent

An object detection agent that uses a remote object detection model, e.g. YOLOWorld or Grounding DINO, to detect objects in an image.

Source code in mbodied/agents/sense/object_detection_agent.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
class ObjectDetectionAgent(SensoryAgent):
    """An object detection agent backed by a remote object detection server.

    The remote model is chosen per call through ``model_type``, e.g. "YOLOWorld"
    or "Grounding DINO".
    """

    def __init__(
        self,
        model_src="https://api.mbodi.ai/sense/",
        model_kwargs=None,
        **kwargs,
    ) -> None:
        """Initialize the agent by forwarding every argument to ``SensoryAgent``.

        Args:
            model_src: Source of the detection backend. Defaults to
                "https://api.mbodi.ai/sense/".
            model_kwargs: Optional extra model settings, passed through unchanged.
            **kwargs: Additional keyword arguments, passed through unchanged.
        """
        super().__init__(
            model_src=model_src,
            model_kwargs=model_kwargs,
            **kwargs,
        )

    def act(
        self,
        image: Image,
        objects: list[str] | str,
        model_type: str = "YOLOWorld",
        *args,
        api_name: str = "/detect",
        **kwargs,
    ) -> World:
        """Detect the requested objects in an image using the remote server.

        Args:
            image (Image): The image to act on; its ``base64`` form is sent.
            objects (list[str] | str): The objects to detect in the image. A list
                (or tuple) of names is sent as one comma-separated string; a
                string is sent as-is.
            model_type (str): The model type to use for the object detection.
                Options: "YOLOWorld", "Grounding DINO".
            *args: Extra positional arguments forwarded to the remote predict call.
            api_name (str): The name of the API endpoint to use.
            **kwargs: Extra keyword arguments forwarded to the remote predict call.

        Returns:
            World: The world data with the detected objects.

        Raises:
            ValueError: If the remote actor has not been initialized.
        """
        if self.actor is None:
            raise ValueError("Remote actor for agent not initialized.")

        # Sequences of names are collapsed into a single comma-separated string.
        if isinstance(objects, (list, tuple)):
            objects = ",".join(objects)

        # predict returns a pair; only the second element (the detections) is
        # used. The first was originally bound to ``annotated_img`` and never read.
        # Positional extras go before keywords so the call reads in binding order.
        _, json_dict = self.actor.predict(
            image.base64,
            objects,
            *args,
            model_type=model_type,
            api_name=api_name,
            **kwargs,
        )
        return World.model_validate(json_dict)

act(image, objects, model_type='YOLOWorld', *args, api_name='/detect', **kwargs)

Act based on the prompt and image using the remote object detection server.

Parameters:

Name Type Description Default
image Image

The image to act on.

required
objects list[str] | str

The objects to detect in the image.

required
model_type str

The model type to use for the object detection. Options: "YOLOWorld", "Grounding DINO".

'YOLOWorld'
*args

Variable length argument list.

()
api_name str

The name of the API endpoint to use.

'/detect'
**kwargs

Arbitrary keyword arguments.

{}

Returns:

Name Type Description
World World

The world data with the detected objects.

Source code in mbodied/agents/sense/object_detection_agent.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def act(
    self,
    image: Image,
    objects: list[str] | str,
    model_type: str = "YOLOWorld",
    *args,
    api_name: str = "/detect",
    **kwargs,
) -> World:
    """Detect the requested objects in an image using the remote server.

    Args:
        image (Image): The image to act on; its ``base64`` form is sent.
        objects (list[str] | str): The objects to detect in the image. A list
            (or tuple) of names is sent as one comma-separated string; a
            string is sent as-is.
        model_type (str): The model type to use for the object detection.
            Options: "YOLOWorld", "Grounding DINO".
        *args: Extra positional arguments forwarded to the remote predict call.
        api_name (str): The name of the API endpoint to use.
        **kwargs: Extra keyword arguments forwarded to the remote predict call.

    Returns:
        World: The world data with the detected objects.

    Raises:
        ValueError: If the remote actor has not been initialized.
    """
    if self.actor is None:
        raise ValueError("Remote actor for agent not initialized.")

    # Sequences of names are collapsed into a single comma-separated string.
    if isinstance(objects, (list, tuple)):
        objects = ",".join(objects)

    # predict returns a pair; only the second element (the detections) is
    # used. The first was originally bound to ``annotated_img`` and never read.
    # Positional extras go before keywords so the call reads in binding order.
    _, json_dict = self.actor.predict(
        image.base64,
        objects,
        *args,
        model_type=model_type,
        api_name=api_name,
        **kwargs,
    )
    return World.model_validate(json_dict)