SigLIP (Sigmoid Loss for Language Image Pre-Training)

SigLIP (Sigmoid Loss for Language Image Pre-Training) is a vision-language model that builds upon the principles of CLIP but introduces a key change: it uses a sigmoid loss function instead of the softmax-based contrastive loss. There are also some slight implementation differences: no attention_mask for the text encoder, padded text inputs, and multi-head attention pooling (MAP) for the vision encoder rather than a linear projection layer.

This modification simplifies the training objective by treating the problem as a binary classification for each image-text pair (i.e., are they a positive or negative match?). This approach avoids the need for a global normalization over all pairs in a batch, which makes it more scalable and robust to noisy, web-scale data.
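As a concrete illustration (a minimal sketch, not part of this library's API), the sigmoid loss over a batch can be written directly from the pairwise logit matrix; the function name and signature here are assumptions for exposition:

import jax
import jax.numpy as jnp

def siglip_loss(logits: jax.Array) -> jax.Array:
    """Pairwise sigmoid loss: diagonal entries are positive pairs, everything else is negative."""
    n = logits.shape[0]
    # +1 for matching image-text pairs (the diagonal), -1 for all other pairs in the batch
    labels = 2.0 * jnp.eye(n) - 1.0
    # Binary log-likelihood of every pair, summed over the batch x batch grid, averaged per image
    return -jnp.sum(jax.nn.log_sigmoid(labels * logits)) / n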

Key features of SigLIP:

1. Vision Encoder: A Vision Transformer (ViT) with a Multi-Head Attention Pooling (MAP) head.
2. Text Encoder: A standard Transformer model.
3. Sigmoid Loss: Enables training on larger batches and noisier datasets without requiring careful data curation or complex negative sampling strategies.

SigLIP was introduced in the paper "Sigmoid Loss for Language Image Pre-Training" and has demonstrated improved performance and training efficiency relative to softmax-based contrastive pre-training.
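A minimal usage sketch (the checkpoint ID, sequence length, and use_pytorch flag are assumptions, not prescribed by this page):

import jax.numpy as jnp
from jimm.models.siglip import SigLIP

# Hypothetical HuggingFace checkpoint; set use_pytorch=True when the weights are PyTorch-format
model = SigLIP.from_pretrained("google/siglip-base-patch16-224", use_pytorch=True)

images = jnp.zeros((2, 224, 224, 3))          # preprocessed images, NHWC layout
texts = jnp.zeros((2, 64), dtype=jnp.int32)   # token ids padded to the model's context length

logits = model(images, texts)                  # [2, 2] pairwise similarity logits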

jimm.models.siglip.SigLIPVisionModel

Bases: Module

Source code in src/jimm/models/siglip/siglip_model.py
class SigLIPVisionModel(nnx.Module):
    def __init__(
        self,
        image_resolution: int,
        vision_layers: int,
        vision_width: int,
        vision_patch_size: int,
        use_gradient_checkpointing: bool = False,
        rngs: rnglib.Rngs = nnx.Rngs(0),
        dtype: DTypeLike = jnp.float32,
        param_dtype: DTypeLike = jnp.float32,
        mesh: Mesh | None = None,
        mesh_rules: MeshRules = DEFAULT_SHARDING,
    ):
        """Initialize the SigLIP Vision Encoder.

        Args:
            image_resolution (int): The resolution of the input images.
            vision_layers (int): The number of layers in the vision transformer.
            vision_width (int): The width of the vision transformer.
            vision_patch_size (int): The patch size of the vision transformer.
            use_gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
            rngs (rnglib.Rngs, optional): The random number generator state. Defaults to nnx.Rngs(0).
            dtype (DTypeLike, optional): The data type for computations. Defaults to jnp.float32.
            param_dtype (DTypeLike, optional): The data type for parameters. Defaults to jnp.float32.
            mesh (Mesh | None, optional): The device mesh for parameter sharding. Defaults to None.
            mesh_rules (MeshRules, optional): Logical axis sharding rules. Defaults to DEFAULT_SHARDING.
        """
        self.vision_layers = vision_layers
        self.vision_width = vision_width
        self.vision_patch_size = vision_patch_size
        self.dtype = dtype

        vision_heads = vision_width // 64

        self.encoder = VisionTransformerBase(
            img_size=image_resolution,
            patch_size=vision_patch_size,
            in_channels=3,
            hidden_size=vision_width,
            num_layers=vision_layers,
            num_heads=vision_heads,
            mlp_dim=vision_width * 4,
            use_pre_norm=False,
            use_patch_bias=True,
            use_quick_gelu=False,
            use_gradient_checkpointing=use_gradient_checkpointing,
            pooling_type="MAP",
            layernorm_epsilon=1e-6,
            dtype=dtype,
            param_dtype=param_dtype,
            mesh=mesh,
            rngs=rngs,
            mesh_rules=mesh_rules,
        )

    def __call__(self, image: Float[Array, "batch height width channels"], do_projection: bool = True) -> Float[Array, "batch vision_width"]:
        """Encode images into embeddings.

        Args:
            image (Float[Array, "batch height width channels"]): Batch of input images.
            do_projection (bool): Included for API compatibility with CLIP. SigLIP vision model doesn't have a projection layer. Defaults to True.

        Returns:
            Float[Array, "batch vision_width"]: Image embeddings.
        """
        return self.encoder(image)

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path: str,
        use_pytorch: bool = False,
        mesh: Mesh | None = None,
        dtype: DTypeLike = jnp.float32,
        param_dtype: DTypeLike = jnp.float32,
        use_gradient_checkpointing: bool = False,
        rngs: rnglib.Rngs = nnx.Rngs(0),
    ) -> "SigLIPVisionModel":
        """Load a pretrained vision encoder from a SigLIP checkpoint.

        Args:
            model_name_or_path (str): Path to local weights or HuggingFace model ID.
            use_pytorch (bool): Whether to load from PyTorch weights. Defaults to False.
            mesh (Mesh | None): Optional device mesh for parameter sharding. Defaults to None.
            dtype (DTypeLike): Data type for computations. Defaults to jnp.float32.
            param_dtype (DTypeLike): Data type for parameters. Defaults to jnp.float32.
            use_gradient_checkpointing (bool): Whether to use gradient checkpointing. Defaults to False.
            rngs (rnglib.Rngs): Random number generator keys. Defaults to nnx.Rngs(0).

        Returns:
            SigLIPVisionModel: Pretrained SigLIP vision model
        """
        from .params import load_vision_from_pretrained

        return load_vision_from_pretrained(cls, model_name_or_path, use_pytorch, mesh, dtype, param_dtype, use_gradient_checkpointing, rngs)

    def save_pretrained(self, save_directory: str) -> None:
        """Save model weights and config in HuggingFace format.

        Args:
            save_directory (str): Directory path where the model will be saved.
        """
        from .params import save_vision_pretrained

        save_vision_pretrained(self, save_directory)
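For orientation, a sketch constructing the vision encoder from scratch with ViT-B/16-style hyperparameters (the exact values are illustrative assumptions):

import jax.numpy as jnp
from jimm.models.siglip import SigLIPVisionModel

vision_model = SigLIPVisionModel(
    image_resolution=224,   # input images are 224x224
    vision_layers=12,       # transformer depth
    vision_width=768,       # hidden size; the number of heads is derived as vision_width // 64
    vision_patch_size=16,   # 16x16 patches
)

images = jnp.zeros((4, 224, 224, 3))    # dummy NHWC batch
embeddings = vision_model(images)       # [4, 768] MAP-pooled image embeddings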

__call__(image, do_projection=True)

Encode images into embeddings.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| image | Float[Array, 'batch height width channels'] | Batch of input images. | required |
| do_projection | bool | Included for API compatibility with CLIP. SigLIP vision model doesn't have a projection layer. | True |

Returns:

| Type | Description |
|------|-------------|
| Float[Array, 'batch vision_width'] | Image embeddings. |

Source code in src/jimm/models/siglip/siglip_model.py
def __call__(self, image: Float[Array, "batch height width channels"], do_projection: bool = True) -> Float[Array, "batch vision_width"]:
    """Encode images into embeddings.

    Args:
        image (Float[Array, "batch height width channels"]): Batch of input images.
        do_projection (bool): Included for API compatibility with CLIP. SigLIP vision model doesn't have a projection layer. Defaults to True.

    Returns:
        Float[Array, "batch vision_width"]: Image embeddings.
    """
    return self.encoder(image)

__init__(image_resolution, vision_layers, vision_width, vision_patch_size, use_gradient_checkpointing=False, rngs=nnx.Rngs(0), dtype=jnp.float32, param_dtype=jnp.float32, mesh=None, mesh_rules=DEFAULT_SHARDING)

Initialize the SigLIP Vision Encoder.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| image_resolution | int | The resolution of the input images. | required |
| vision_layers | int | The number of layers in the vision transformer. | required |
| vision_width | int | The width of the vision transformer. | required |
| vision_patch_size | int | The patch size of the vision transformer. | required |
| use_gradient_checkpointing | bool | Whether to use gradient checkpointing. | False |
| rngs | Rngs | The random number generator state. | Rngs(0) |
| dtype | DTypeLike | The data type for computations. | float32 |
| param_dtype | DTypeLike | The data type for parameters. | float32 |
| mesh | Mesh \| None | The device mesh for parameter sharding. | None |
| mesh_rules | MeshRules | Logical axis sharding rules. | DEFAULT_SHARDING |
Source code in src/jimm/models/siglip/siglip_model.py
def __init__(
    self,
    image_resolution: int,
    vision_layers: int,
    vision_width: int,
    vision_patch_size: int,
    use_gradient_checkpointing: bool = False,
    rngs: rnglib.Rngs = nnx.Rngs(0),
    dtype: DTypeLike = jnp.float32,
    param_dtype: DTypeLike = jnp.float32,
    mesh: Mesh | None = None,
    mesh_rules: MeshRules = DEFAULT_SHARDING,
):
    """Initialize the SigLIP Vision Encoder.

    Args:
        image_resolution (int): The resolution of the input images.
        vision_layers (int): The number of layers in the vision transformer.
        vision_width (int): The width of the vision transformer.
        vision_patch_size (int): The patch size of the vision transformer.
        use_gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
        rngs (rnglib.Rngs, optional): The random number generator state. Defaults to nnx.Rngs(0).
        dtype (DTypeLike, optional): The data type for computations. Defaults to jnp.float32.
        param_dtype (DTypeLike, optional): The data type for parameters. Defaults to jnp.float32.
        mesh (Mesh | None, optional): The device mesh for parameter sharding. Defaults to None.
        mesh_rules (MeshRules, optional): Logical axis sharding rules. Defaults to DEFAULT_SHARDING.
    """
    self.vision_layers = vision_layers
    self.vision_width = vision_width
    self.vision_patch_size = vision_patch_size
    self.dtype = dtype

    vision_heads = vision_width // 64

    self.encoder = VisionTransformerBase(
        img_size=image_resolution,
        patch_size=vision_patch_size,
        in_channels=3,
        hidden_size=vision_width,
        num_layers=vision_layers,
        num_heads=vision_heads,
        mlp_dim=vision_width * 4,
        use_pre_norm=False,
        use_patch_bias=True,
        use_quick_gelu=False,
        use_gradient_checkpointing=use_gradient_checkpointing,
        pooling_type="MAP",
        layernorm_epsilon=1e-6,
        dtype=dtype,
        param_dtype=param_dtype,
        mesh=mesh,
        rngs=rngs,
        mesh_rules=mesh_rules,
    )

from_pretrained(model_name_or_path, use_pytorch=False, mesh=None, dtype=jnp.float32, param_dtype=jnp.float32, use_gradient_checkpointing=False, rngs=nnx.Rngs(0)) classmethod

Load a pretrained vision encoder from a SigLIP checkpoint.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| model_name_or_path | str | Path to local weights or HuggingFace model ID. | required |
| use_pytorch | bool | Whether to load from PyTorch weights. | False |
| mesh | Mesh \| None | Optional device mesh for parameter sharding. | None |
| dtype | DTypeLike | Data type for computations. | float32 |
| param_dtype | DTypeLike | Data type for parameters. | float32 |
| use_gradient_checkpointing | bool | Whether to use gradient checkpointing. | False |
| rngs | Rngs | Random number generator keys. | Rngs(0) |

Returns:

| Name | Type | Description |
|------|------|-------------|
| SigLIPVisionModel | SigLIPVisionModel | Pretrained SigLIP vision model. |

Source code in src/jimm/models/siglip/siglip_model.py
@classmethod
def from_pretrained(
    cls,
    model_name_or_path: str,
    use_pytorch: bool = False,
    mesh: Mesh | None = None,
    dtype: DTypeLike = jnp.float32,
    param_dtype: DTypeLike = jnp.float32,
    use_gradient_checkpointing: bool = False,
    rngs: rnglib.Rngs = nnx.Rngs(0),
) -> "SigLIPVisionModel":
    """Load a pretrained vision encoder from a SigLIP checkpoint.

    Args:
        model_name_or_path (str): Path to local weights or HuggingFace model ID.
        use_pytorch (bool): Whether to load from PyTorch weights. Defaults to False.
        mesh (Mesh | None): Optional device mesh for parameter sharding. Defaults to None.
        dtype (DTypeLike): Data type for computations. Defaults to jnp.float32.
        param_dtype (DTypeLike): Data type for parameters. Defaults to jnp.float32.
        use_gradient_checkpointing (bool): Whether to use gradient checkpointing. Defaults to False.
        rngs (rnglib.Rngs): Random number generator keys. Defaults to nnx.Rngs(0).

    Returns:
        SigLIPVisionModel: Pretrained SigLIP vision model
    """
    from .params import load_vision_from_pretrained

    return load_vision_from_pretrained(cls, model_name_or_path, use_pytorch, mesh, dtype, param_dtype, use_gradient_checkpointing, rngs)
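A loading sketch (the model ID and use_pytorch flag are assumptions):

from jimm.models.siglip import SigLIPVisionModel

# Load only the vision tower from a SigLIP checkpoint (hypothetical HuggingFace ID)
vision_model = SigLIPVisionModel.from_pretrained(
    "google/siglip-base-patch16-224",
    use_pytorch=True,  # set when converting from PyTorch weights
)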

save_pretrained(save_directory)

Save model weights and config in HuggingFace format.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| save_directory | str | Directory path where the model will be saved. | required |
Source code in src/jimm/models/siglip/siglip_model.py
def save_pretrained(self, save_directory: str) -> None:
    """Save model weights and config in HuggingFace format.

    Args:
        save_directory (str): Directory path where the model will be saved.
    """
    from .params import save_vision_pretrained

    save_vision_pretrained(self, save_directory)
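A round-trip sketch, assuming the default loader can read back what save_pretrained writes (the directory path is arbitrary):

# Save the vision encoder from the earlier sketch in HuggingFace format, then reload it from disk
vision_model.save_pretrained("./siglip-vision-ckpt")
restored = SigLIPVisionModel.from_pretrained("./siglip-vision-ckpt")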

jimm.models.siglip.SigLIP

Bases: Module

Source code in src/jimm/models/siglip/siglip_model.py
class SigLIP(nnx.Module):
    def __init__(
        self,
        image_resolution: int,
        vision_layers: int,
        vision_width: int,
        vision_patch_size: int,
        context_length: int,
        vocab_size: int,
        transformer_width: int,
        transformer_heads: int,
        transformer_layers: int,
        use_gradient_checkpointing: bool = False,
        rngs: rnglib.Rngs = nnx.Rngs(0),
        dtype: DTypeLike = jnp.float32,
        param_dtype: DTypeLike = jnp.float32,
        mesh: Mesh | None = None,
        mesh_rules: MeshRules = DEFAULT_SHARDING,
    ):
        """Initialize the SigLIP model.

        Args:
            image_resolution (int): The resolution of the input images.
            vision_layers (int): The number of layers in the vision transformer.
            vision_width (int): The width of the vision transformer.
            vision_patch_size (int): The patch size of the vision transformer.
            context_length (int): The length of the context.
            vocab_size (int): The size of the vocabulary.
            transformer_width (int): The width of the transformer.
            transformer_heads (int): The number of attention heads in the transformer.
            transformer_layers (int): The number of layers in the transformer.
            use_gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
            rngs (rnglib.Rngs, optional): The random number generator state. Defaults to nnx.Rngs(0).
            dtype (DTypeLike, optional): The data type for computations. Defaults to jnp.float32.
            param_dtype (DTypeLike, optional): The data type for parameters. Defaults to jnp.float32.
            mesh (Mesh | None, optional): Optional device mesh for parameter sharding. Defaults to None.
            mesh_rules (MeshRules, optional): Logical axis sharding rules. Defaults to DEFAULT_SHARDING.
        """
        self.vision_layers = vision_layers
        self.vision_width = vision_width
        self.vision_patch_size = vision_patch_size
        self.context_length = context_length
        self.vocab_size = vocab_size
        self.transformer_width = transformer_width
        self.transformer_heads = transformer_heads
        self.transformer_layers = transformer_layers
        self.dtype = dtype
        self._original_config = None

        self.vision_heads = vision_width // 64
        self.vision_model = SigLIPVisionModel(
            image_resolution=image_resolution,
            vision_layers=vision_layers,
            vision_width=vision_width,
            vision_patch_size=vision_patch_size,
            use_gradient_checkpointing=use_gradient_checkpointing,
            rngs=rngs,
            dtype=dtype,
            param_dtype=param_dtype,
            mesh=mesh,
            mesh_rules=mesh_rules,
        )

        self.text_model = SigLIPTextModel(
            context_length=context_length,
            vocab_size=vocab_size,
            transformer_width=transformer_width,
            transformer_heads=transformer_heads,
            transformer_layers=transformer_layers,
            use_gradient_checkpointing=use_gradient_checkpointing,
            rngs=rngs,
            dtype=dtype,
            param_dtype=param_dtype,
            mesh=mesh,
            mesh_rules=mesh_rules,
        )

        self.logit_scale = nnx.Param(nnx.with_partitioning(nnx.initializers.ones_init(), ())(rngs.params(), ()))
        self.logit_bias = nnx.Param(nnx.with_partitioning(nnx.initializers.ones_init(), ())(rngs.params(), ()))

    def encode_image(self, image: Float[Array, "batch height width channels"]) -> Float[Array, "batch transformer_width"]:
        """Encode images into embeddings.

        Args:
            image (Float[Array, "batch height width channels"]): Batch of input images.

        Returns:
            Float[Array, "batch transformer_width"]: Image embeddings.
        """
        return self.vision_model(image)

    def encode_text(self, text: Int[Array, "batch context_length"]) -> Float[Array, "batch transformer_width"]:
        """Encode text tokens into embeddings.

        Args:
            text (Int[Array, "batch context_length"]): Batch of token sequences.

        Returns:
            Float[Array, "batch transformer_width"]: Text embeddings.
        """
        return self.text_model(text)

    def __call__(self, image: Float[Array, "batch height width channels"], text: Int[Array, "batch context_length"]) -> Float[Array, "batch batch"]:
        """Calculate similarity between image and text embeddings.

        Args:
            image (Float[Array, "batch height width channels"]): Batch of input images.
            text (Int[Array, "batch context_length"]): Batch of token sequences.

        Returns:
            Float[Array, "batch batch"]: Similarity scores between all pairs of images and texts.
        """
        image_features: Float[Array, "batch transformer_width"] = self.encode_image(image)
        text_features: Float[Array, "batch transformer_width"] = self.encode_text(text)

        image_features: Float[Array, "batch transformer_width"] = image_features / jnp.linalg.norm(image_features, axis=-1, keepdims=True)
        text_features: Float[Array, "batch transformer_width"] = text_features / jnp.linalg.norm(text_features, axis=-1, keepdims=True)

        logit_scale: Float[Array, ""] = jnp.exp(self.logit_scale.value)
        logits: Float[Array, "batch batch"] = logit_scale * image_features @ text_features.T + self.logit_bias.value
        return logits

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path: str,
        use_pytorch: bool = False,
        mesh: Mesh | None = None,
        dtype: DTypeLike = jnp.float32,
        param_dtype: DTypeLike = jnp.float32,
        use_gradient_checkpointing: bool = False,
        rngs: rnglib.Rngs = nnx.Rngs(0),
    ) -> "SigLIP":
        """Load a pretrained SigLIP model from a local path or HuggingFace Hub.

        Args:
            model_name_or_path (str): Path to local weights or HuggingFace model ID.
            use_pytorch (bool): Whether to load from PyTorch weights. Defaults to False.
            mesh (Mesh | None): Optional device mesh for parameter sharding. Defaults to None.
            dtype (DTypeLike): Data type for computations. Defaults to jnp.float32.
            param_dtype (DTypeLike): Data type for parameters. Defaults to jnp.float32.
            use_gradient_checkpointing (bool): Whether to use gradient checkpointing. Defaults to False.
            rngs (rnglib.Rngs): Random number generator keys. Defaults to nnx.Rngs(0).

        Returns:
            SigLIP: Pretrained SigLIP model
        """
        from .params import load_from_pretrained

        return load_from_pretrained(cls, model_name_or_path, use_pytorch, mesh, dtype, param_dtype, use_gradient_checkpointing, rngs)

    def save_pretrained(self, save_directory: str):
        """Save the model weights and config in HuggingFace format.

        Args:
            save_directory (str): Directory path where the model will be saved.
        """
        from .params import save_pretrained

        save_pretrained(self, save_directory)
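A construction sketch with base-sized hyperparameters (all values are illustrative assumptions, not checkpoint-exact):

from jimm.models.siglip import SigLIP

model = SigLIP(
    image_resolution=224,
    vision_layers=12,
    vision_width=768,
    vision_patch_size=16,
    context_length=64,      # padded text length
    vocab_size=32000,       # tokenizer vocabulary size
    transformer_width=768,
    transformer_heads=12,
    transformer_layers=12,
)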

__call__(image, text)

Calculate similarity between image and text embeddings.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| image | Float[Array, 'batch height width channels'] | Batch of input images. | required |
| text | Int[Array, 'batch context_length'] | Batch of token sequences. | required |

Returns:

| Type | Description |
|------|-------------|
| Float[Array, 'batch batch'] | Similarity scores between all pairs of images and texts. |

Source code in src/jimm/models/siglip/siglip_model.py
def __call__(self, image: Float[Array, "batch height width channels"], text: Int[Array, "batch context_length"]) -> Float[Array, "batch batch"]:
    """Calculate similarity between image and text embeddings.

    Args:
        image (Float[Array, "batch height width channels"]): Batch of input images.
        text (Int[Array, "batch context_length"]): Batch of token sequences.

    Returns:
        Float[Array, "batch batch"]: Similarity scores between all pairs of images and texts.
    """
    image_features: Float[Array, "batch transformer_width"] = self.encode_image(image)
    text_features: Float[Array, "batch transformer_width"] = self.encode_text(text)

    image_features: Float[Array, "batch transformer_width"] = image_features / jnp.linalg.norm(image_features, axis=-1, keepdims=True)
    text_features: Float[Array, "batch transformer_width"] = text_features / jnp.linalg.norm(text_features, axis=-1, keepdims=True)

    logit_scale: Float[Array, ""] = jnp.exp(self.logit_scale.value)
    logits: Float[Array, "batch batch"] = logit_scale * image_features @ text_features.T + self.logit_bias.value
    return logits
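Because every image-text pair is scored independently, the logits above map to probabilities with a plain element-wise sigmoid rather than a softmax over the batch. A short sketch, reusing a model built or loaded as in the earlier examples:

import jax
import jax.numpy as jnp

images = jnp.zeros((2, 224, 224, 3))
texts = jnp.zeros((2, 64), dtype=jnp.int32)

logits = model(images, texts)    # [2, 2] pairwise logits
probs = jax.nn.sigmoid(logits)   # independent match probability for each image-text pair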

__init__(image_resolution, vision_layers, vision_width, vision_patch_size, context_length, vocab_size, transformer_width, transformer_heads, transformer_layers, use_gradient_checkpointing=False, rngs=nnx.Rngs(0), dtype=jnp.float32, param_dtype=jnp.float32, mesh=None, mesh_rules=DEFAULT_SHARDING)

Initialize the SigLIP model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| image_resolution | int | The resolution of the input images. | required |
| vision_layers | int | The number of layers in the vision transformer. | required |
| vision_width | int | The width of the vision transformer. | required |
| vision_patch_size | int | The patch size of the vision transformer. | required |
| context_length | int | The length of the context. | required |
| vocab_size | int | The size of the vocabulary. | required |
| transformer_width | int | The width of the transformer. | required |
| transformer_heads | int | The number of attention heads in the transformer. | required |
| transformer_layers | int | The number of layers in the transformer. | required |
| use_gradient_checkpointing | bool | Whether to use gradient checkpointing. | False |
| rngs | Rngs | The random number generator state. | Rngs(0) |
| dtype | DTypeLike | The data type for computations. | float32 |
| param_dtype | DTypeLike | The data type for parameters. | float32 |
| mesh | Mesh \| None | Optional device mesh for parameter sharding. | None |
| mesh_rules | MeshRules | Logical axis sharding rules. | DEFAULT_SHARDING |
Source code in src/jimm/models/siglip/siglip_model.py
def __init__(
    self,
    image_resolution: int,
    vision_layers: int,
    vision_width: int,
    vision_patch_size: int,
    context_length: int,
    vocab_size: int,
    transformer_width: int,
    transformer_heads: int,
    transformer_layers: int,
    use_gradient_checkpointing: bool = False,
    rngs: rnglib.Rngs = nnx.Rngs(0),
    dtype: DTypeLike = jnp.float32,
    param_dtype: DTypeLike = jnp.float32,
    mesh: Mesh | None = None,
    mesh_rules: MeshRules = DEFAULT_SHARDING,
):
    """Initialize the SigLIP model.

    Args:
        image_resolution (int): The resolution of the input images.
        vision_layers (int): The number of layers in the vision transformer.
        vision_width (int): The width of the vision transformer.
        vision_patch_size (int): The patch size of the vision transformer.
        context_length (int): The length of the context.
        vocab_size (int): The size of the vocabulary.
        transformer_width (int): The width of the transformer.
        transformer_heads (int): The number of attention heads in the transformer.
        transformer_layers (int): The number of layers in the transformer.
        use_gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
        rngs (rnglib.Rngs, optional): The random number generator state. Defaults to nnx.Rngs(0).
        dtype (DTypeLike, optional): The data type for computations. Defaults to jnp.float32.
        param_dtype (DTypeLike, optional): The data type for parameters. Defaults to jnp.float32.
        mesh (Mesh | None, optional): Optional device mesh for parameter sharding. Defaults to None.
        mesh_rules (MeshRules, optional): Logical axis sharding rules. Defaults to DEFAULT_SHARDING.
    """
    self.vision_layers = vision_layers
    self.vision_width = vision_width
    self.vision_patch_size = vision_patch_size
    self.context_length = context_length
    self.vocab_size = vocab_size
    self.transformer_width = transformer_width
    self.transformer_heads = transformer_heads
    self.transformer_layers = transformer_layers
    self.dtype = dtype
    self._original_config = None

    self.vision_heads = vision_width // 64
    self.vision_model = SigLIPVisionModel(
        image_resolution=image_resolution,
        vision_layers=vision_layers,
        vision_width=vision_width,
        vision_patch_size=vision_patch_size,
        use_gradient_checkpointing=use_gradient_checkpointing,
        rngs=rngs,
        dtype=dtype,
        param_dtype=param_dtype,
        mesh=mesh,
        mesh_rules=mesh_rules,
    )

    self.text_model = SigLIPTextModel(
        context_length=context_length,
        vocab_size=vocab_size,
        transformer_width=transformer_width,
        transformer_heads=transformer_heads,
        transformer_layers=transformer_layers,
        use_gradient_checkpointing=use_gradient_checkpointing,
        rngs=rngs,
        dtype=dtype,
        param_dtype=param_dtype,
        mesh=mesh,
        mesh_rules=mesh_rules,
    )

    self.logit_scale = nnx.Param(nnx.with_partitioning(nnx.initializers.ones_init(), ())(rngs.params(), ()))
    self.logit_bias = nnx.Param(nnx.with_partitioning(nnx.initializers.ones_init(), ())(rngs.params(), ()))

encode_image(image)

Encode images into embeddings.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| image | Float[Array, 'batch height width channels'] | Batch of input images. | required |

Returns:

| Type | Description |
|------|-------------|
| Float[Array, 'batch transformer_width'] | Image embeddings. |

Source code in src/jimm/models/siglip/siglip_model.py
def encode_image(self, image: Float[Array, "batch height width channels"]) -> Float[Array, "batch transformer_width"]:
    """Encode images into embeddings.

    Args:
        image (Float[Array, "batch height width channels"]): Batch of input images.

    Returns:
        Float[Array, "batch transformer_width"]: Image embeddings.
    """
    return self.vision_model(image)

encode_text(text)

Encode text tokens into embeddings.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| text | Int[Array, 'batch context_length'] | Batch of token sequences. | required |

Returns:

| Type | Description |
|------|-------------|
| Float[Array, 'batch transformer_width'] | Text embeddings. |

Source code in src/jimm/models/siglip/siglip_model.py
def encode_text(self, text: Int[Array, "batch context_length"]) -> Float[Array, "batch transformer_width"]:
    """Encode text tokens into embeddings.

    Args:
        text (Int[Array, "batch context_length"]): Batch of token sequences.

    Returns:
        Float[Array, "batch transformer_width"]: Text embeddings.
    """
    return self.text_model(text)

from_pretrained(model_name_or_path, use_pytorch=False, mesh=None, dtype=jnp.float32, param_dtype=jnp.float32, use_gradient_checkpointing=False, rngs=nnx.Rngs(0)) classmethod

Load a pretrained SigLIP model from a local path or HuggingFace Hub.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| model_name_or_path | str | Path to local weights or HuggingFace model ID. | required |
| use_pytorch | bool | Whether to load from PyTorch weights. | False |
| mesh | Mesh \| None | Optional device mesh for parameter sharding. | None |
| dtype | DTypeLike | Data type for computations. | float32 |
| param_dtype | DTypeLike | Data type for parameters. | float32 |
| use_gradient_checkpointing | bool | Whether to use gradient checkpointing. | False |
| rngs | Rngs | Random number generator keys. | Rngs(0) |

Returns:

| Name | Type | Description |
|------|------|-------------|
| SigLIP | SigLIP | Pretrained SigLIP model. |

Source code in src/jimm/models/siglip/siglip_model.py
@classmethod
def from_pretrained(
    cls,
    model_name_or_path: str,
    use_pytorch: bool = False,
    mesh: Mesh | None = None,
    dtype: DTypeLike = jnp.float32,
    param_dtype: DTypeLike = jnp.float32,
    use_gradient_checkpointing: bool = False,
    rngs: rnglib.Rngs = nnx.Rngs(0),
) -> "SigLIP":
    """Load a pretrained SigLIP model from a local path or HuggingFace Hub.

    Args:
        model_name_or_path (str): Path to local weights or HuggingFace model ID.
        use_pytorch (bool): Whether to load from PyTorch weights. Defaults to False.
        mesh (Mesh | None): Optional device mesh for parameter sharding. Defaults to None.
        dtype (DTypeLike): Data type for computations. Defaults to jnp.float32.
        param_dtype (DTypeLike): Data type for parameters. Defaults to jnp.float32.
        use_gradient_checkpointing (bool): Whether to use gradient checkpointing. Defaults to False.
        rngs (rnglib.Rngs): Random number generator keys. Defaults to nnx.Rngs(0).

    Returns:
        SigLIP: Pretrained SigLIP model
    """
    from .params import load_from_pretrained

    return load_from_pretrained(cls, model_name_or_path, use_pytorch, mesh, dtype, param_dtype, use_gradient_checkpointing, rngs)

save_pretrained(save_directory)

Save the model weights and config in HuggingFace format.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| save_directory | str | Directory path where the model will be saved. | required |
Source code in src/jimm/models/siglip/siglip_model.py
def save_pretrained(self, save_directory: str):
    """Save the model weights and config in HuggingFace format.

    Args:
        save_directory (str): Directory path where the model will be saved.
    """
    from .params import save_pretrained

    save_pretrained(self, save_directory)