Ragged¶

`Ragged` ¶

Bases: Array, Generic[RDTYPE_co]

An awkward array with exactly 1 ragged dimension. The ragged dimension is None in its shape tuple.

Warning

Ragged arrays only support a subset of Awkward array features.

Strings are not supported since ASCII is sufficient for the bioinformatics domain.
Bytestrings count as a ragged dimension, and we break from the Awkward convention to not include a "var" in the type string.
Record-layout Ragged arrays (produced by ak.zip of Ragged inputs or by passing a record-layout ak.Array) return field-keyed dicts from dtype, data, and parts. Use rag["field"] for zero-copy single-field access. view, apply, and to_numpy are not defined on record layouts; access individual fields. Union types remain unsupported.

Source code in python/seqpro/rag/_array.py

class Ragged(ak.Array, Generic[RDTYPE_co]):
    """An awkward array with exactly 1 ragged dimension. The ragged dimension is `None` in its shape tuple.

    !!! warning
        Ragged arrays only support a subset of Awkward array features.

        - Strings are not supported since ASCII is sufficient for the bioinformatics domain.
        - Bytestrings count as a ragged dimension, and we break from the Awkward convention to not include a "var" in the type string.
        - Record-layout Ragged arrays (produced by `ak.zip` of Ragged inputs or by passing a record-layout
          `ak.Array`) return field-keyed dicts from `dtype`, `data`, and `parts`. Use `rag["field"]`
          for zero-copy single-field access. `view`, `apply`, and `to_numpy` are not defined on record
          layouts; access individual fields. Union types remain unsupported.

    """

    # Cached decomposition of the layout: one RagParts for plain layouts, or a
    # field name -> RagParts mapping for record layouts.
    _parts: RagParts[RDTYPE_co] | dict[str, RagParts]

    def __init__(
        self,
        data: Content | ak.Array | Ragged[RDTYPE_co] | RagParts[RDTYPE_co],
    ):
        """Build a Ragged array from parts, a layout, or another array.

        Parameters
        ----------
        data
            Either a `RagParts` (converted with `_parts_to_content`) or anything
            accepted by `_as_ragged`.
        """
        if isinstance(data, RagParts):
            content = _parts_to_content(data)
        else:
            content = _as_ragged(data, highlevel=False)
        super().__init__(content, behavior=deepcopy(ak.behavior))
        if isinstance(content, RecordArray) or _is_record_layout(content):
            # ak._update_class() demotes RecordArray layouts to plain ak.Array
            # because there is no "__list__" parameter at the record level.
            # Restore the Ragged subclass and cache per-field RagParts.
            self.__class__ = Ragged  # type: ignore[assignment]
            # Set sentinel first: self[f] -> __getitem__ -> _ensure_parts checks hasattr
            object.__setattr__(self, "_parts", {})
            shared_offsets = _extract_list_offsets(cast(Content, ak.to_layout(self)))
            # Every field is rebuilt around the same offsets object so that all
            # fields of the record share one offsets array.
            self._parts = {
                f: RagParts(p.data, p.shape, shared_offsets)
                for f in ak.fields(self)
                for p in (unbox(self[f]),)
            }
        else:
            self._parts = unbox(self)

    def _ensure_parts(self) -> None:
        """Idempotent lazy init for `_parts`. Handles Ragged instances created
        via awkward behavior dispatch (e.g. `ak.zip`) that bypass `__init__`."""
        if hasattr(self, "_parts"):
            return
        layout = cast(Content, ak.to_layout(self))
        if isinstance(layout, RecordArray) or _is_record_layout(layout):
            # Set sentinel first to break the self[f] -> _ensure_parts cycle.
            object.__setattr__(self, "_parts", {})
            shared_offsets = _extract_list_offsets(layout)
            object.__setattr__(
                self,
                "_parts",
                {
                    f: RagParts(p.data, p.shape, shared_offsets)
                    for f in ak.fields(self)
                    for p in (unbox(self[f]),)
                },
            )
        else:
            object.__setattr__(self, "_parts", unbox(self))

    @staticmethod
    def from_offsets(
        data: NDArray[DTYPE_co],
        shape: tuple[int | None, ...],
        offsets: NDArray[OFFSET_TYPE],
    ) -> Ragged[DTYPE_co]:
        """Create a Ragged array from data, offsets, and shape.

        Parameters
        ----------
        data
            The data to create the Ragged array from.
        shape
            The shape of the Ragged array. Must contain exactly one `None`.
        offsets
            The offsets to create the Ragged array from. Either 1-D with one more
            entry than there are segments, or 2-D with one column per segment.

        Returns
        -------
        Ragged[DTYPE_co]

        Raises
        ------
        ValueError
            If `shape` does not contain exactly one `None`, if the number of segments
            implied by `offsets` disagrees with the leading dimensions of `shape`, or
            if (for 1-D offsets) `data.size` disagrees with the size implied by
            `offsets` and the trailing dimensions of `shape`.
        """
        # The error message promises "exactly one", so check the count rather than
        # merely the presence of a None.
        if shape.count(None) != 1:
            raise ValueError("Shape must have exactly one None dimension.")
        rag_dim = shape.index(None)

        if offsets.ndim == 1:
            n_rag = len(offsets) - 1
        else:
            n_rag = offsets.shape[1]
        if n_rag != np.prod(shape[:rag_dim], dtype=int):  # type: ignore
            raise ValueError(
                f"Number of ragged segments {n_rag} does not match product of ragged components of shape {shape[:rag_dim]}"
            )

        if offsets.ndim == 1:
            size = offsets[-1] * np.prod(shape[rag_dim + 1 :], dtype=int)  # type: ignore
            if data.size != size:
                raise ValueError(
                    f"Data size {data.size} does not match size implied by shape and contiguous offsets: {size}"
                )

        parts = RagParts[DTYPE_co](data, shape, offsets)
        return Ragged(parts)

    @staticmethod
    def from_lengths(
        data: NDArray[DTYPE_co], lengths: NDArray[np.integer]
    ) -> Ragged[DTYPE_co]:
        """Create a Ragged array from data and lengths.

        Parameters
        ----------
        data
            The data to create the Ragged array from.
        lengths
            The lengths of the segments.

        Returns
        -------
        Ragged[DTYPE_co]
        """
        parts = RagParts[DTYPE_co].from_lengths(data, lengths)
        return Ragged(parts)

    parts = _PartsDescriptor()
    """The parts of the Ragged array. For record layouts, a dict of
    field name -> RagParts; all share the same offsets ndarray."""

    data = _DataDescriptor()
    """The data of the Ragged array. For record layouts, a dict of
    field name -> zero-copy ndarray view, in awkward field order."""

    @property
    def offsets(self) -> NDArray[OFFSET_TYPE]:
        """The offsets of the Ragged array. May have shape (n_ragged + 1) or (2, n_ragged).

        For record layouts, the offsets of the first field are returned.

        Returns
        -------
        NDArray[np.int64]
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            return next(iter(self._parts.values())).offsets
        return self._parts.offsets

    @property
    def shape(self) -> tuple[int | None, ...]:
        """The shape of the Ragged array. The ragged dimension is `None`.

        For record layouts, the shape of the first field is returned.

        Returns
        -------
        tuple[int | None, ...]
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            return next(iter(self._parts.values())).shape
        return self._parts.shape

    @property
    def dtype(self) -> np.dtype[RDTYPE_co]:
        """The dtype of the Ragged array.

        For non-record layouts, returns the numpy dtype of the flat data buffer
        (e.g. `np.dtype('int32')`).

        For record layouts, returns a numpy *structured* dtype whose field names
        and per-field dtypes match the Ragged record fields, for example:

        ```python
        np.dtype([("seq", "S1"), ("score", "f4")])
        ```

        !!! note
            **Memory layout is SoA, not AoS.**  A numpy structured dtype normally
            implies Array-of-Structs packing, but here each field lives in its own
            contiguous buffer (Structure of Arrays).  The structured dtype is used
            purely as a convenient, numpy-compatible descriptor: it carries all
            field/dtype information in a single object without inventing a new type.

        Returns
        -------
        np.dtype[RDTYPE_co]
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            return np.dtype([(f, p.data.dtype) for f, p in self._parts.items()])  # type: ignore[return-value]
        return self._parts.data.dtype

    @property
    def rag_dim(self) -> int:
        """The index of the ragged dimension.

        Returns
        -------
        int
        """
        return self.shape.index(None)

    @property
    def lengths(self) -> NDArray[np.integer]:
        """The lengths of the segments, shaped like the dimensions before the ragged one.

        Returns
        -------
        NDArray[np.integer]
        """
        if self.offsets.ndim == 1:
            lengths = np.diff(self.offsets)
        else:
            # 2-D offsets have shape (2, n_ragged); differencing the two rows
            # gives one length per segment.
            lengths = np.diff(self.offsets, axis=0)

        return lengths.reshape(self.shape[: self.rag_dim])  # type: ignore

    def view(self, dtype: type[DTYPE_co] | str) -> Ragged[DTYPE_co]:
        """Return a view of the data with the given dtype.

        Parameters
        ----------
        dtype
            Target dtype.

        Returns
        -------
        Ragged[DTYPE_co]
            Zero-copy view with reinterpreted dtype.

        Raises
        ------
        NotImplementedError
            If called on a record-layout Ragged array.
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            raise NotImplementedError(
                "view is not defined on record-layout Ragged arrays; "
                "update fields individually, e.g. rag['f'] = rag['f'].view(dtype)."
            )
        # get a new layout, same data
        view = ak.without_parameters(self)

        # change view of the data
        parts = unbox(view)
        parts.data = parts.data.view(dtype)

        # init a new array with same base data
        view = Ragged(parts)
        return view

    @classmethod
    def empty(
        cls, shape: int | tuple[int | None, ...], dtype: type[DTYPE_co]
    ) -> Ragged[DTYPE_co]:
        """Create an empty Ragged array with the given shape and dtype.

        Parameters
        ----------
        shape
            Shape of the array. Must include exactly one `None` for the ragged dimension.
        dtype
            Element dtype.

        Returns
        -------
        Ragged[DTYPE_co]

        Raises
        ------
        ValueError
            If `shape` does not contain exactly one `None`.
        """
        data = np.empty(0, dtype=dtype)
        if isinstance(shape, int):
            # NOTE(review): a bare int can never contain the required None, so this
            # branch always ends in the ValueError below — confirm the intended
            # meaning of an int shape.
            shape = (shape,)
        if shape.count(None) != 1:
            raise ValueError("Shape must have exactly one None dimension.")
        rag_dim = shape.index(None)
        # dtype=int keeps the product integral even when there are no leading
        # dimensions (np.prod of an empty sequence is otherwise the float 1.0,
        # which np.zeros rejects as a length).
        n_rag = int(np.prod(shape[:rag_dim], dtype=int))  # type: ignore
        offsets = np.zeros(n_rag + 1, dtype=OFFSET_TYPE)
        parts = RagParts(data, shape, offsets)
        content = _parts_to_content(parts)
        return cast(Ragged[DTYPE_co], cls(content))

    @property
    def is_empty(self) -> bool:
        """Whether the Ragged array is empty.

        Returns
        -------
        bool
        """
        if self.offsets.ndim == 1:
            # bool() so callers get a builtin bool rather than a numpy scalar.
            return bool(self.offsets[-1] == 0)
        else:
            # 2-D offsets: empty when every segment's two offset rows coincide.
            return np.all(self.offsets[0] == self.offsets[1]).item()

    @property
    def is_contiguous(self) -> bool:
        """Whether the Ragged array is contiguous.

        True when the offsets are 1-D and the data buffer (every field's buffer
        for record layouts) is flagged contiguous by numpy.

        Returns
        -------
        bool
        """
        contiguous_offsets = self.offsets.ndim == 1
        self._ensure_parts()
        if isinstance(self._parts, dict):
            contiguous_data = all(p.data.flags.contiguous for p in self._parts.values())
        else:
            contiguous_data = self._parts.data.flags.contiguous
        return contiguous_offsets and contiguous_data

    @property
    def is_base(self) -> bool:
        """Whether the Ragged array is a base array (owns its data, contiguous, no offset).

        Returns
        -------
        bool
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            parts_list = list(self._parts.values())
            base_data = all(p.data.base is None for p in parts_list)
            data_size = parts_list[0].data.size
        else:
            base_data = self._parts.data.base is None
            data_size = self._parts.data.size
        # NOTE(review): `data.size` also counts elements of any trailing dimensions,
        # whereas offsets index along the first data axis — confirm whether
        # `len(data)` was intended here.
        # `is_contiguous` is checked before indexing the offsets, so the scalar
        # comparisons below only run for 1-D offsets.
        return bool(
            base_data
            and self.is_contiguous
            and self.offsets[0] == 0
            and self.offsets[-1] == data_size
        )

    def to_numpy(self, allow_missing: bool = False) -> NDArray[RDTYPE_co]:
        """Convert to a dense NumPy array. Not zero-copy if offsets or data are non-contiguous.

        Parameters
        ----------
        allow_missing
            Passed through to `ak.Array.to_numpy`.

        Returns
        -------
        NDArray[RDTYPE_co]

        Raises
        ------
        NotImplementedError
            If called on a record-layout Ragged array.
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            raise NotImplementedError(
                "to_numpy is not defined on record-layout Ragged arrays; "
                "convert fields individually."
            )
        arr = super().to_numpy(allow_missing=allow_missing)
        if self.dtype.type == np.bytes_:  # type: ignore[attr-defined] guaranteed by record check
            # Add a trailing axis and reinterpret it as single-byte "S1" items.
            arr = arr[..., None].view("S1")
        return arr

    def __getitem__(self, where):
        """Index the array, keeping the Ragged type when exactly one ragged dimension remains.

        Results with a different number of ragged dimensions are returned via
        `_as_ak`; non-array results are returned unchanged.
        """
        arr = super().__getitem__(where)
        if isinstance(arr, ak.Array):
            if _n_var(arr) == 1:
                result = type(self)(arr)
                # For record field access, share the parent's offsets object (zero-copy).
                self._ensure_parts()
                if (
                    isinstance(where, str)
                    and isinstance(self._parts, dict)
                    and where in self._parts
                ):
                    result._ensure_parts()
                    assert isinstance(result._parts, RagParts)
                    result._parts = RagParts(
                        result._parts.data,
                        result._parts.shape,
                        self._parts[where].offsets,
                    )
                return result
            else:
                return _as_ak(arr)
        else:
            return arr

    def squeeze(
        self, axis: int | tuple[int, ...] | None = None
    ) -> Self | NDArray[RDTYPE_co] | dict[str, NDArray[RDTYPE_co]]:
        """Squeeze the ragged array along the given non-ragged axis.
        If squeezing explicit axes would result in a 1D array, return the data as a numpy array.
        For record layouts, dispatches per-field; if fields collapse to 1D ndarrays,
        returns a dict of ndarrays, otherwise returns a record Ragged.

        Parameters
        ----------
        axis
            Axis or axes to squeeze. Must have size 1. Negative values count from the
            end of `shape`. If `None`, squeeze all size-1 axes.

        Returns
        -------
        Self | NDArray[RDTYPE_co] | dict[str, NDArray[RDTYPE_co]]

        Raises
        ------
        ValueError
            If a requested axis does not have size 1.
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            squeezed = {f: self[f].squeeze(axis) for f in self._parts}
            first = next(iter(squeezed.values()))
            if isinstance(first, np.ndarray):
                return squeezed  # type: ignore[reportUnknownReturnType]
            return type(self)(ak.zip(squeezed, depth_limit=1))  # type: ignore[reportUnknownReturnType]
        if axis is None:
            data = self._parts.data
            # Only squeeze trailing data axes: axis 0 of the data runs over the
            # flattened elements and must survive even when it has length 1.
            unit_axes = tuple(i for i in range(1, data.ndim) if data.shape[i] == 1)
            data = data.squeeze(axis=unit_axes)
            shape = tuple(s for s in self.shape if s != 1)
            parts = RagParts[RDTYPE_co](data, shape, self.offsets)
            return type(self)(parts)

        if isinstance(axis, int):
            axis = (axis,)
        # Normalise against the length of the shape tuple so negative axes index
        # `self.shape` correctly.
        n_dim = len(self.shape)
        axis = tuple(a if a >= 0 else n_dim + a for a in axis)
        for a in axis:
            if (size := self.shape[a]) != 1:
                raise ValueError(f"Cannot squeeze axis {a} of size {size}.")

        shape = tuple(s for i, s in enumerate(self.shape) if i not in axis)
        # Dimensions after the ragged one are the trailing dimensions of the data.
        data_shape = tuple(
            s for i, s in enumerate(self.shape) if i not in axis and i > self.rag_dim
        )
        data = self._parts.data.reshape(len(self._parts.data), *data_shape)

        if shape == (None,):
            return data

        parts = RagParts[RDTYPE_co](data, shape, self.offsets)
        return type(self)(parts)

    def reshape(self, *shape: int | None | tuple[int | None, ...]) -> Self:
        """Reshape non-ragged axes.

        Parameters
        ----------
        *shape
            New shape including exactly one `None` for the ragged dimension. May be
            given as separate arguments or as a single tuple. A negative entry before
            the ragged dimension is inferred from the number of segments.

        Returns
        -------
        Self

        Raises
        ------
        ValueError
            If a tuple is mixed with other positional arguments, or if the shape
            contains no `None`.
        """
        self._ensure_parts()
        if isinstance(self._parts, dict):
            reshaped = {f: self[f].reshape(*shape) for f in self._parts}
            return type(self)(ak.zip(reshaped, depth_limit=1))
        # this is correct because all reshaping operations preserve the layout i.e. raveled ordered
        if isinstance(shape[0], tuple):
            if len(shape) > 1:
                raise ValueError("Cannot mix tuple and non-tuple shapes.")
            shape = cast(tuple[tuple[int | None, ...]], shape)
            shape = shape[0]

        if TYPE_CHECKING:
            shape = cast(tuple[int | None, ...], shape)

        rag_dim = shape.index(None)
        rag_shape = cast(tuple[int, ...], self.shape[: self.rag_dim])
        n_rag = np.prod(rag_shape)
        new_rag_shape = cast(tuple[int, ...], shape[:rag_dim])
        # abs() cancels the sign contributed by a negative placeholder entry.
        n_new_rag = abs(np.prod(new_rag_shape))
        new_rag_shape = tuple(
            s if s >= 0 else int(n_rag // n_new_rag) for s in new_rag_shape
        )
        data = self._parts.data.reshape(len(self._parts.data), *shape[rag_dim + 1 :])
        # Take the trailing dims from the reshaped data so inferred sizes are concrete.
        new_shape = (*new_rag_shape, None, *data.shape[1:])
        parts = RagParts[RDTYPE_co](data, new_shape, self.offsets)
        return type(self)(parts)

    def to_ak(self) -> ak.Array:
        """Convert to a plain Awkward array, stripping the Ragged behavior.

        Returns
        -------
        ak.Array
        """
        arr = _as_ak(self)
        arr.behavior = None
        return arr

`data = _DataDescriptor()` `class-attribute` `instance-attribute` ¶

The data of the Ragged array. For record layouts, a dict of field name -> zero-copy ndarray view, in awkward field order.

`dtype` `property` ¶

The dtype of the Ragged array.

For non-record layouts, returns the numpy dtype of the flat data buffer (e.g. np.dtype('int32')).

For record layouts, returns a numpy structured dtype whose field names and per-field dtypes match the Ragged record fields, for example:

`np.dtype([("seq", "S1"), ("score", "f4")])`

Note: Memory layout is SoA, not AoS. A numpy structured dtype normally implies Array-of-Structs packing, but here each field lives in its own contiguous buffer (Structure of Arrays). The structured dtype is used purely as a convenient, numpy-compatible descriptor: it carries all field/dtype information in a single object without inventing a new type.

Returns:

Type	Description
`dtype[RDTYPE_co]`

`is_base` `property` ¶

Whether the Ragged array is a base array (owns its data, contiguous, no offset).

Returns:

Type	Description
`bool`

`is_contiguous` `property` ¶

Whether the Ragged array is contiguous.

Returns:

Type	Description
`bool`

`is_empty` `property` ¶

Whether the Ragged array is empty.

Returns:

Type	Description
`bool`

`lengths` `property` ¶

The lengths of the segments.

Returns:

Type	Description
`NDArray[integer]`

`offsets` `property` ¶

The offsets of the Ragged array. May have shape (n_ragged + 1) or (2, n_ragged).

Returns:

Type	Description
`NDArray[int64]`

`parts = _PartsDescriptor()` `class-attribute` `instance-attribute` ¶

The parts of the Ragged array. For record layouts, a dict of field name -> RagParts; all share the same offsets ndarray.

`rag_dim` `property` ¶

The index of the ragged dimension.

Returns:

Type	Description
`int`

`shape` `property` ¶

The shape of the Ragged array. The ragged dimension is None.

Returns:

Type	Description
`tuple[int \| None, ...]`

`empty(shape, dtype)` `classmethod` ¶

Create an empty Ragged array with the given shape and dtype.

Parameters:

Name	Type	Description	Default
`shape`	`int \| tuple[int \| None, ...]`	Shape of the array. Must include exactly one `None` for the ragged dimension.	required
`dtype`	`type[DTYPE_co]`	Element dtype.	required

Returns:

Type	Description
`Ragged[DTYPE_co]`

Source code in python/seqpro/rag/_array.py

@classmethod
def empty(
    cls, shape: int | tuple[int | None, ...], dtype: type[DTYPE_co]
) -> Ragged[DTYPE_co]:
    """Create an empty Ragged array with the given shape and dtype.

    Parameters
    ----------
    shape
        Shape of the array. Must include exactly one `None` for the ragged dimension.
    dtype
        Element dtype.

    Returns
    -------
    Ragged[DTYPE_co]

    Raises
    ------
    ValueError
        If `shape` does not contain exactly one `None`.
    """
    data = np.empty(0, dtype=dtype)
    if isinstance(shape, int):
        # NOTE(review): a bare int can never contain the required None, so this
        # branch always ends in the ValueError below — confirm the intended
        # meaning of an int shape.
        shape = (shape,)
    if shape.count(None) != 1:
        raise ValueError("Shape must have exactly one None dimension.")
    rag_dim = shape.index(None)
    # dtype=int keeps the product integral even when there are no leading
    # dimensions (np.prod of an empty sequence is otherwise the float 1.0,
    # which np.zeros rejects as a length).
    n_rag = int(np.prod(shape[:rag_dim], dtype=int))  # type: ignore
    offsets = np.zeros(n_rag + 1, dtype=OFFSET_TYPE)
    parts = RagParts(data, shape, offsets)
    content = _parts_to_content(parts)
    return cast(Ragged[DTYPE_co], cls(content))

`from_lengths(data, lengths)` `staticmethod` ¶

Create a Ragged array from data and lengths.

Parameters:

Name	Type	Description	Default
`data`	`NDArray[DTYPE_co]`	The data to create the Ragged array from.	required
`lengths`	`NDArray[integer]`	The lengths of the segments.	required

Returns:

Type	Description
`Ragged[DTYPE_co]`

Source code in python/seqpro/rag/_array.py

@staticmethod
def from_lengths(
    data: NDArray[DTYPE_co], lengths: NDArray[np.integer]
) -> Ragged[DTYPE_co]:
    """Build a Ragged array from a data buffer and per-segment lengths.

    Parameters
    ----------
    data
        The data backing the Ragged array.
    lengths
        The length of each segment.

    Returns
    -------
    Ragged[DTYPE_co]
    """
    # RagParts does the lengths -> parts conversion; Ragged only wraps the result.
    return Ragged(RagParts[DTYPE_co].from_lengths(data, lengths))

`from_offsets(data, shape, offsets)` `staticmethod` ¶

Create a Ragged array from data, offsets, and shape.

Parameters:

Name	Type	Description	Default
`data`	`NDArray[DTYPE_co]`	The data to create the Ragged array from.	required
`shape`	`tuple[int \| None, ...]`	The shape of the Ragged array.	required
`offsets`	`NDArray[OFFSET_TYPE]`	The offsets to create the Ragged array from.	required

Returns:

Type	Description
`Ragged[DTYPE_co]`

Source code in python/seqpro/rag/_array.py

@staticmethod
def from_offsets(
    data: NDArray[DTYPE_co],
    shape: tuple[int | None, ...],
    offsets: NDArray[OFFSET_TYPE],
) -> Ragged[DTYPE_co]:
    """Create a Ragged array from data, offsets, and shape.

    Parameters
    ----------
    data
        The data to create the Ragged array from.
    shape
        The shape of the Ragged array. Must contain exactly one `None`.
    offsets
        The offsets to create the Ragged array from. Either 1-D with one more
        entry than there are segments, or 2-D with one column per segment.

    Returns
    -------
    Ragged[DTYPE_co]

    Raises
    ------
    ValueError
        If `shape` does not contain exactly one `None`, if the number of segments
        implied by `offsets` disagrees with the leading dimensions of `shape`, or
        if (for 1-D offsets) `data.size` disagrees with the size implied by
        `offsets` and the trailing dimensions of `shape`.
    """
    # The error message promises "exactly one", so check the count rather than
    # merely the presence of a None.
    if shape.count(None) != 1:
        raise ValueError("Shape must have exactly one None dimension.")
    rag_dim = shape.index(None)

    if offsets.ndim == 1:
        n_rag = len(offsets) - 1
    else:
        n_rag = offsets.shape[1]
    if n_rag != np.prod(shape[:rag_dim], dtype=int):  # type: ignore
        raise ValueError(
            f"Number of ragged segments {n_rag} does not match product of ragged components of shape {shape[:rag_dim]}"
        )

    if offsets.ndim == 1:
        size = offsets[-1] * np.prod(shape[rag_dim + 1 :], dtype=int)  # type: ignore
        if data.size != size:
            raise ValueError(
                f"Data size {data.size} does not match size implied by shape and contiguous offsets: {size}"
            )

    parts = RagParts[DTYPE_co](data, shape, offsets)
    return Ragged(parts)

`reshape(*shape)` ¶

Reshape non-ragged axes.

Parameters:

Name	Type	Description	Default
`*shape`	`int \| None \| tuple[int \| None, ...]`	New shape including exactly one `None` for the ragged dimension.	`()`

Returns:

Type	Description
`Self`

Source code in python/seqpro/rag/_array.py

def reshape(self, *shape: int | None | tuple[int | None, ...]) -> Self:
    """Reshape the non-ragged axes of the array.

    Parameters
    ----------
    *shape
        New shape including exactly one `None` for the ragged dimension. May be
        given as separate arguments or as a single tuple. A negative entry before
        the ragged dimension is inferred from the number of segments.

    Returns
    -------
    Self
    """
    self._ensure_parts()
    if isinstance(self._parts, dict):
        # Record layout: reshape every field on its own, then zip them back together.
        per_field = {name: self[name].reshape(*shape) for name in self._parts}
        return type(self)(ak.zip(per_field, depth_limit=1))

    # Reshaping never reorders elements, so the flat data and offsets carry over as-is.
    if isinstance(shape[0], tuple):
        if len(shape) > 1:
            raise ValueError("Cannot mix tuple and non-tuple shapes.")
        shape = cast(tuple[tuple[int | None, ...]], shape)
        shape = shape[0]

    if TYPE_CHECKING:
        shape = cast(tuple[int | None, ...], shape)

    new_rag_dim = shape.index(None)
    requested_leading = cast(tuple[int, ...], shape[:new_rag_dim])
    requested_trailing = shape[new_rag_dim + 1 :]

    current_leading = cast(tuple[int, ...], self.shape[: self.rag_dim])
    n_segments = np.prod(current_leading)
    # abs() cancels the sign contributed by a negative placeholder entry.
    n_requested = abs(np.prod(requested_leading))
    resolved_leading = tuple(
        int(n_segments // n_requested) if dim < 0 else dim
        for dim in requested_leading
    )

    flat = self._parts.data
    reshaped_data = flat.reshape(len(flat), *requested_trailing)
    # Read the trailing dims back from the data so inferred sizes are concrete.
    resolved_shape = (*resolved_leading, None, *reshaped_data.shape[1:])
    return type(self)(RagParts[RDTYPE_co](reshaped_data, resolved_shape, self.offsets))

`squeeze(axis=None)` ¶

Squeeze the ragged array along the given non-ragged axis. If squeezing would result in a 1D array, return the data as a numpy array. For record layouts, dispatches per-field; if fields collapse to 1D ndarrays, returns a dict of ndarrays, otherwise returns a record Ragged.

Parameters:

Name	Type	Description	Default
`axis`	`int \| tuple[int, ...] \| None`	Axis or axes to squeeze. Must have size 1. If `None`, squeeze all size-1 axes.	`None`

Returns:

Type	Description
`Self \| NDArray[RDTYPE_co] \| dict[str, NDArray[RDTYPE_co]]`

Source code in python/seqpro/rag/_array.py

def squeeze(
    self, axis: int | tuple[int, ...] | None = None
) -> Self | NDArray[RDTYPE_co] | dict[str, NDArray[RDTYPE_co]]:
    """Squeeze the ragged array along the given non-ragged axis.
    If squeezing explicit axes would result in a 1D array, return the data as a numpy array.
    For record layouts, dispatches per-field; if fields collapse to 1D ndarrays,
    returns a dict of ndarrays, otherwise returns a record Ragged.

    Parameters
    ----------
    axis
        Axis or axes to squeeze. Must have size 1. Negative values count from the
        end of `shape`. If `None`, squeeze all size-1 axes.

    Returns
    -------
    Self | NDArray[RDTYPE_co] | dict[str, NDArray[RDTYPE_co]]

    Raises
    ------
    ValueError
        If a requested axis does not have size 1.
    """
    self._ensure_parts()
    if isinstance(self._parts, dict):
        squeezed = {f: self[f].squeeze(axis) for f in self._parts}
        first = next(iter(squeezed.values()))
        if isinstance(first, np.ndarray):
            return squeezed  # type: ignore[reportUnknownReturnType]
        return type(self)(ak.zip(squeezed, depth_limit=1))  # type: ignore[reportUnknownReturnType]
    if axis is None:
        data = self._parts.data
        # Only squeeze trailing data axes: axis 0 of the data runs over the
        # flattened elements and must survive even when it has length 1.
        unit_axes = tuple(i for i in range(1, data.ndim) if data.shape[i] == 1)
        data = data.squeeze(axis=unit_axes)
        shape = tuple(s for s in self.shape if s != 1)
        parts = RagParts[RDTYPE_co](data, shape, self.offsets)
        return type(self)(parts)

    if isinstance(axis, int):
        axis = (axis,)
    # Normalise against the length of the shape tuple so negative axes index
    # `self.shape` correctly.
    n_dim = len(self.shape)
    axis = tuple(a if a >= 0 else n_dim + a for a in axis)
    for a in axis:
        if (size := self.shape[a]) != 1:
            raise ValueError(f"Cannot squeeze axis {a} of size {size}.")

    shape = tuple(s for i, s in enumerate(self.shape) if i not in axis)
    # Dimensions after the ragged one are the trailing dimensions of the data.
    data_shape = tuple(
        s for i, s in enumerate(self.shape) if i not in axis and i > self.rag_dim
    )
    data = self._parts.data.reshape(len(self._parts.data), *data_shape)

    if shape == (None,):
        return data

    parts = RagParts[RDTYPE_co](data, shape, self.offsets)
    return type(self)(parts)

`to_ak()` ¶

Convert to a plain Awkward array, stripping the Ragged behavior.

Returns:

Type	Description
`Array`

Source code in python/seqpro/rag/_array.py

def to_ak(self) -> ak.Array:
    """Return this array as a plain Awkward array with the Ragged behavior removed.

    Returns
    -------
    ak.Array
    """
    plain = _as_ak(self)
    # Clearing the behavior stops awkward from dispatching back to Ragged.
    plain.behavior = None
    return plain

`to_numpy(allow_missing=False)` ¶

Convert to a dense NumPy array. Not zero-copy if offsets or data are non-contiguous.

Parameters:

Name	Type	Description	Default
`allow_missing`	`bool`	Passed through to `ak.Array.to_numpy`.	`False`

Returns:

Type	Description
`NDArray[RDTYPE_co]`

Source code in python/seqpro/rag/_array.py

def to_numpy(self, allow_missing: bool = False) -> NDArray[RDTYPE_co]:
    """Materialise the array as a dense NumPy array.

    The conversion is not zero-copy if offsets or data are non-contiguous.

    Parameters
    ----------
    allow_missing
        Forwarded unchanged to `ak.Array.to_numpy`.

    Returns
    -------
    NDArray[RDTYPE_co]
    """
    self._ensure_parts()
    if isinstance(self._parts, dict):
        raise NotImplementedError(
            "to_numpy is not defined on record-layout Ragged arrays; "
            "convert fields individually."
        )
    dense = super().to_numpy(allow_missing=allow_missing)
    if self.dtype.type != np.bytes_:  # type: ignore[attr-defined] guaranteed by record check
        return dense
    # Bytes data: add a trailing axis and reinterpret it as single-byte "S1" items.
    return dense[..., None].view("S1")

`view(dtype)` ¶

Return a view of the data with the given dtype.

Parameters:

Name	Type	Description	Default
`dtype`	`type[DTYPE_co] \| str`	Target dtype.	required

Returns:

Type	Description
`Ragged[DTYPE_co]`	Zero-copy view with reinterpreted dtype.

Source code in python/seqpro/rag/_array.py

def view(self, dtype: type[DTYPE_co] | str) -> Ragged[DTYPE_co]:
    """Reinterpret the underlying data buffer with another dtype.

    Parameters
    ----------
    dtype
        Target dtype.

    Returns
    -------
    Ragged[DTYPE_co]
        Zero-copy view with reinterpreted dtype.
    """
    self._ensure_parts()
    if isinstance(self._parts, dict):
        raise NotImplementedError(
            "view is not defined on record-layout Ragged arrays; "
            "update fields individually, e.g. rag['f'] = rag['f'].view(dtype)."
        )
    # Strip the layout parameters, then unbox: this yields a fresh parts object
    # that still points at the same data.
    reinterpreted = unbox(ak.without_parameters(self))
    reinterpreted.data = reinterpreted.data.view(dtype)

    # Wrap the reinterpreted parts in a new Ragged over the same base data.
    return Ragged(reinterpreted)

`is_rag_dtype(rag, dtype)` ¶

Check if an object is a Ragged array with the given dtype. A record-layout Ragged array only matches an equal structured dtype; it never matches a primitive dtype.

Parameters:

Name	Type	Description	Default
`rag`	`Any`	Object to check.	required
`dtype`	`DTYPE_co \| type[DTYPE_co]`	Expected dtype.	required

Returns:

Type	Description
`TypeIs[Ragged[DTYPE_co]]`	True if `rag` is a `Ragged` array whose dtype is a subtype of `dtype`.

Source code in python/seqpro/rag/_array.py

def is_rag_dtype(
    rag: Any, dtype: DTYPE_co | type[DTYPE_co]
) -> TypeIs[Ragged[DTYPE_co]]:
    """Check if an object is a `Ragged` array with the given dtype.

    A record-layout `Ragged` (whose dtype is structured) only matches an equal
    structured `dtype`; it never matches a primitive dtype.

    Parameters
    ----------
    rag
        Object to check.
    dtype
        Expected dtype.

    Returns
    -------
    TypeIs[Ragged[DTYPE_co]]
        True if `rag` is a `Ragged` array whose dtype is a subtype of `dtype`
        (or, for record layouts, equal to the structured `dtype`).
    """
    if not isinstance(rag, Ragged):
        return False

    actual = rag.dtype
    if not np.issubdtype(actual, np.void):
        # Plain layout: the usual numpy subtype test applies.
        return np.issubdtype(actual, dtype)

    # Structured dtype means a record layout; it needs an exact structured match.
    if np.issubdtype(dtype, np.void):
        return actual == np.dtype(dtype)
    return False

`lengths_to_offsets(lengths, dtype=OFFSET_TYPE)` ¶

Convert lengths to offsets.

Parameters:

Name	Type	Description	Default
`lengths`	`NDArray[integer]`	Lengths of the segments.	required
`dtype`	`type[DTYPE] \| DTYPE`	Dtype of the returned offsets array.	`OFFSET_TYPE`

Returns:

Type	Description
`NDArray[DTYPE]`	Offsets of the segments; length is len(lengths) + 1, starting with 0.

Source code in python/seqpro/rag/_utils.py

def lengths_to_offsets(
    lengths: NDArray[np.integer], dtype: type[DTYPE] | DTYPE = OFFSET_TYPE
) -> NDArray[DTYPE]:
    """Turn segment lengths into cumulative offsets.

    Parameters
    ----------
    lengths
        Lengths of the segments.
    dtype
        Dtype of the returned offsets array.

    Returns
    -------
    NDArray[DTYPE]
        Offsets of the segments; length is lengths.size + 1, starting with 0.
    """
    n_segments = lengths.size
    offsets = np.empty(n_segments + 1, dtype=dtype)
    # The leading zero marks the start of the first segment; the running total
    # of the lengths fills every later slot.
    offsets[0] = 0
    offsets[1:] = np.cumsum(lengths)
    return offsets

Ragged¶

Ragged ¶

data = _DataDescriptor() class-attribute instance-attribute ¶

dtype property ¶

is_base property ¶

is_contiguous property ¶

is_empty property ¶

lengths property ¶

offsets property ¶

parts = _PartsDescriptor() class-attribute instance-attribute ¶

rag_dim property ¶

shape property ¶

empty(shape, dtype) classmethod ¶

from_lengths(data, lengths) staticmethod ¶

from_offsets(data, shape, offsets) staticmethod ¶

reshape(*shape) ¶

squeeze(axis=None) ¶

to_ak() ¶

to_numpy(allow_missing=False) ¶

view(dtype) ¶

is_rag_dtype(rag, dtype) ¶

lengths_to_offsets(lengths, dtype=OFFSET_TYPE) ¶

`Ragged` ¶

`data = _DataDescriptor()` `class-attribute` `instance-attribute` ¶

`dtype` `property` ¶

`is_base` `property` ¶

`is_contiguous` `property` ¶

`is_empty` `property` ¶

`lengths` `property` ¶

`offsets` `property` ¶

`parts = _PartsDescriptor()` `class-attribute` `instance-attribute` ¶

`rag_dim` `property` ¶

`shape` `property` ¶

`empty(shape, dtype)` `classmethod` ¶

`from_lengths(data, lengths)` `staticmethod` ¶

`from_offsets(data, shape, offsets)` `staticmethod` ¶

`reshape(*shape)` ¶

`squeeze(axis=None)` ¶

`to_ak()` ¶

`to_numpy(allow_missing=False)` ¶

`view(dtype)` ¶

`is_rag_dtype(rag, dtype)` ¶

`lengths_to_offsets(lengths, dtype=OFFSET_TYPE)` ¶