Skip to content

Alphabets

AA = AminoAlphabet(*(map(list, zip(*(canonical_codons_to_aas.items()))))) module-attribute

DNA = NucleotideAlphabet('ACGT', 'TGCA') module-attribute

RNA = NucleotideAlphabet(alphabet='ACGU', complement='UGCA') module-attribute

AminoAlphabet

Source code in python/seqpro/alphabets/_alphabets.py
class AminoAlphabet:
    """Amino acid alphabet paired with the codon table that maps codons to it."""

    # Codons and amino acid symbols exactly as passed to the constructor.
    codons: list[str]
    amino_acids: list[str]
    # Codons split into single bytes, shape (n_codons, codon_size), dtype S1.
    codon_array: NDArray[np.bytes_]
    # Amino acid symbols, shape (n_codons,), dtype S1, aligned with codon_array.
    aa_array: NDArray[np.bytes_]
    codon_to_aa: dict[str, str]

    def __init__(self, codons: list[str], amino_acids: list[str]) -> None:
        """Construct an alphabet of amino acids and their mappings to codons.

        Parameters
        ----------
        codons
            List of codons. All codons must have the same length.
        amino_acids
            List of single-character amino acid symbols, in the same order as
            ``codons``.

        Raises
        ------
        ValueError
            If ``codons`` is empty.
        ValueError
            If the codons do not all have the same length.
        ValueError
            If any amino acid symbol is not exactly one character.
        ValueError
            If the number of codons differs from the number of amino acids.
        """
        # Guard before indexing codons[0]; otherwise an empty table surfaces as
        # an unhelpful IndexError.
        if len(codons) == 0:
            raise ValueError("Got an empty list of codons.")
        k = len(codons[0])
        if any(len(c) != k for c in codons):
            raise ValueError("Got codons with varying lengths.")
        if any(len(a) != 1 for a in amino_acids):
            raise ValueError("Got amino acid symbols that are multiple characters.")
        if len(codons) != len(amino_acids):
            raise ValueError(
                "Got different number of codons and amino acids for mapping."
            )

        self.codons = codons
        self.amino_acids = amino_acids

        # (n_codons,) array of k-byte strings -> add a trailing axis and view
        # as S1 so every codon becomes a row of k single bytes.
        self.codon_array = np.array(codons, "S")[..., None].view("S1")
        self.aa_array = np.array(amino_acids, "S1")

        self.codon_to_aa = dict(zip(codons, amino_acids))

    @overload
    def translate(
        self,
        seqs: StrSeqType,
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet | None = None,
        truncate_stop: bool = False,
    ) -> NDArray[np.bytes_]: ...
    @overload
    def translate(
        self,
        seqs: Ragged[np.bytes_],
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet | None = None,
        truncate_stop: bool = False,
    ) -> Ragged[np.bytes_]: ...
    @overload
    def translate(
        self,
        seqs: Ragged[np.uint8],
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet,
        truncate_stop: bool = False,
    ) -> Ragged[np.uint8]: ...
    def translate(
        self,
        seqs: StrSeqType | Ragged[np.bytes_] | Ragged[np.uint8],
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet | None = None,
        truncate_stop: bool = False,
    ) -> NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]:
        """Translate nucleotide sequences to amino acids.

        Parameters
        ----------
        seqs
            Nucleotide sequences. Ragged inputs must have all lengths divisible by
            the codon size. For OHE Ragged (uint8), nuc_alphabet is required.
        length_axis
            Only used for non-Ragged array input. Defaults to the last axis when
            None. The length along this axis must be divisible by the codon size.
        nuc_alphabet
            Required when seqs is a Ragged OHE (uint8) array, to decode OHE → bytes.
            Not used for non-Ragged input.
        truncate_stop
            When True, each output sequence is truncated at the first stop codon
            (inclusive). Only valid for Ragged input; the non-Ragged path does not
            read this flag. Default False.

        Returns
        -------
        NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]
            Translated amino acid sequences in the same container type as the input.

        Raises
        ------
        ValueError
            If a sequence length is not divisible by the codon size, or if
            ``seqs`` is a Ragged OHE array and ``nuc_alphabet`` is None.
        """

        if not isinstance(seqs, Ragged):
            check_axes(seqs, length_axis, False)
            seqs = cast_seqs(seqs)
            codon_size = self.codon_array.shape[-1]
            if length_axis is None:
                length_axis = -1
            if seqs.shape[length_axis] % codon_size != 0:
                raise ValueError(
                    "Sequence length is not evenly divisible by codon length."
                )
            if length_axis < 0:
                length_axis = seqs.ndim + length_axis
            if seqs.shape[length_axis] == 0:
                # sliding_window_view rejects windows larger than the axis;
                # zero nucleotides translate to zero amino acids.
                return np.empty(seqs.shape, dtype="S1")
            # Window along the *length* axis (not unconditionally the last
            # one). The window contents land on a new trailing axis.
            windows = np.lib.stride_tricks.sliding_window_view(
                seqs, window_shape=codon_size, axis=length_axis
            )
            # Keep only non-overlapping windows, i.e. every codon_size-th start.
            step: list[slice] = [slice(None)] * windows.ndim
            step[length_axis] = slice(None, None, codon_size)
            codons = windows[tuple(step)]
            return gufunc_translate(
                codons.view(np.uint8),
                self.codon_array.view(np.uint8),
                self.aa_array.view(np.uint8),
                # The codon characters always sit on the trailing window axis.
                axes=[-1, (-2, -1), (-1), ()],  # type: ignore
            ).view("S1")

        # --- Ragged path ---
        # Pack to ListOffsetArray so .data and .offsets are contiguous and valid.
        seqs = Ragged(ak.to_packed(seqs))

        is_ohe = is_rag_dtype(seqs, np.uint8)

        codon_size = self.codon_array.shape[-1]
        lengths = seqs.lengths.ravel()
        offsets = seqs.offsets  # 1D (n+1,) after to_packed

        if (lengths % codon_size != 0).any():
            raise ValueError(
                "All Ragged sequence lengths must be divisible by codon length."
            )

        n = len(lengths)

        # Decode OHE → bytes if needed. seqs.data is (total, n_nuc) for OHE or (total,) for bytes.
        if is_ohe:
            if nuc_alphabet is None:
                raise ValueError("nuc_alphabet is required for OHE Ragged input.")
            nuc_bytes_flat: NDArray[np.bytes_] = nuc_alphabet.decode_ohe(  # type: ignore[union-attr]
                seqs.data, ohe_axis=-1
            )
        else:
            nuc_bytes_flat = seqs.data

        # Translate the entire flat buffer in one vectorized call. This is valid because
        # translation is invariant to splitting/concatenation when all lengths % codon_size == 0.
        total = nuc_bytes_flat.shape[0]
        if total > 0:
            codons = np.lib.stride_tricks.sliding_window_view(
                nuc_bytes_flat, codon_size, axis=0
            )[::codon_size, :]  # (total // codon_size, codon_size)
            translated_flat: NDArray[np.bytes_] = gufunc_translate(
                codons.view(np.uint8),
                self.codon_array.view(np.uint8),
                self.aa_array.view(np.uint8),
                axes=[1, (-2, -1), (-1), ()],  # type: ignore
            ).view("S1")  # (total // codon_size,)
        else:
            translated_flat = np.empty(0, dtype="S1")

        new_offsets = offsets // codon_size  # (n+1,) position-based in translated_flat

        if truncate_stop:
            starts = new_offsets[:-1].astype(np.int64)
            full_ends = new_offsets[1:].astype(np.int64)
            ends = _nb_find_stop_ends(
                translated_flat.view(np.uint8), starts, full_ends, np.uint8(ord("*"))
            )
            out_offsets = np.stack(
                [starts, ends]
            )  # (2, n) — ListArray (non-contiguous view)
        else:
            out_offsets = new_offsets  # 1D — ListOffsetArray

        if is_ohe:
            n_aa = len(self.aa_array)
            ohe_flat = self.ohe(translated_flat).flatten()
            return Ragged.from_offsets(ohe_flat, (n, None, n_aa), out_offsets)
        else:
            return Ragged.from_offsets(translated_flat, (n, None), out_offsets)

    def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
        """One hot encode an amino acid sequence.

        Parameters
        ----------
        seqs
            Amino acid sequences; passed through ``cast_seqs`` before encoding.

        Returns
        -------
        NDArray[np.uint8]
            One-hot encoded sequences; last axis is alphabet size, second-to-last is
            sequence length.
        """
        _seqs = cast_seqs(seqs)
        return gufunc_ohe(_seqs.view(np.uint8), self.aa_array.view(np.uint8))

    def decode_ohe(
        self,
        seqs: NDArray[np.uint8],
        ohe_axis: int,
        unknown_char: str = "X",
    ) -> NDArray[np.bytes_]:
        """Convert an OHE array to an S1 byte array.

        Parameters
        ----------
        seqs
            One-hot encoded sequences.
        ohe_axis
            Axis of ``seqs`` that holds the one-hot encoding; may be negative.
        unknown_char
            Single character to use for unknown values, by default "X"

        Returns
        -------
        NDArray[np.bytes_]
            S1 byte array of decoded characters; ohe_axis is removed from the shape.
        """
        idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

        if ohe_axis < 0:
            ohe_axis_idx = seqs.ndim + ohe_axis
        else:
            ohe_axis_idx = ohe_axis

        # Input shape with the one-hot axis dropped.
        shape = *seqs.shape[:ohe_axis_idx], *seqs.shape[ohe_axis_idx + 1 :]

        # unknown_char is appended as the final lookup entry.
        # NOTE(review): presumably gufunc_ohe_char_idx returns an index that
        # selects this last entry for positions with no hot value — confirm.
        _alphabet = np.concatenate([self.aa_array, [unknown_char.encode("ascii")]])

        return _alphabet[idx].reshape(shape)

__init__(codons, amino_acids)

Construct an alphabet of amino acids and their mappings to codons.

Parameters:

Name Type Description Default
codons list[str]

List of codons.

required
amino_acids list[str]

List of amino acids, in the same order

required

Raises:

Type Description
ValueError

If the codons do not all have the same length.

ValueError

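If an amino acid symbol is not exactly one character, or if the number of codons differs from the number of amino acids.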

Source code in python/seqpro/alphabets/_alphabets.py
def __init__(self, codons: list[str], amino_acids: list[str]) -> None:
    """Construct an alphabet of amino acids and their mappings to codons.

    Parameters
    ----------
    codons
        List of codons. All codons must have the same length.
    amino_acids
        List of single-character amino acid symbols, in the same order as
        ``codons``.

    Raises
    ------
    ValueError
        If ``codons`` is empty.
    ValueError
        If the codons do not all have the same length.
    ValueError
        If any amino acid symbol is not exactly one character.
    ValueError
        If the number of codons differs from the number of amino acids.
    """
    # Guard before indexing codons[0]; otherwise an empty table surfaces as
    # an unhelpful IndexError.
    if len(codons) == 0:
        raise ValueError("Got an empty list of codons.")
    k = len(codons[0])
    if any(len(c) != k for c in codons):
        raise ValueError("Got codons with varying lengths.")
    if any(len(a) != 1 for a in amino_acids):
        raise ValueError("Got amino acid symbols that are multiple characters.")
    if len(codons) != len(amino_acids):
        raise ValueError(
            "Got different number of codons and amino acids for mapping."
        )

    self.codons = codons
    self.amino_acids = amino_acids

    # (n_codons,) array of k-byte strings -> add a trailing axis and view as
    # S1 so every codon becomes a row of k single bytes.
    self.codon_array = np.array(codons, "S")[..., None].view("S1")
    self.aa_array = np.array(amino_acids, "S1")

    self.codon_to_aa = dict(zip(codons, amino_acids))

decode_ohe(seqs, ohe_axis, unknown_char='X')

Convert an OHE array to an S1 byte array.

Parameters:

Name Type Description Default
seqs NDArray[uint8]
required
ohe_axis int
required
unknown_char str

Single character to use for unknown values, by default "X"

'X'

Returns:

Type Description
NDArray[bytes_]

S1 byte array of decoded characters; ohe_axis is removed from the shape.

Source code in python/seqpro/alphabets/_alphabets.py
def decode_ohe(
    self,
    seqs: NDArray[np.uint8],
    ohe_axis: int,
    unknown_char: str = "X",
) -> NDArray[np.bytes_]:
    """Decode one-hot encoded amino acids into an S1 byte array.

    Parameters
    ----------
    seqs
        One-hot encoded sequences.
    ohe_axis
        Axis of ``seqs`` holding the one-hot encoding; may be negative.
    unknown_char
        Single character to use for unknown values, by default "X"

    Returns
    -------
    NDArray[np.bytes_]
        S1 byte array of decoded characters; ohe_axis is removed from the shape.
    """
    char_idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

    # Resolve a negative axis to its non-negative position.
    axis_pos = seqs.ndim + ohe_axis if ohe_axis < 0 else ohe_axis

    # Output shape is the input shape with the one-hot axis dropped.
    out_shape = seqs.shape[:axis_pos] + seqs.shape[axis_pos + 1 :]

    # The unknown character is appended as the final lookup entry.
    lookup = np.concatenate([self.aa_array, [unknown_char.encode("ascii")]])

    decoded = lookup[char_idx]
    return decoded.reshape(out_shape)

ohe(seqs)

One hot encode an amino acid sequence.

Parameters:

Name Type Description Default
seqs StrSeqType
required

Returns:

Type Description
NDArray[uint8]

One-hot encoded sequences; last axis is alphabet size, second-to-last is sequence length.

Source code in python/seqpro/alphabets/_alphabets.py
def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
    """One-hot encode amino acid sequences against this alphabet.

    Parameters
    ----------
    seqs
        Amino acid sequences; passed through ``cast_seqs`` before encoding.

    Returns
    -------
    NDArray[np.uint8]
        One-hot encoded sequences; last axis is alphabet size, second-to-last is
        sequence length.
    """
    # Both operands are reinterpreted as raw byte values for the gufunc.
    seq_codes = cast_seqs(seqs).view(np.uint8)
    alphabet_codes = self.aa_array.view(np.uint8)
    return gufunc_ohe(seq_codes, alphabet_codes)

translate(seqs, length_axis=None, *, nuc_alphabet=None, truncate_stop=False)

translate(seqs: StrSeqType, length_axis: int | None = None, *, nuc_alphabet: NucleotideAlphabet | None = None, truncate_stop: bool = False) -> NDArray[np.bytes_]
translate(seqs: Ragged[np.bytes_], length_axis: int | None = None, *, nuc_alphabet: NucleotideAlphabet | None = None, truncate_stop: bool = False) -> Ragged[np.bytes_]
translate(seqs: Ragged[np.uint8], length_axis: int | None = None, *, nuc_alphabet: NucleotideAlphabet, truncate_stop: bool = False) -> Ragged[np.uint8]

Translate nucleotide sequences to amino acids.

Parameters:

Name Type Description Default
seqs StrSeqType | Ragged[bytes_] | Ragged[uint8]

Nucleotide sequences. Ragged inputs must have all lengths divisible by the codon size. For OHE Ragged (uint8), nuc_alphabet is required.

required
length_axis int | None

Only used for non-Ragged array input.

None
nuc_alphabet NucleotideAlphabet | None

Required when seqs is a Ragged OHE (uint8) array, to decode OHE → bytes.

None
truncate_stop bool

When True, each output sequence is truncated at the first stop codon (inclusive). Only valid for Ragged input. Default False.

False

Returns:

Type Description
NDArray[bytes_] | Ragged[bytes_] | Ragged[uint8]

Translated amino acid sequences in the same container type as the input.

Source code in python/seqpro/alphabets/_alphabets.py
def translate(
    self,
    seqs: StrSeqType | Ragged[np.bytes_] | Ragged[np.uint8],
    length_axis: int | None = None,
    *,
    nuc_alphabet: NucleotideAlphabet | None = None,
    truncate_stop: bool = False,
) -> NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]:
    """Translate nucleotide sequences to amino acids.

    Parameters
    ----------
    seqs
        Nucleotide sequences. Ragged inputs must have all lengths divisible by
        the codon size. For OHE Ragged (uint8), nuc_alphabet is required.
    length_axis
        Only used for non-Ragged array input. Defaults to the last axis when
        None. The length along this axis must be divisible by the codon size.
    nuc_alphabet
        Required when seqs is a Ragged OHE (uint8) array, to decode OHE → bytes.
        Not used for non-Ragged input.
    truncate_stop
        When True, each output sequence is truncated at the first stop codon
        (inclusive). Only valid for Ragged input; the non-Ragged path does not
        read this flag. Default False.

    Returns
    -------
    NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]
        Translated amino acid sequences in the same container type as the input.

    Raises
    ------
    ValueError
        If a sequence length is not divisible by the codon size, or if
        ``seqs`` is a Ragged OHE array and ``nuc_alphabet`` is None.
    """

    if not isinstance(seqs, Ragged):
        check_axes(seqs, length_axis, False)
        seqs = cast_seqs(seqs)
        codon_size = self.codon_array.shape[-1]
        if length_axis is None:
            length_axis = -1
        if seqs.shape[length_axis] % codon_size != 0:
            raise ValueError(
                "Sequence length is not evenly divisible by codon length."
            )
        if length_axis < 0:
            length_axis = seqs.ndim + length_axis
        if seqs.shape[length_axis] == 0:
            # sliding_window_view rejects windows larger than the axis;
            # zero nucleotides translate to zero amino acids.
            return np.empty(seqs.shape, dtype="S1")
        # Window along the *length* axis (not unconditionally the last one).
        # The window contents land on a new trailing axis.
        windows = np.lib.stride_tricks.sliding_window_view(
            seqs, window_shape=codon_size, axis=length_axis
        )
        # Keep only non-overlapping windows, i.e. every codon_size-th start.
        step: list[slice] = [slice(None)] * windows.ndim
        step[length_axis] = slice(None, None, codon_size)
        codons = windows[tuple(step)]
        return gufunc_translate(
            codons.view(np.uint8),
            self.codon_array.view(np.uint8),
            self.aa_array.view(np.uint8),
            # The codon characters always sit on the trailing window axis.
            axes=[-1, (-2, -1), (-1), ()],  # type: ignore
        ).view("S1")

    # --- Ragged path ---
    # Pack to ListOffsetArray so .data and .offsets are contiguous and valid.
    seqs = Ragged(ak.to_packed(seqs))

    is_ohe = is_rag_dtype(seqs, np.uint8)

    codon_size = self.codon_array.shape[-1]
    lengths = seqs.lengths.ravel()
    offsets = seqs.offsets  # 1D (n+1,) after to_packed

    if (lengths % codon_size != 0).any():
        raise ValueError(
            "All Ragged sequence lengths must be divisible by codon length."
        )

    n = len(lengths)

    # Decode OHE → bytes if needed. seqs.data is (total, n_nuc) for OHE or (total,) for bytes.
    if is_ohe:
        if nuc_alphabet is None:
            raise ValueError("nuc_alphabet is required for OHE Ragged input.")
        nuc_bytes_flat: NDArray[np.bytes_] = nuc_alphabet.decode_ohe(  # type: ignore[union-attr]
            seqs.data, ohe_axis=-1
        )
    else:
        nuc_bytes_flat = seqs.data

    # Translate the entire flat buffer in one vectorized call. This is valid because
    # translation is invariant to splitting/concatenation when all lengths % codon_size == 0.
    total = nuc_bytes_flat.shape[0]
    if total > 0:
        codons = np.lib.stride_tricks.sliding_window_view(
            nuc_bytes_flat, codon_size, axis=0
        )[::codon_size, :]  # (total // codon_size, codon_size)
        translated_flat: NDArray[np.bytes_] = gufunc_translate(
            codons.view(np.uint8),
            self.codon_array.view(np.uint8),
            self.aa_array.view(np.uint8),
            axes=[1, (-2, -1), (-1), ()],  # type: ignore
        ).view("S1")  # (total // codon_size,)
    else:
        translated_flat = np.empty(0, dtype="S1")

    new_offsets = offsets // codon_size  # (n+1,) position-based in translated_flat

    if truncate_stop:
        starts = new_offsets[:-1].astype(np.int64)
        full_ends = new_offsets[1:].astype(np.int64)
        ends = _nb_find_stop_ends(
            translated_flat.view(np.uint8), starts, full_ends, np.uint8(ord("*"))
        )
        out_offsets = np.stack(
            [starts, ends]
        )  # (2, n) — ListArray (non-contiguous view)
    else:
        out_offsets = new_offsets  # 1D — ListOffsetArray

    if is_ohe:
        n_aa = len(self.aa_array)
        ohe_flat = self.ohe(translated_flat).flatten()
        return Ragged.from_offsets(ohe_flat, (n, None, n_aa), out_offsets)
    else:
        return Ragged.from_offsets(translated_flat, (n, None), out_offsets)

NucleotideAlphabet

Source code in python/seqpro/alphabets/_alphabets.py
class NucleotideAlphabet:
    alphabet: str
    """Alphabet excluding ambiguous characters e.g. "N" for DNA."""
    complement: str
    # Alphabet characters as an S1 array, one entry per character.
    array: NDArray[np.bytes_]
    # Character -> complement character, as str and as single-byte bytes.
    complement_map: dict[str, str]
    complement_map_bytes: dict[bytes, bytes]
    # Translation tables built by str.maketrans / bytes.maketrans.
    str_comp_table: dict[int, str]
    bytes_comp_table: bytes
    # bytes_comp_table viewed as an S1 array (one entry per byte value).
    bytes_comp_array: NDArray[np.bytes_]

    def __init__(self, alphabet: str, complement: str) -> None:
        """Parse and validate sequence alphabets.

        Nucleic acid alphabets must be complemented by being reversed (without the
        unknown character). For example, `reverse(ACGT) = complement(ACGT) = TGCA`.

        Parameters
        ----------
        alphabet
            For example, DNA could be 'ACGT'.
        complement
            Complement of the alphabet, to continue the example this would be 'TGCA'.

        Raises
        ------
        ValueError
            If either string has repeated characters, the two strings differ in
            length, or the complement is not the reverse of the alphabet.
        """
        self._validate(alphabet, complement)
        self.alphabet = alphabet
        self.complement = complement
        self.array = cast(
            NDArray[np.bytes_], np.frombuffer(self.alphabet.encode("ascii"), "|S1")
        )
        self.complement_map = dict(zip(list(self.alphabet), list(self.complement)))
        self.complement_map_bytes = {
            k.encode("ascii"): v.encode("ascii") for k, v in self.complement_map.items()
        }
        self.str_comp_table = str.maketrans(self.complement_map)
        self.bytes_comp_table = bytes.maketrans(
            self.alphabet.encode("ascii"), self.complement.encode("ascii")
        )
        self.bytes_comp_array = np.frombuffer(self.bytes_comp_table, "S1")

    def __len__(self):
        """Return the number of characters in the alphabet."""
        return len(self.alphabet)

    def _validate(self, alphabet: str, complement: str):
        """Check that ``complement`` is a valid complement of ``alphabet``.

        Raises
        ------
        ValueError
            If either string has repeated characters, the two strings differ in
            length, or reversing ``alphabet`` does not give ``complement``.
        """
        if len(set(alphabet)) != len(alphabet):
            raise ValueError("Alphabet has repeated characters.")

        if len(set(complement)) != len(complement):
            raise ValueError("Complement has repeated characters.")

        # Check lengths explicitly: comparing pairwise with zip() would stop at
        # the shorter string and let a truncated/extended complement through.
        if len(alphabet) != len(complement):
            raise ValueError("Alphabet and complement have different lengths.")

        if alphabet[::-1] != complement:
            raise ValueError("Reverse of alphabet does not yield the complement.")

    def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
        """One hot encode a nucleotide sequence.

        Parameters
        ----------
        seqs
            Nucleotide sequences; passed through ``cast_seqs`` before encoding.

        Returns
        -------
        NDArray[np.uint8]
            One-hot encoded sequences; last axis is alphabet size, second-to-last is
            sequence length.
        """
        _seqs = cast_seqs(seqs)
        return gufunc_ohe(_seqs.view(np.uint8), self.array.view(np.uint8))

    def decode_ohe(
        self,
        seqs: NDArray[np.uint8],
        ohe_axis: int,
        unknown_char: str = "N",
    ) -> NDArray[np.bytes_]:
        """Convert an OHE array to an S1 byte array.

        Parameters
        ----------
        seqs
            One-hot encoded sequences.
        ohe_axis
            Axis of ``seqs`` that holds the one-hot encoding; may be negative.
        unknown_char
            Single character to use for unknown values, by default "N"

        Returns
        -------
        NDArray[np.bytes_]
            S1 byte array of decoded characters; ohe_axis is removed from the shape.
        """
        idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

        if ohe_axis < 0:
            ohe_axis_idx = seqs.ndim + ohe_axis
        else:
            ohe_axis_idx = ohe_axis

        # Input shape with the one-hot axis dropped.
        shape = *seqs.shape[:ohe_axis_idx], *seqs.shape[ohe_axis_idx + 1 :]

        # unknown_char is appended as the final lookup entry.
        # NOTE(review): presumably gufunc_ohe_char_idx returns an index that
        # selects this last entry for positions with no hot value — confirm.
        _alphabet = np.concatenate([self.array, [unknown_char.encode("ascii")]])

        return _alphabet[idx].reshape(shape)

    def _complement_bytes(
        self, byte_arr: NDArray[np.bytes_], out: NDArray[np.bytes_] | None = None
    ) -> NDArray[np.bytes_]:
        """Complement an S1 array via the byte lookup table, without reversing."""
        if out is None:
            _out = out
        else:
            # The gufunc operates on raw byte values, so hand it a uint8 view.
            _out = out.view(np.uint8)
        _out = gufunc_complement_bytes(
            byte_arr.view(np.uint8), self.bytes_comp_array.view(np.uint8), _out
        )
        return _out.view("S1")

    def _rev_comp_byte(
        self,
        byte_arr: NDArray[np.bytes_],
        length_axis: int,
        out: NDArray[np.bytes_] | None = None,
    ) -> NDArray[np.bytes_]:
        """Complement ``byte_arr`` and return it flipped along ``length_axis``."""
        out = self._complement_bytes(byte_arr, out)
        # NOTE(review): np.flip returns a reversed view, so a caller-supplied
        # `out` presumably ends up holding the complement in forward order
        # while only the returned array is reversed — confirm this is intended.
        return np.flip(out, length_axis)

    @overload
    def reverse_complement(
        self,
        seqs: StrSeqType,
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.bytes_] | None = None,
    ) -> NDArray[np.bytes_]: ...
    @overload
    def reverse_complement(
        self,
        seqs: NDArray[np.uint8],
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.uint8] | None = None,
    ) -> NDArray[np.uint8]: ...
    @overload
    def reverse_complement(
        self,
        seqs: SeqType,
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.bytes_ | np.uint8] | None = None,
    ) -> NDArray[np.bytes_ | np.uint8]: ...
    def reverse_complement(
        self,
        seqs: SeqType,
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.bytes_ | np.uint8] | None = None,
    ) -> NDArray[np.bytes_ | np.uint8]:
        """Reverse complement a sequence.

        Parameters
        ----------
        seqs
            Sequences as strings/bytes, or as a one-hot encoded uint8 array.
        length_axis
            Length axis, by default None. For byte input None means the last
            axis; for OHE input it must be given.
        ohe_axis
            One hot encoding axis, by default None. Must be given for OHE input.
        out
            Optional output array whose dtype matches the input kind.

        Returns
        -------
        NDArray[np.bytes_ | np.uint8]
            Reverse-complemented sequences as S1 bytes or uint8 for OHE input.

        Raises
        ------
        ValueError
            If the sequences are neither bytes nor uint8 after casting.
        """
        check_axes(seqs, length_axis, ohe_axis)

        seqs_ = cast_seqs(seqs)

        if is_dtype(seqs_, np.bytes_):
            assert out is None or is_dtype(out, np.bytes_)
            if length_axis is None:
                length_axis = -1
            return self._rev_comp_byte(seqs_, length_axis, out)
        elif is_dtype(seqs_, np.uint8):  # OHE
            assert length_axis is not None
            assert ohe_axis is not None
            assert out is None or is_dtype(out, np.uint8)
            # The complement string is the reversed alphabet, so flipping the
            # one-hot axis complements and flipping the length axis reverses.
            out_ = np.flip(seqs_, axis=(length_axis, ohe_axis))
            if out is not None:
                out[:] = out_
                out_ = out
            return out_
        else:
            raise ValueError("Invalid sequence type.")

alphabet = alphabet instance-attribute

Alphabet excluding ambiguous characters e.g. "N" for DNA.

__init__(alphabet, complement)

Parse and validate sequence alphabets.

Nucleic acid alphabets must be complemented by being reversed (without the unknown character). For example, reverse(ACGT) = complement(ACGT) = TGCA.

Parameters:

Name Type Description Default
alphabet str

For example, DNA could be 'ACGT'.

required
complement str

Complement of the alphabet, to continue the example this would be 'TGCA'.

required
Source code in python/seqpro/alphabets/_alphabets.py
def __init__(self, alphabet: str, complement: str) -> None:
    """Parse and validate sequence alphabets.

    Nucleic acid alphabets must be complemented by being reversed (without the
    unknown character). For example, `reverse(ACGT) = complement(ACGT) = TGCA`.

    Parameters
    ----------
    alphabet
        For example, DNA could be 'ACGT'.
    complement
        Complement of the alphabet, to continue the example this would be 'TGCA'.
    """
    self._validate(alphabet, complement)
    self.alphabet = alphabet
    self.complement = complement
    self.array = cast(
        NDArray[np.bytes_], np.frombuffer(self.alphabet.encode("ascii"), "|S1")
    )
    self.complement_map = dict(zip(list(self.alphabet), list(self.complement)))
    self.complement_map_bytes = {
        k.encode("ascii"): v.encode("ascii") for k, v in self.complement_map.items()
    }
    self.str_comp_table = str.maketrans(self.complement_map)
    self.bytes_comp_table = bytes.maketrans(
        self.alphabet.encode("ascii"), self.complement.encode("ascii")
    )
    self.bytes_comp_array = np.frombuffer(self.bytes_comp_table, "S1")

decode_ohe(seqs, ohe_axis, unknown_char='N')

Convert an OHE array to an S1 byte array.

Parameters:

Name Type Description Default
seqs NDArray[uint8]
required
ohe_axis int
required
unknown_char str

Single character to use for unknown values, by default "N"

'N'

Returns:

Type Description
NDArray[bytes_]

S1 byte array of decoded characters; ohe_axis is removed from the shape.

Source code in python/seqpro/alphabets/_alphabets.py
def decode_ohe(
    self,
    seqs: NDArray[np.uint8],
    ohe_axis: int,
    unknown_char: str = "N",
) -> NDArray[np.bytes_]:
    """Decode one-hot encoded nucleotides into an S1 byte array.

    Parameters
    ----------
    seqs
        One-hot encoded sequences.
    ohe_axis
        Axis of ``seqs`` holding the one-hot encoding; may be negative.
    unknown_char
        Single character to use for unknown values, by default "N"

    Returns
    -------
    NDArray[np.bytes_]
        S1 byte array of decoded characters; ohe_axis is removed from the shape.
    """
    char_idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

    # Resolve a negative axis to its non-negative position.
    axis_pos = seqs.ndim + ohe_axis if ohe_axis < 0 else ohe_axis

    # Output shape is the input shape with the one-hot axis dropped.
    out_shape = seqs.shape[:axis_pos] + seqs.shape[axis_pos + 1 :]

    # The unknown character is appended as the final lookup entry.
    lookup = np.concatenate([self.array, [unknown_char.encode("ascii")]])

    decoded = lookup[char_idx]
    return decoded.reshape(out_shape)

ohe(seqs)

One hot encode a nucleotide sequence.

Parameters:

Name Type Description Default
seqs StrSeqType
required

Returns:

Type Description
NDArray[uint8]

One-hot encoded sequences; last axis is alphabet size, second-to-last is sequence length.

Source code in python/seqpro/alphabets/_alphabets.py
def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
    """One-hot encode nucleotide sequences against this alphabet.

    Parameters
    ----------
    seqs
        Nucleotide sequences; passed through ``cast_seqs`` before encoding.

    Returns
    -------
    NDArray[np.uint8]
        One-hot encoded sequences; last axis is alphabet size, second-to-last is
        sequence length.
    """
    # Both operands are reinterpreted as raw byte values for the gufunc.
    seq_codes = cast_seqs(seqs).view(np.uint8)
    alphabet_codes = self.array.view(np.uint8)
    return gufunc_ohe(seq_codes, alphabet_codes)

reverse_complement(seqs, length_axis=None, ohe_axis=None, out=None)

reverse_complement(seqs: StrSeqType, length_axis: int | None = None, ohe_axis: int | None = None, out: NDArray[np.bytes_] | None = None) -> NDArray[np.bytes_]
reverse_complement(seqs: NDArray[np.uint8], length_axis: int | None = None, ohe_axis: int | None = None, out: NDArray[np.uint8] | None = None) -> NDArray[np.uint8]
reverse_complement(seqs: SeqType, length_axis: int | None = None, ohe_axis: int | None = None, out: NDArray[np.bytes_ | np.uint8] | None = None) -> NDArray[np.bytes_ | np.uint8]

Reverse complement a sequence.

Parameters:

Name Type Description Default
seqs SeqType
required
length_axis int | None

Length axis, by default None

None
ohe_axis int | None

One hot encoding axis, by default None

None

Returns:

Type Description
NDArray[bytes_ | uint8]

Reverse-complemented sequences as S1 bytes or uint8 for OHE input.

Source code in python/seqpro/alphabets/_alphabets.py
def reverse_complement(
    self,
    seqs: SeqType,
    length_axis: int | None = None,
    ohe_axis: int | None = None,
    out: NDArray[np.bytes_ | np.uint8] | None = None,
) -> NDArray[np.bytes_ | np.uint8]:
    """Return the reverse complement of byte or one-hot encoded sequences.

    Parameters
    ----------
    seqs
        Sequences as strings/bytes, or as a one-hot encoded uint8 array.
    length_axis
        Length axis, by default None. For byte input None means the last axis;
        for OHE input it must be given.
    ohe_axis
        One hot encoding axis, by default None. Must be given for OHE input.
    out
        Optional output array whose dtype matches the input kind.

    Returns
    -------
    NDArray[np.bytes_ | np.uint8]
        Reverse-complemented sequences as S1 bytes or uint8 for OHE input.
    """
    check_axes(seqs, length_axis, ohe_axis)
    arr = cast_seqs(seqs)

    # Byte sequences: complement via lookup table, then flip the length axis.
    if is_dtype(arr, np.bytes_):
        assert out is None or is_dtype(out, np.bytes_)
        axis = -1 if length_axis is None else length_axis
        return self._rev_comp_byte(arr, axis, out)

    if not is_dtype(arr, np.uint8):
        raise ValueError("Invalid sequence type.")

    # OHE
    assert length_axis is not None
    assert ohe_axis is not None
    assert out is None or is_dtype(out, np.uint8)
    flipped = np.flip(arr, axis=(length_axis, ohe_axis))
    if out is None:
        return flipped
    # Copy into the caller's buffer and hand that buffer back.
    out[:] = flipped
    return out