Alphabets¶

`AA = AminoAlphabet(*(map(list, zip(*(canonical_codons_to_aas.items())))))` `module-attribute` ¶

`DNA = NucleotideAlphabet('ACGT', 'TGCA')` `module-attribute` ¶

`RNA = NucleotideAlphabet(alphabet='ACGU', complement='UGCA')` `module-attribute` ¶

`AminoAlphabet` ¶

Source code in python/seqpro/alphabets/_alphabets.py

class AminoAlphabet:
    """Amino acid alphabet together with the codon table that maps onto it."""

    # Codons and amino acid symbols exactly as passed to the constructor
    # (parallel lists: codons[i] translates to amino_acids[i]).
    codons: list[str]
    amino_acids: list[str]
    # S1 view of the codons, one row per codon and one column per character.
    codon_array: NDArray[np.bytes_]
    # S1 array of the amino acid symbols, parallel to the rows of codon_array.
    aa_array: NDArray[np.bytes_]
    # Lookup from codon string to amino acid symbol.
    codon_to_aa: dict[str, str]

    def __init__(self, codons: list[str], amino_acids: list[str]) -> None:
        """Construct an alphabet of amino acids and their mappings to codons.

        Parameters
        ----------
        codons
            List of codons. Must be non-empty and all of the same length.
        amino_acids
            List of single-character amino acid symbols, in the same order as
            `codons`.

        Raises
        ------
        ValueError
            If no codons are given.
        ValueError
            If the codons do not all have the same length.
        ValueError
            If any amino acid symbol is not exactly one character.
        ValueError
            If the number of codons differs from the number of amino acids.
        """
        # Guard before indexing codons[0]; otherwise an empty table surfaces as
        # an unrelated IndexError instead of a validation error.
        if len(codons) == 0:
            raise ValueError("Got an empty list of codons.")
        k = len(codons[0])
        if any(len(c) != k for c in codons):
            raise ValueError("Got codons with varying lengths.")
        if any(len(a) != 1 for a in amino_acids):
            raise ValueError("Got amino acid symbols that are multiple characters.")
        if len(codons) != len(amino_acids):
            raise ValueError(
                "Got different number of codons and amino acids for mapping."
            )

        self.codons = codons
        self.amino_acids = amino_acids

        # Fixed-width byte strings reinterpreted as single bytes:
        # (n_codons,) of S<k>  ->  (n_codons, k) of S1.
        self.codon_array = np.array(codons, "S")[..., None].view("S1")
        self.aa_array = np.array(amino_acids, "S1")

        self.codon_to_aa = dict(zip(codons, amino_acids))

    @overload
    def translate(
        self,
        seqs: StrSeqType,
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet | None = None,
        truncate_stop: bool = False,
    ) -> NDArray[np.bytes_]: ...
    @overload
    def translate(
        self,
        seqs: Ragged[np.bytes_],
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet | None = None,
        truncate_stop: bool = False,
    ) -> Ragged[np.bytes_]: ...
    @overload
    def translate(
        self,
        seqs: Ragged[np.uint8],
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet,
        truncate_stop: bool = False,
    ) -> Ragged[np.uint8]: ...
    def translate(
        self,
        seqs: StrSeqType | Ragged[np.bytes_] | Ragged[np.uint8],
        length_axis: int | None = None,
        *,
        nuc_alphabet: NucleotideAlphabet | None = None,
        truncate_stop: bool = False,
    ) -> NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]:
        """Translate nucleotide sequences to amino acids.

        Parameters
        ----------
        seqs
            Nucleotide sequences. Ragged inputs must have all lengths divisible by
            the codon size. For OHE Ragged (uint8), nuc_alphabet is required.
        length_axis
            Only used for non-Ragged array input. Defaults to the last axis when
            None.
        nuc_alphabet
            Required when seqs is a Ragged OHE (uint8) array, to decode OHE → bytes.
            Not read on any other path.
        truncate_stop
            When True, each output sequence is truncated at the first stop codon
            (inclusive). Only valid for Ragged input; the non-Ragged path returns
            without reading this flag. Default False.

        Returns
        -------
        NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]
            Translated amino acid sequences in the same container type as the input.

        Raises
        ------
        ValueError
            If a sequence length is not divisible by the codon length.
        ValueError
            If seqs is a Ragged OHE array and nuc_alphabet is None.
        """

        if not isinstance(seqs, Ragged):
            check_axes(seqs, length_axis, False)
            seqs = cast_seqs(seqs)
            codon_size = self.codon_array.shape[-1]
            if length_axis is None:
                length_axis = -1
            if seqs.shape[length_axis] % codon_size != 0:
                raise ValueError(
                    "Sequence length is not evenly divisible by codon length."
                )
            if length_axis < 0:
                length_axis = seqs.ndim + length_axis
            # Every window of codon_size characters, then keep every
            # codon_size-th window so the kept windows do not overlap.
            # NOTE(review): the window is always taken over the last axis
            # (axis=-1) regardless of `length_axis`, while the divisibility
            # check and `codon_axis` use `length_axis`. These agree when the
            # length axis is the last one; confirm behaviour for other axes.
            codons = np.lib.stride_tricks.sliding_window_view(
                seqs, window_shape=codon_size, axis=-1
            )[..., ::codon_size, :]
            codon_axis = length_axis + 1
            return gufunc_translate(
                codons.view(np.uint8),
                self.codon_array.view(np.uint8),
                self.aa_array.view(np.uint8),
                axes=[codon_axis, (-2, -1), (-1), ()],  # type: ignore
            ).view("S1")

        # --- Ragged path ---
        # Pack to ListOffsetArray so .data and .offsets are contiguous and valid.
        seqs = Ragged(ak.to_packed(seqs))

        is_ohe = is_rag_dtype(seqs, np.uint8)

        codon_size = self.codon_array.shape[-1]
        lengths = seqs.lengths.ravel()
        offsets = seqs.offsets  # 1D (n+1,) after to_packed

        if (lengths % codon_size != 0).any():
            raise ValueError(
                "All Ragged sequence lengths must be divisible by codon length."
            )

        n = len(lengths)

        # Decode OHE → bytes if needed. seqs.data is (total, n_nuc) for OHE or (total,) for bytes.
        if is_ohe:
            if nuc_alphabet is None:
                raise ValueError("nuc_alphabet is required for OHE Ragged input.")
            nuc_bytes_flat: NDArray[np.bytes_] = nuc_alphabet.decode_ohe(  # type: ignore[union-attr]
                seqs.data, ohe_axis=-1
            )
        else:
            nuc_bytes_flat = seqs.data

        # Translate the entire flat buffer in one vectorized call. This is valid because
        # translation is invariant to splitting/concatenation when all lengths % codon_size == 0.
        total = nuc_bytes_flat.shape[0]
        if total > 0:
            codons = np.lib.stride_tricks.sliding_window_view(
                nuc_bytes_flat, codon_size, axis=0
            )[::codon_size, :]  # (total // codon_size, codon_size)
            translated_flat: NDArray[np.bytes_] = gufunc_translate(
                codons.view(np.uint8),
                self.codon_array.view(np.uint8),
                self.aa_array.view(np.uint8),
                axes=[1, (-2, -1), (-1), ()],  # type: ignore
            ).view("S1")  # (total // codon_size,)
        else:
            # Nothing to window over; skip the stride trick entirely.
            translated_flat = np.empty(0, dtype="S1")

        new_offsets = offsets // codon_size  # (n+1,) position-based in translated_flat

        if truncate_stop:
            starts = new_offsets[:-1].astype(np.int64)
            full_ends = new_offsets[1:].astype(np.int64)
            # "*" is the byte handed to the helper as the stop marker.
            ends = _nb_find_stop_ends(
                translated_flat.view(np.uint8), starts, full_ends, np.uint8(ord("*"))
            )
            out_offsets = np.stack(
                [starts, ends]
            )  # (2, n) — ListArray (non-contiguous view)
        else:
            out_offsets = new_offsets  # 1D — ListOffsetArray

        if is_ohe:
            # OHE in, OHE out: re-encode the translated bytes with this alphabet.
            n_aa = len(self.aa_array)
            ohe_flat = self.ohe(translated_flat).flatten()
            return Ragged.from_offsets(ohe_flat, (n, None, n_aa), out_offsets)
        else:
            return Ragged.from_offsets(translated_flat, (n, None), out_offsets)

    def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
        """One hot encode an amino acid sequence.

        Parameters
        ----------
        seqs
            Amino acid sequences; passed through `cast_seqs` before encoding.

        Returns
        -------
        NDArray[np.uint8]
            One-hot encoded sequences; last axis is alphabet size, second-to-last is
            sequence length.
        """
        _seqs = cast_seqs(seqs)
        return gufunc_ohe(_seqs.view(np.uint8), self.aa_array.view(np.uint8))

    def decode_ohe(
        self,
        seqs: NDArray[np.uint8],
        ohe_axis: int,
        unknown_char: str = "X",
    ) -> NDArray[np.bytes_]:
        """Convert an OHE array to an S1 byte array.

        Parameters
        ----------
        seqs
            One-hot encoded amino acid sequences.
        ohe_axis
            Axis of `seqs` holding the one-hot encoding; may be negative.
        unknown_char
            Single character to use for unknown values, by default "X"

        Returns
        -------
        NDArray[np.bytes_]
            S1 byte array of decoded characters; ohe_axis is removed from the shape.
        """
        idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

        # Normalize a negative axis so it can be used for shape slicing.
        if ohe_axis < 0:
            ohe_axis_idx = seqs.ndim + ohe_axis
        else:
            ohe_axis_idx = ohe_axis

        shape = *seqs.shape[:ohe_axis_idx], *seqs.shape[ohe_axis_idx + 1 :]

        # The unknown character sits one position past the last alphabet symbol
        # (presumably the index the gufunc emits for unknowns — confirm).
        _alphabet = np.concatenate([self.aa_array, [unknown_char.encode("ascii")]])

        return _alphabet[idx].reshape(shape)

`__init__(codons, amino_acids)` ¶

Construct an alphabet of amino acids and their mappings to codons.

Parameters:

Name	Type	Description	Default
`codons`	`list[str]`	List of codons.	required
`amino_acids`	`list[str]`	List of amino acids, in the same order	required

Raises:

Type	Description
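`ValueError`	If the codons do not all have the same length, or an amino acid symbol is not a single character.
`ValueError`	If the number of codons differs from the number of amino acids.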

Source code in python/seqpro/alphabets/_alphabets.py

def __init__(self, codons: list[str], amino_acids: list[str]) -> None:
    """Construct an alphabet of amino acids and their mappings to codons.

    Parameters
    ----------
    codons
        List of codons. Must be non-empty and all of the same length.
    amino_acids
        List of single-character amino acid symbols, in the same order as
        `codons`.

    Raises
    ------
    ValueError
        If no codons are given.
    ValueError
        If the codons do not all have the same length.
    ValueError
        If any amino acid symbol is not exactly one character.
    ValueError
        If the number of codons differs from the number of amino acids.
    """
    # Guard before indexing codons[0]; otherwise an empty table surfaces as an
    # unrelated IndexError instead of a validation error.
    if len(codons) == 0:
        raise ValueError("Got an empty list of codons.")
    k = len(codons[0])
    if any(len(c) != k for c in codons):
        raise ValueError("Got codons with varying lengths.")
    if any(len(a) != 1 for a in amino_acids):
        raise ValueError("Got amino acid symbols that are multiple characters.")
    if len(codons) != len(amino_acids):
        raise ValueError(
            "Got different number of codons and amino acids for mapping."
        )

    self.codons = codons
    self.amino_acids = amino_acids

    # Fixed-width byte strings reinterpreted as single bytes:
    # (n_codons,) of S<k>  ->  (n_codons, k) of S1.
    self.codon_array = np.array(codons, "S")[..., None].view("S1")
    self.aa_array = np.array(amino_acids, "S1")

    self.codon_to_aa = dict(zip(codons, amino_acids))

`decode_ohe(seqs, ohe_axis, unknown_char='X')` ¶

Convert an OHE array to an S1 byte array.

Parameters:

Name	Type	Description	Default
`seqs`	`NDArray[uint8]`		required
`ohe_axis`	`int`		required
`unknown_char`	`str`	Single character to use for unknown values, by default "X"	`'X'`

Returns:

Type	Description
`NDArray[bytes_]`	S1 byte array of decoded characters; ohe_axis is removed from the shape.

Source code in python/seqpro/alphabets/_alphabets.py

def decode_ohe(
    self,
    seqs: NDArray[np.uint8],
    ohe_axis: int,
    unknown_char: str = "X",
) -> NDArray[np.bytes_]:
    """Decode one-hot encoded amino acids back into single-byte characters.

    Parameters
    ----------
    seqs
        One-hot encoded sequences.
    ohe_axis
        Axis of `seqs` that holds the one-hot encoding; may be negative.
    unknown_char
        Single character to use for unknown values, by default "X"

    Returns
    -------
    NDArray[np.bytes_]
        S1 byte array of decoded characters; ohe_axis is removed from the shape.
    """
    char_idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

    # Resolve a negative axis to its non-negative position for shape slicing.
    axis = seqs.ndim + ohe_axis if ohe_axis < 0 else ohe_axis
    out_shape = seqs.shape[:axis] + seqs.shape[axis + 1 :]

    # Lookup table: the alphabet followed by the unknown character.
    lookup = np.concatenate([self.aa_array, [unknown_char.encode("ascii")]])
    return lookup[char_idx].reshape(out_shape)

`ohe(seqs)` ¶

One hot encode an amino acid sequence.

Parameters:

Name	Type	Description	Default
`seqs`	`StrSeqType`		required

Returns:

Type	Description
`NDArray[uint8]`	One-hot encoded sequences; last axis is alphabet size, second-to-last is sequence length.

Source code in python/seqpro/alphabets/_alphabets.py

def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
    """One-hot encode amino acid sequences against this alphabet.

    Parameters
    ----------
    seqs
        Amino acid sequences; passed through `cast_seqs` before encoding.

    Returns
    -------
    NDArray[np.uint8]
        One-hot encoded sequences; last axis is alphabet size, second-to-last is
        sequence length.
    """
    # Both operands are handed to the gufunc as raw uint8 views.
    seq_codes = cast_seqs(seqs).view(np.uint8)
    alphabet_codes = self.aa_array.view(np.uint8)
    return gufunc_ohe(seq_codes, alphabet_codes)

`translate(seqs, length_axis=None, *, nuc_alphabet=None, truncate_stop=False)` ¶

translate(seqs: StrSeqType, length_axis: int | None = None, *, nuc_alphabet: NucleotideAlphabet | None = None, truncate_stop: bool = False) -> NDArray[np.bytes_]

translate(seqs: Ragged[np.bytes_], length_axis: int | None = None, *, nuc_alphabet: NucleotideAlphabet | None = None, truncate_stop: bool = False) -> Ragged[np.bytes_]

translate(seqs: Ragged[np.uint8], length_axis: int | None = None, *, nuc_alphabet: NucleotideAlphabet, truncate_stop: bool = False) -> Ragged[np.uint8]

Translate nucleotide sequences to amino acids.

Parameters:

Name	Type	Description	Default
`seqs`	`StrSeqType \| Ragged[bytes_] \| Ragged[uint8]`	Nucleotide sequences. Ragged inputs must have all lengths divisible by the codon size. For OHE Ragged (uint8), nuc_alphabet is required.	required
`length_axis`	`int \| None`	Only used for non-Ragged array input.	`None`
`nuc_alphabet`	`NucleotideAlphabet \| None`	Required when seqs is a Ragged OHE (uint8) array, to decode OHE → bytes.	`None`
`truncate_stop`	`bool`	When True, each output sequence is truncated at the first stop codon (inclusive). Only valid for Ragged input. Default False.	`False`

Returns:

Type	Description
`NDArray[bytes_] \| Ragged[bytes_] \| Ragged[uint8]`	Translated amino acid sequences in the same container type as the input.

Source code in python/seqpro/alphabets/_alphabets.py

def translate(
    self,
    seqs: StrSeqType | Ragged[np.bytes_] | Ragged[np.uint8],
    length_axis: int | None = None,
    *,
    nuc_alphabet: NucleotideAlphabet | None = None,
    truncate_stop: bool = False,
) -> NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]:
    """Translate nucleotide sequences to amino acids.

    Parameters
    ----------
    seqs
        Nucleotide sequences. Ragged inputs must have all lengths divisible by
        the codon size. For OHE Ragged (uint8), nuc_alphabet is required.
    length_axis
        Only used for non-Ragged array input. Defaults to the last axis when
        None.
    nuc_alphabet
        Required when seqs is a Ragged OHE (uint8) array, to decode OHE → bytes.
        Not read on any other path.
    truncate_stop
        When True, each output sequence is truncated at the first stop codon
        (inclusive). Only valid for Ragged input; the non-Ragged path returns
        without reading this flag. Default False.

    Returns
    -------
    NDArray[np.bytes_] | Ragged[np.bytes_] | Ragged[np.uint8]
        Translated amino acid sequences in the same container type as the input.

    Raises
    ------
    ValueError
        If a sequence length is not divisible by the codon length.
    ValueError
        If seqs is a Ragged OHE array and nuc_alphabet is None.
    """

    if not isinstance(seqs, Ragged):
        check_axes(seqs, length_axis, False)
        seqs = cast_seqs(seqs)
        codon_size = self.codon_array.shape[-1]
        if length_axis is None:
            length_axis = -1
        if seqs.shape[length_axis] % codon_size != 0:
            raise ValueError(
                "Sequence length is not evenly divisible by codon length."
            )
        if length_axis < 0:
            length_axis = seqs.ndim + length_axis
        # Every window of codon_size characters, then keep every codon_size-th
        # window so the kept windows do not overlap.
        # NOTE(review): the window is always taken over the last axis (axis=-1)
        # regardless of `length_axis`, while the divisibility check and
        # `codon_axis` use `length_axis`. These agree when the length axis is
        # the last one; confirm behaviour for other axes.
        codons = np.lib.stride_tricks.sliding_window_view(
            seqs, window_shape=codon_size, axis=-1
        )[..., ::codon_size, :]
        codon_axis = length_axis + 1
        return gufunc_translate(
            codons.view(np.uint8),
            self.codon_array.view(np.uint8),
            self.aa_array.view(np.uint8),
            axes=[codon_axis, (-2, -1), (-1), ()],  # type: ignore
        ).view("S1")

    # --- Ragged path ---
    # Pack to ListOffsetArray so .data and .offsets are contiguous and valid.
    seqs = Ragged(ak.to_packed(seqs))

    is_ohe = is_rag_dtype(seqs, np.uint8)

    codon_size = self.codon_array.shape[-1]
    lengths = seqs.lengths.ravel()
    offsets = seqs.offsets  # 1D (n+1,) after to_packed

    if (lengths % codon_size != 0).any():
        raise ValueError(
            "All Ragged sequence lengths must be divisible by codon length."
        )

    n = len(lengths)

    # Decode OHE → bytes if needed. seqs.data is (total, n_nuc) for OHE or (total,) for bytes.
    if is_ohe:
        if nuc_alphabet is None:
            raise ValueError("nuc_alphabet is required for OHE Ragged input.")
        nuc_bytes_flat: NDArray[np.bytes_] = nuc_alphabet.decode_ohe(  # type: ignore[union-attr]
            seqs.data, ohe_axis=-1
        )
    else:
        nuc_bytes_flat = seqs.data

    # Translate the entire flat buffer in one vectorized call. This is valid because
    # translation is invariant to splitting/concatenation when all lengths % codon_size == 0.
    total = nuc_bytes_flat.shape[0]
    if total > 0:
        codons = np.lib.stride_tricks.sliding_window_view(
            nuc_bytes_flat, codon_size, axis=0
        )[::codon_size, :]  # (total // codon_size, codon_size)
        translated_flat: NDArray[np.bytes_] = gufunc_translate(
            codons.view(np.uint8),
            self.codon_array.view(np.uint8),
            self.aa_array.view(np.uint8),
            axes=[1, (-2, -1), (-1), ()],  # type: ignore
        ).view("S1")  # (total // codon_size,)
    else:
        # Nothing to window over; skip the stride trick entirely.
        translated_flat = np.empty(0, dtype="S1")

    new_offsets = offsets // codon_size  # (n+1,) position-based in translated_flat

    if truncate_stop:
        starts = new_offsets[:-1].astype(np.int64)
        full_ends = new_offsets[1:].astype(np.int64)
        # "*" is the byte handed to the helper as the stop marker.
        ends = _nb_find_stop_ends(
            translated_flat.view(np.uint8), starts, full_ends, np.uint8(ord("*"))
        )
        out_offsets = np.stack(
            [starts, ends]
        )  # (2, n) — ListArray (non-contiguous view)
    else:
        out_offsets = new_offsets  # 1D — ListOffsetArray

    if is_ohe:
        # OHE in, OHE out: re-encode the translated bytes with this alphabet.
        n_aa = len(self.aa_array)
        ohe_flat = self.ohe(translated_flat).flatten()
        return Ragged.from_offsets(ohe_flat, (n, None, n_aa), out_offsets)
    else:
        return Ragged.from_offsets(translated_flat, (n, None), out_offsets)

`NucleotideAlphabet` ¶

Source code in python/seqpro/alphabets/_alphabets.py

class NucleotideAlphabet:
    """Nucleotide alphabet with its complement and derived lookup tables."""

    alphabet: str
    """Alphabet excluding ambiguous characters e.g. "N" for DNA."""
    # Complement string, position-aligned with `alphabet`.
    complement: str
    # S1 view over the ASCII-encoded alphabet.
    array: NDArray[np.bytes_]
    # Character -> complement character, as str and as bytes.
    complement_map: dict[str, str]
    complement_map_bytes: dict[bytes, bytes]
    # Translation tables usable with str.translate / bytes.translate.
    str_comp_table: dict[int, str]
    bytes_comp_table: bytes
    # S1 view over bytes_comp_table (one entry per possible byte value).
    bytes_comp_array: NDArray[np.bytes_]

    def __init__(self, alphabet: str, complement: str) -> None:
        """Parse and validate sequence alphabets.

        Nucleic acid alphabets must be complemented by being reversed (without the
        unknown character). For example, `reverse(ACGT) = complement(ACGT) = TGCA`.

        Parameters
        ----------
        alphabet
            For example, DNA could be 'ACGT'.
        complement
            Complement of the alphabet, to continue the example this would be 'TGCA'.

        Raises
        ------
        ValueError
            If either string has repeated characters, or the reversed alphabet
            is not equal to the complement.
        """
        self._validate(alphabet, complement)
        self.alphabet = alphabet
        self.complement = complement
        self.array = cast(
            NDArray[np.bytes_], np.frombuffer(self.alphabet.encode("ascii"), "|S1")
        )
        self.complement_map = dict(zip(list(self.alphabet), list(self.complement)))
        self.complement_map_bytes = {
            k.encode("ascii"): v.encode("ascii") for k, v in self.complement_map.items()
        }
        self.str_comp_table = str.maketrans(self.complement_map)
        self.bytes_comp_table = bytes.maketrans(
            self.alphabet.encode("ascii"), self.complement.encode("ascii")
        )
        self.bytes_comp_array = np.frombuffer(self.bytes_comp_table, "S1")

    def __len__(self) -> int:
        """Return the number of characters in the alphabet."""
        return len(self.alphabet)

    def _validate(self, alphabet: str, complement: str):
        """Check that both strings are duplicate-free and mutually reversed.

        Raises
        ------
        ValueError
            If `alphabet` or `complement` has repeated characters, or if the
            reversed alphabet is not exactly the complement.
        """
        if len(set(alphabet)) != len(alphabet):
            raise ValueError("Alphabet has repeated characters.")

        if len(set(complement)) != len(complement):
            raise ValueError("Complement has repeated characters.")

        # Compare whole strings: a pairwise zip() stops at the shorter input and
        # would let a complement of the wrong length through.
        if alphabet[::-1] != complement:
            raise ValueError("Reverse of alphabet does not yield the complement.")

    def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
        """One hot encode a nucleotide sequence.

        Parameters
        ----------
        seqs
            Nucleotide sequences; passed through `cast_seqs` before encoding.

        Returns
        -------
        NDArray[np.uint8]
            One-hot encoded sequences; last axis is alphabet size, second-to-last is
            sequence length.
        """
        _seqs = cast_seqs(seqs)
        return gufunc_ohe(_seqs.view(np.uint8), self.array.view(np.uint8))

    def decode_ohe(
        self,
        seqs: NDArray[np.uint8],
        ohe_axis: int,
        unknown_char: str = "N",
    ) -> NDArray[np.bytes_]:
        """Convert an OHE array to an S1 byte array.

        Parameters
        ----------
        seqs
            One-hot encoded nucleotide sequences.
        ohe_axis
            Axis of `seqs` holding the one-hot encoding; may be negative.
        unknown_char
            Single character to use for unknown values, by default "N"

        Returns
        -------
        NDArray[np.bytes_]
            S1 byte array of decoded characters; ohe_axis is removed from the shape.
        """
        idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

        # Normalize a negative axis so it can be used for shape slicing.
        if ohe_axis < 0:
            ohe_axis_idx = seqs.ndim + ohe_axis
        else:
            ohe_axis_idx = ohe_axis

        shape = *seqs.shape[:ohe_axis_idx], *seqs.shape[ohe_axis_idx + 1 :]

        # The unknown character sits one position past the last alphabet symbol
        # (presumably the index the gufunc emits for unknowns — confirm).
        _alphabet = np.concatenate([self.array, [unknown_char.encode("ascii")]])

        return _alphabet[idx].reshape(shape)

    def _complement_bytes(
        self, byte_arr: NDArray[np.bytes_], out: NDArray[np.bytes_] | None = None
    ) -> NDArray[np.bytes_]:
        """Complement an S1 array through the byte lookup table.

        `out`, when given, is forwarded to the gufunc as a uint8 view.
        """
        if out is None:
            _out = out
        else:
            _out = out.view(np.uint8)
        _out = gufunc_complement_bytes(
            byte_arr.view(np.uint8), self.bytes_comp_array.view(np.uint8), _out
        )
        return _out.view("S1")

    def _rev_comp_byte(
        self,
        byte_arr: NDArray[np.bytes_],
        length_axis: int,
        out: NDArray[np.bytes_] | None = None,
    ) -> NDArray[np.bytes_]:
        """Complement `byte_arr`, then reverse the result along `length_axis`."""
        out = self._complement_bytes(byte_arr, out)
        # NOTE(review): only the returned array is flipped; a caller-supplied
        # `out` buffer presumably still holds the un-reversed complement.
        # Confirm callers use the return value.
        return np.flip(out, length_axis)

    @overload
    def reverse_complement(
        self,
        seqs: StrSeqType,
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.bytes_] | None = None,
    ) -> NDArray[np.bytes_]: ...
    @overload
    def reverse_complement(
        self,
        seqs: NDArray[np.uint8],
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.uint8] | None = None,
    ) -> NDArray[np.uint8]: ...
    @overload
    def reverse_complement(
        self,
        seqs: SeqType,
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.bytes_ | np.uint8] | None = None,
    ) -> NDArray[np.bytes_ | np.uint8]: ...
    def reverse_complement(
        self,
        seqs: SeqType,
        length_axis: int | None = None,
        ohe_axis: int | None = None,
        out: NDArray[np.bytes_ | np.uint8] | None = None,
    ) -> NDArray[np.bytes_ | np.uint8]:
        """Reverse complement a sequence.

        Parameters
        ----------
        seqs
            Byte/string sequences, or a one-hot encoded uint8 array.
        length_axis
            Length axis, by default None. Byte input falls back to the last
            axis; OHE input must specify it.
        ohe_axis
            One hot encoding axis, by default None. Required for OHE input.
        out
            Optional output array whose dtype matches the input kind.

        Returns
        -------
        NDArray[np.bytes_ | np.uint8]
            Reverse-complemented sequences as S1 bytes or uint8 for OHE input.

        Raises
        ------
        ValueError
            If the cast sequences are neither bytes nor uint8.
        """
        check_axes(seqs, length_axis, ohe_axis)

        seqs_ = cast_seqs(seqs)

        if is_dtype(seqs_, np.bytes_):
            assert out is None or is_dtype(out, np.bytes_)
            if length_axis is None:
                length_axis = -1
            return self._rev_comp_byte(seqs_, length_axis, out)
        elif is_dtype(seqs_, np.uint8):  # OHE
            assert length_axis is not None
            assert ohe_axis is not None
            assert out is None or is_dtype(out, np.uint8)
            # Flipping both axes reverses the sequence and mirrors the one-hot
            # channels; with a reversed-complement alphabet that is the complement.
            out_ = np.flip(seqs_, axis=(length_axis, ohe_axis))
            if out is not None:
                out[:] = out_
                out_ = out
            return out_
        else:
            raise ValueError("Invalid sequence type.")

`alphabet = alphabet` `instance-attribute` ¶

Alphabet excluding ambiguous characters e.g. "N" for DNA.

`__init__(alphabet, complement)` ¶

Parse and validate sequence alphabets.

Nucleic acid alphabets must be complemented by being reversed (without the unknown character). For example, reverse(ACGT) = complement(ACGT) = TGCA.

Parameters:

Name	Type	Description	Default
`alphabet`	`str`	For example, DNA could be 'ACGT'.	required
`complement`	`str`	Complement of the alphabet, to continue the example this would be 'TGCA'.	required

Source code in python/seqpro/alphabets/_alphabets.py

def __init__(self, alphabet: str, complement: str) -> None:
    """Validate an alphabet/complement pair and build its lookup tables.

    Nucleic acid alphabets must be complemented by being reversed (without the
    unknown character). For example, `reverse(ACGT) = complement(ACGT) = TGCA`.

    Parameters
    ----------
    alphabet
        For example, DNA could be 'ACGT'.
    complement
        Complement of the alphabet, to continue the example this would be 'TGCA'.
    """
    self._validate(alphabet, complement)
    self.alphabet = alphabet
    self.complement = complement

    # S1 view over the ASCII-encoded alphabet.
    alphabet_ascii = alphabet.encode("ascii")
    self.array = cast(NDArray[np.bytes_], np.frombuffer(alphabet_ascii, "|S1"))

    # Character-level complement lookups, as str and as bytes.
    self.complement_map = dict(zip(alphabet, complement))
    self.complement_map_bytes = {
        src.encode("ascii"): dst.encode("ascii")
        for src, dst in self.complement_map.items()
    }

    # Translation tables for str.translate / bytes.translate.
    self.str_comp_table = str.maketrans(self.complement_map)
    complement_ascii = complement.encode("ascii")
    self.bytes_comp_table = bytes.maketrans(alphabet_ascii, complement_ascii)
    self.bytes_comp_array = np.frombuffer(self.bytes_comp_table, "S1")

`decode_ohe(seqs, ohe_axis, unknown_char='N')` ¶

Convert an OHE array to an S1 byte array.

Parameters:

Name	Type	Description	Default
`seqs`	`NDArray[uint8]`		required
`ohe_axis`	`int`		required
`unknown_char`	`str`	Single character to use for unknown values, by default "N"	`'N'`

Returns:

Type	Description
`NDArray[bytes_]`	S1 byte array of decoded characters; ohe_axis is removed from the shape.

Source code in python/seqpro/alphabets/_alphabets.py

def decode_ohe(
    self,
    seqs: NDArray[np.uint8],
    ohe_axis: int,
    unknown_char: str = "N",
) -> NDArray[np.bytes_]:
    """Decode one-hot encoded nucleotides back into single-byte characters.

    Parameters
    ----------
    seqs
        One-hot encoded sequences.
    ohe_axis
        Axis of `seqs` that holds the one-hot encoding; may be negative.
    unknown_char
        Single character to use for unknown values, by default "N"

    Returns
    -------
    NDArray[np.bytes_]
        S1 byte array of decoded characters; ohe_axis is removed from the shape.
    """
    char_idx = gufunc_ohe_char_idx(seqs, axis=ohe_axis)  # type: ignore

    # Resolve a negative axis to its non-negative position for shape slicing.
    axis = seqs.ndim + ohe_axis if ohe_axis < 0 else ohe_axis
    out_shape = seqs.shape[:axis] + seqs.shape[axis + 1 :]

    # Lookup table: the alphabet followed by the unknown character.
    lookup = np.concatenate([self.array, [unknown_char.encode("ascii")]])
    return lookup[char_idx].reshape(out_shape)

`ohe(seqs)` ¶

One hot encode a nucleotide sequence.

Parameters:

Name	Type	Description	Default
`seqs`	`StrSeqType`		required

Returns:

Type	Description
`NDArray[uint8]`	One-hot encoded sequences; last axis is alphabet size, second-to-last is sequence length.

Source code in python/seqpro/alphabets/_alphabets.py

def ohe(self, seqs: StrSeqType) -> NDArray[np.uint8]:
    """One-hot encode nucleotide sequences against this alphabet.

    Parameters
    ----------
    seqs
        Nucleotide sequences; passed through `cast_seqs` before encoding.

    Returns
    -------
    NDArray[np.uint8]
        One-hot encoded sequences; last axis is alphabet size, second-to-last is
        sequence length.
    """
    # Both operands are handed to the gufunc as raw uint8 views.
    seq_codes = cast_seqs(seqs).view(np.uint8)
    alphabet_codes = self.array.view(np.uint8)
    return gufunc_ohe(seq_codes, alphabet_codes)

`reverse_complement(seqs, length_axis=None, ohe_axis=None, out=None)` ¶

reverse_complement(seqs: StrSeqType, length_axis: int | None = None, ohe_axis: int | None = None, out: NDArray[np.bytes_] | None = None) -> NDArray[np.bytes_]

reverse_complement(seqs: NDArray[np.uint8], length_axis: int | None = None, ohe_axis: int | None = None, out: NDArray[np.uint8] | None = None) -> NDArray[np.uint8]

reverse_complement(seqs: SeqType, length_axis: int | None = None, ohe_axis: int | None = None, out: NDArray[np.bytes_ | np.uint8] | None = None) -> NDArray[np.bytes_ | np.uint8]

Reverse complement a sequence.

Parameters:

Name	Type	Description	Default
`seqs`	`SeqType`		required
`length_axis`	`int \| None`	Length axis, by default None	`None`
`ohe_axis`	`int \| None`	One hot encoding axis, by default None	`None`

Returns:

Type	Description
`NDArray[bytes_ \| uint8]`	Reverse-complemented sequences as S1 bytes or uint8 for OHE input.

Source code in python/seqpro/alphabets/_alphabets.py

def reverse_complement(
    self,
    seqs: SeqType,
    length_axis: int | None = None,
    ohe_axis: int | None = None,
    out: NDArray[np.bytes_ | np.uint8] | None = None,
) -> NDArray[np.bytes_ | np.uint8]:
    """Return the reverse complement of byte or one-hot encoded sequences.

    Parameters
    ----------
    seqs
        Byte/string sequences, or a one-hot encoded uint8 array.
    length_axis
        Length axis, by default None. Byte input falls back to the last axis;
        OHE input must specify it.
    ohe_axis
        One hot encoding axis, by default None. Required for OHE input.
    out
        Optional output array whose dtype matches the input kind.

    Returns
    -------
    NDArray[np.bytes_ | np.uint8]
        Reverse-complemented sequences as S1 bytes or uint8 for OHE input.

    Raises
    ------
    ValueError
        If the cast sequences are neither bytes nor uint8.
    """
    check_axes(seqs, length_axis, ohe_axis)
    arr = cast_seqs(seqs)

    # Byte sequences: complement through the lookup table, then reverse.
    if is_dtype(arr, np.bytes_):
        assert out is None or is_dtype(out, np.bytes_)
        axis = -1 if length_axis is None else length_axis
        return self._rev_comp_byte(arr, axis, out)

    if not is_dtype(arr, np.uint8):
        raise ValueError("Invalid sequence type.")

    # One-hot input: flip along both the length and the encoding axes.
    assert length_axis is not None
    assert ohe_axis is not None
    assert out is None or is_dtype(out, np.uint8)
    flipped = np.flip(arr, axis=(length_axis, ohe_axis))
    if out is None:
        return flipped
    out[:] = flipped
    return out

Alphabets¶

AA = AminoAlphabet(*(map(list, zip(*(canonical_codons_to_aas.items()))))) module-attribute ¶

DNA = NucleotideAlphabet('ACGT', 'TGCA') module-attribute ¶

RNA = NucleotideAlphabet(alphabet='ACGU', complement='UGCA') module-attribute ¶

AminoAlphabet ¶

__init__(codons, amino_acids) ¶

decode_ohe(seqs, ohe_axis, unknown_char='X') ¶

ohe(seqs) ¶

translate(seqs, length_axis=None, *, nuc_alphabet=None, truncate_stop=False) ¶

NucleotideAlphabet ¶

alphabet = alphabet instance-attribute ¶

__init__(alphabet, complement) ¶

decode_ohe(seqs, ohe_axis, unknown_char='N') ¶

ohe(seqs) ¶

reverse_complement(seqs, length_axis=None, ohe_axis=None, out=None) ¶

`AA = AminoAlphabet(*(map(list, zip(*(canonical_codons_to_aas.items())))))` `module-attribute` ¶

`DNA = NucleotideAlphabet('ACGT', 'TGCA')` `module-attribute` ¶

`RNA = NucleotideAlphabet(alphabet='ACGU', complement='UGCA')` `module-attribute` ¶

`AminoAlphabet` ¶

`__init__(codons, amino_acids)` ¶

`decode_ohe(seqs, ohe_axis, unknown_char='X')` ¶

`ohe(seqs)` ¶

`translate(seqs, length_axis=None, *, nuc_alphabet=None, truncate_stop=False)` ¶

`NucleotideAlphabet` ¶

`alphabet = alphabet` `instance-attribute` ¶

`__init__(alphabet, complement)` ¶

`decode_ohe(seqs, ohe_axis, unknown_char='N')` ¶

`ohe(seqs)` ¶

`reverse_complement(seqs, length_axis=None, ohe_axis=None, out=None)` ¶