Skip to content

API Reference

bin_coverage(coverage, bin_width, length_axis, normalize=False)

Bin coverage by summing over non-overlapping windows.

Parameters:

Name Type Description Default
coverage_array
required
bin_width int

Width of the windows to sum over. Must be an even divisor of the length of the coverage array. If not, raises an error.

required
length_axis int
required
normalize

Whether to normalize by the length of the bin.

False

Returns:

Type Description
NDArray[number]

Coverage summed into bins of width bin_width along length_axis.

Source code in python/seqpro/_modifiers.py
def bin_coverage(
    coverage: NDArray[np.number],
    bin_width: int,
    length_axis: int,
    normalize=False,
) -> NDArray[np.number]:
    """Bin coverage by summing over non-overlapping windows.

    Parameters
    ----------
    coverage_array
    bin_width
        Width of the windows to sum over. Must be an even divisor of the length
        of the coverage array. If not, raises an error.
    length_axis
    normalize
        Whether to normalize by the length of the bin.

    Returns
    -------
    NDArray[np.number]
        Coverage summed into bins of width `bin_width` along `length_axis`.
    """
    length = coverage.shape[length_axis]
    if length % bin_width != 0:
        raise ValueError("Bin width must evenly divide length.")
    binned_coverage = np.add.reduceat(
        coverage, np.arange(0, length, bin_width), axis=length_axis
    )
    if normalize:
        binned_coverage /= bin_width
    return binned_coverage

cast_seqs(seqs)

cast_seqs(seqs: NDArray[np.uint8]) -> NDArray[np.uint8]
cast_seqs(seqs: StrSeqType) -> NDArray[np.bytes_]
cast_seqs(seqs: SeqType) -> NDArray[np.bytes_ | np.uint8]

Cast any sequence type to be a NumPy array of ASCII characters (or left alone as 8-bit unsigned integers if the input is OHE).

Parameters:

Name Type Description Default
seqs SeqType
required

Returns:

Type Description
NDArray[bytes_ | uint8]

S1 byte array, or unchanged uint8 array if input is OHE.

Source code in python/seqpro/_utils.py
def cast_seqs(seqs: SeqType) -> NDArray[np.bytes_ | np.uint8]:
    """Cast any sequence type to be a NumPy array of ASCII characters (or left alone as
    8-bit unsigned integers if the input is OHE).

    Parameters
    ----------
    seqs

    Returns
    -------
    NDArray[np.bytes_ | np.uint8]
        S1 byte array, or unchanged uint8 array if input is OHE.
    """
    if isinstance(seqs, str):
        if len(seqs) == 0:
            raise ValueError("Empty string cannot be cast to a sequence array.")
        return np.array([seqs], "S").view("S1")
    elif isinstance(seqs, bytes):
        return np.array([seqs]).view("S1")
    elif isinstance(seqs, Sequence):
        return np.array(seqs, "S")[..., None].view("S1")
    elif seqs.dtype.itemsize > 1:  # dtype == U or bigger than S1
        return seqs.astype("S")[..., None].view("S1")
    else:
        return cast(NDArray[np.bytes_ | np.uint8], seqs)

decode_ohe(seqs, ohe_axis, alphabet, unknown_char='N')

Convert an OHE array to an S1 byte array.

Parameters:

Name Type Description Default
seqs NDArray[uint8]
required
ohe_axis int
required
alphabet NucleotideAlphabet | AminoAlphabet
required
unknown_char str

Single character to use for unknown values, by default "N"

'N'

Returns:

Type Description
NDArray[bytes_]

S1 byte array of decoded characters.

Source code in python/seqpro/_encoders.py
def decode_ohe(
    seqs: NDArray[np.uint8],
    ohe_axis: int,
    alphabet: NucleotideAlphabet | AminoAlphabet,
    unknown_char: str = "N",
) -> NDArray[np.bytes_]:
    """Convert an OHE array to an S1 byte array.

    Parameters
    ----------
    seqs
    ohe_axis
    alphabet
    unknown_char
        Single character to use for unknown values, by default "N"

    Returns
    -------
    NDArray[np.bytes_]
        S1 byte array of decoded characters.
    """
    return alphabet.decode_ohe(seqs=seqs, ohe_axis=ohe_axis, unknown_char=unknown_char)

decode_tokens(seqs, token_map, unknown_char='N')

Untokenize nucleotides. Replaces each token/index with its corresponding nucleotide in the alphabet.

Parameters:

Name Type Description Default
ids
required
alphabet
required
tokens

List of tokens to use for each nucleotide, by default None

required
unknown_char str

Character to replace unknown tokens with, by default 'N'

'N'

Returns:

Type Description
NDArray[bytes_]

S1 byte array of decoded characters with the same shape as the input.

Source code in python/seqpro/_encoders.py
def decode_tokens(
    seqs: NDArray[np.int32],
    token_map: dict[str, int],
    unknown_char: str = "N",
) -> NDArray[np.bytes_]:
    """Untokenize nucleotides. Replaces each token/index with its corresponding
    nucleotide in the alphabet.


    Parameters
    ----------
    ids
    alphabet
    tokens
        List of tokens to use for each nucleotide, by default None
    unknown_char
        Character to replace unknown tokens with, by default 'N'


    Returns
    -------
    NDArray[np.bytes_]
        S1 byte array of decoded characters with the same shape as the input.
    """
    target = np.array([c.encode("ascii") for c in token_map]).view(np.uint8)
    source = np.array(list(token_map.values()), dtype=np.int32)
    _unk_char = np.uint8(ord(unknown_char))
    _seqs = gufunc_tokenize(seqs, source, target, _unk_char).view("S1")
    return _seqs

gc_content(seqs, normalize=True, length_axis=None, alphabet=None, ohe_axis=None)

Compute the number or proportion of G & C nucleotides.

Parameters:

Name Type Description Default
seqs SeqType
required
normalize

True => return proportions False => return counts

True
length_axis int | None

Needed if seqs is an array.

None
alphabet NucleotideAlphabet | None

Needed if seqs is OHE.

None
ohe_axis int | None

Needed if seqs is OHE.

None

Returns:

Type Description
NDArray[integer | float64]

Integers if unnormalized, otherwise floats.

Source code in python/seqpro/_analyzers.py
def gc_content(
    seqs: SeqType,
    normalize=True,
    length_axis: int | None = None,
    alphabet: NucleotideAlphabet | None = None,
    ohe_axis: int | None = None,
) -> NDArray[np.integer | np.float64]:
    """Compute the number or proportion of G & C nucleotides.

    Parameters
    ----------
    seqs
    normalize
        True => return proportions
        False => return counts
    length_axis
        Needed if seqs is an array.
    alphabet
        Needed if seqs is OHE.
    ohe_axis
        Needed if seqs is OHE.

    Returns
    -------
    NDArray[np.integer | np.float64]
        Integers if unnormalized, otherwise floats.
    """
    check_axes(seqs, length_axis, ohe_axis)

    seqs = cast_seqs(seqs)

    if length_axis is None:  # length axis after casting strings
        length_axis = seqs.ndim - 1
    elif length_axis < 0:
        length_axis = seqs.ndim + length_axis

    if seqs.dtype == np.uint8:  # OHE
        if alphabet is None:
            raise ValueError("Need an alphabet to analyze OHE sequences.")
        assert ohe_axis is not None

        gc_idx = np.flatnonzero(np.isin(alphabet.array, np.array([b"C", b"G"])))
        gc_content = cast(
            NDArray[np.integer],
            np.take(seqs, gc_idx, ohe_axis).sum((length_axis, ohe_axis)),
        )
    else:
        gc_content = cast(
            NDArray[np.integer], np.isin(seqs, np.array([b"C", b"G"])).sum(length_axis)
        )

    if normalize:
        gc_content = gc_content / seqs.shape[length_axis]

    return gc_content

jitter(*arrays, max_jitter, length_axis, jitter_axes, seed=None)

Randomly jitter data from arrays, using the same jitter across arrays.

Parameters:

Name Type Description Default
*arrays NDArray[DTYPE]

Arrays to be jittered. They must have the same sized jitter and length axes.

()
max_jitter int

Maximum jitter amount.

required
length_axis int
required
jitter_axes int | tuple[int, ...]

Each slice along the jitter axes will be randomly jittered independently. Thus, if jitter_axes = 0, then every slice of data along axis 0 would be jittered independently. If jitter_axes = (0, 1), then each slice along axes 0 and 1 would be randomly jittered independently.

required
seed int | Generator | None

Random seed or generator, by default None

None

Returns:

Type Description
tuple[NDArray[DTYPE], ...]

Jittered arrays. Each will have a new length equal to length - 2*max_jitter.

Raises:

Type Description
ValueError

If any arrays have insufficient length to be jittered.

Source code in python/seqpro/_modifiers.py
def jitter(
    *arrays: NDArray[DTYPE],
    max_jitter: int,
    length_axis: int,
    jitter_axes: int | tuple[int, ...],
    seed: int | np.random.Generator | None = None,
) -> tuple[NDArray[DTYPE], ...]:
    """Randomly jitter data from arrays, using the same jitter across arrays.

    Parameters
    ----------
    *arrays
        Arrays to be jittered. They must have the same sized jitter and length
        axes.
    max_jitter
        Maximum jitter amount.
    length_axis
    jitter_axes
        Each slice along the jitter axes will be randomly jittered *independently*.
        Thus, if jitter_axes = 0, then every slice of data along axis 0 would be
        jittered independently. If jitter_axes = (0, 1), then each slice along axes 0
        and 1 would be randomly jittered independently.
    seed
        Random seed or generator, by default None

    Returns
    -------
    tuple[NDArray[DTYPE], ...]
        Jittered arrays. Each will have a new length equal to length - 2*max_jitter.

    Raises
    ------
    ValueError
        If any arrays have insufficient length to be jittered.
    """
    if isinstance(jitter_axes, int):
        jitter_axes = (jitter_axes,)

    # move jitter axes and length axis to back such that shape = (..., jitter, length)
    arrays, destination_axes = _align_axes(*arrays, axes=(*jitter_axes, length_axis))
    short_arrays = []
    for i, arr in enumerate(arrays):
        if arr.shape[-1] - 2 * max_jitter <= 0:
            short_arrays.append(i)
    if short_arrays:
        raise ValueError(
            f"Arrays {short_arrays} have insufficient length to be jittered with max_jitter={max_jitter}."
        )

    jitter_axes_shape = arrays[0].shape[-len(jitter_axes) - 1 : -1]
    if seed is None or isinstance(seed, int):
        rng = np.random.default_rng(seed)
    else:
        rng = seed
    starts = rng.integers(0, 2 * max_jitter + 1, jitter_axes_shape)

    sliced_arrs: list[NDArray] = []
    for arr in arrays:
        jittered_length = arr.shape[-1] - 2 * max_jitter
        sliced = _slice_kmers(arr, starts, jittered_length)
        sliced = np.moveaxis(sliced, destination_axes, [*jitter_axes, length_axis])
        sliced_arrs.append(sliced)

    return tuple(sliced_arrs)

k_shuffle(seqs, k, alphabet, *, length_axis=None, ohe_axis=None, seed=None)

Shuffle sequences while preserving k-let frequencies.

Parameters:

Name Type Description Default
seqs SeqType
required
k int

Size of k-lets to preserve frequencies of.

required
alphabet NucleotideAlphabet

Alphabet, needed for OHE sequence input.

required
length_axis int | None

Needed for array input. Axis that corresponds to the length of sequences.

None
ohe_axes

Needed for OHE input. Axis that corresponds to the one hot encoding, should be the same size as the length of the alphabet.

required
seed int | Generator | None

Seed or generator for shuffling.

None

Returns:

Type Description
NDArray[bytes_ | uint8]

Shuffled sequences as bytes (S1) or uint8 for string or OHE input, respectively.

Source code in python/seqpro/_modifiers.py
def k_shuffle(
    seqs: SeqType,
    k: int,
    alphabet: NucleotideAlphabet,
    *,
    length_axis: int | None = None,
    ohe_axis: int | None = None,
    seed: int | np.random.Generator | None = None,
) -> NDArray[np.bytes_ | np.uint8]:
    """Shuffle sequences while preserving k-let frequencies.

    Parameters
    ----------
    seqs
    k
        Size of k-lets to preserve frequencies of.
    alphabet
        Alphabet, needed for OHE sequence input.
    length_axis
        Needed for array input. Axis that corresponds to the length of sequences.
    ohe_axes
        Needed for OHE input. Axis that corresponds to the one hot encoding, should be
        the same size as the length of the alphabet.
    seed
        Seed or generator for shuffling.

    Returns
    -------
    NDArray[np.bytes_ | np.uint8]
        Shuffled sequences as bytes (S1) or uint8 for string or OHE input, respectively.
    """

    check_axes(seqs, length_axis, ohe_axis)

    if isinstance(seed, np.random.Generator):
        seed = seed.integers(0, np.iinfo(np.int32).max)  # type: ignore

    seqs = cast_seqs(seqs)

    # only get here if seqs was str or list[str]
    if length_axis is None:
        length_axis = seqs.ndim - 1

    if seqs.dtype == np.uint8:
        assert ohe_axis is not None
        seqs = cast(NDArray[np.uint8], seqs)
        ohe = True
        seqs = alphabet.decode_ohe(seqs, ohe_axis=ohe_axis)
    else:
        ohe = False

    seqs = np.moveaxis(seqs, length_axis, -1)  # length must be final

    shuffled = _k_shuffle(seqs.view("u1"), k, len(alphabet), seed).view("S1")

    shuffled = np.moveaxis(shuffled, -1, length_axis)  # put length back where it was

    if ohe:
        assert ohe_axis is not None
        assert alphabet is not None
        shuffled = cast(NDArray[np.bytes_], shuffled)
        shuffled = alphabet.ohe(shuffled).swapaxes(-1, ohe_axis)

    return shuffled

length(seqs)

Calculate the length of each sequence.

Parameters:

Name Type Description Default
seqs str | list[str]

List of sequences.

required

Returns:

Type Description
NDArray[integer]

Array containing the length of each sequence.

Source code in python/seqpro/_analyzers.py
def length(seqs: str | list[str]) -> NDArray[np.integer]:
    """Calculate the length of each sequence.

    Parameters
    ----------
    seqs
        List of sequences.

    Returns
    -------
    NDArray[np.integer]
        Array containing the length of each sequence.
    """
    _seqs = cast_seqs(seqs)
    return (_seqs != b"").sum(-1)

nucleotide_content(seqs, normalize=True, length_axis=None, alphabet=None)

Compute the number or proportion of each nucleotide.

Parameters:

Name Type Description Default
seqs SeqType
required
normalize

True => return proportions False => return counts

True
length_axis int | None

Needed if seqs is an array.

None

Returns:

Type Description
NDArray[integer | floating]

Integers if unnormalized, otherwise floats.

Source code in python/seqpro/_analyzers.py
def nucleotide_content(
    seqs: SeqType,
    normalize=True,
    length_axis: int | None = None,
    alphabet: NucleotideAlphabet | None = None,
) -> NDArray[np.integer | np.floating]:
    """Compute the number or proportion of each nucleotide.

    Parameters
    ----------
    seqs
    normalize
        True => return proportions
        False => return counts
    length_axis
        Needed if seqs is an array.

    Returns
    -------
    NDArray[np.integer | np.floating]
        Integers if unnormalized, otherwise floats.
    """
    check_axes(seqs, length_axis, False)

    seqs = cast_seqs(seqs)

    if length_axis is None:
        length_axis = seqs.ndim - 1
    elif length_axis < 0:
        length_axis = seqs.ndim + length_axis

    if seqs.dtype == np.uint8:  # OHE
        nuc_content = cast(NDArray[np.integer], seqs.sum(length_axis))
    else:
        if alphabet is None:
            raise ValueError("Need an alphabet to analyze string nucleotide content.")
        nuc_content = np.zeros(
            (*seqs.shape[:length_axis], *seqs.shape[length_axis + 1 :], len(alphabet)),
            dtype=np.int64,
        )
        for i, nuc in enumerate(alphabet.array):
            nuc_content[..., i] = (seqs == nuc).sum(length_axis)

    if normalize:
        nuc_content = nuc_content / seqs.shape[length_axis]

    return nuc_content

ohe(seqs, alphabet)

One hot encode a nucleotide sequence.

Parameters:

Name Type Description Default
seqs StrSeqType
required
alphabet NucleotideAlphabet | AminoAlphabet
required

Returns:

Type Description
NDArray[uint8]

One-hot encoded sequences with shape (..., length, alphabet_size).

Source code in python/seqpro/_encoders.py
def ohe(
    seqs: StrSeqType, alphabet: NucleotideAlphabet | AminoAlphabet
) -> NDArray[np.uint8]:
    """One hot encode a nucleotide sequence.

    Parameters
    ----------
    seqs
    alphabet

    Returns
    -------
    NDArray[np.uint8]
        One-hot encoded sequences with shape (..., length, alphabet_size).
    """
    return alphabet.ohe(seqs)

pad_seqs(seqs, pad, pad_value=None, length=None, length_axis=None)

Pad (or truncate) sequences on either the left, right, or both sides.

Parameters:

Name Type Description Default
seqs SeqType
required
pad Literal['left', 'both', 'right']

How to pad. If padding on both sides and an odd amount of padding is needed, 1 more pad value will be on the right side. Similarly for truncating, if an odd amount length needs to be truncated, 1 more character will be truncated from the right side.

required
pad_val

Single character to pad sequences with. Needed for string input. Ignored for OHE sequences.

required
length int | None

Needed for character or OHE array input. Length to pad or truncate sequences to. If not given, uses the length of longest sequence.

None
length_axis int | None

Needed for array input.

None

Returns:

Type Description
NDArray[bytes_ | uint8]

Padded (or truncated) sequences as S1 bytes or uint8 for OHE input.

Source code in python/seqpro/_encoders.py
def pad_seqs(
    seqs: SeqType,
    pad: Literal["left", "both", "right"],
    pad_value: str | None = None,
    length: int | None = None,
    length_axis: int | None = None,
) -> NDArray[np.bytes_ | np.uint8]:
    """Pad (or truncate) sequences on either the left, right, or both sides.

    Parameters
    ----------
    seqs
    pad
        How to pad. If padding on both sides and an odd amount of padding is needed, 1
        more pad value will be on the right side. Similarly for truncating, if an odd
        amount length needs to be truncated, 1 more character will be truncated from the
        right side.
    pad_val
        Single character to pad sequences with. Needed for string input. Ignored for OHE
        sequences.
    length
        Needed for character or OHE array input. Length to pad or truncate sequences to.
        If not given, uses the length of longest sequence.
    length_axis
        Needed for array input.

    Returns
    -------
    NDArray[np.bytes_ | np.uint8]
        Padded (or truncated) sequences as S1 bytes or uint8 for OHE input.
    """
    check_axes(seqs, length_axis, False)

    string_input = (
        isinstance(seqs, (str, list))
        or (isinstance(seqs, np.ndarray) and seqs.dtype.kind == "U")
        or (isinstance(seqs, np.ndarray) and seqs.dtype.type == np.object_)
    )

    seqs = cast_seqs(seqs)

    if length_axis is None:
        length_axis = seqs.ndim - 1

    if string_input:
        if pad_value is None:
            raise ValueError("Need a pad value for plain string input.")

        if length is not None:
            seqs = seqs[..., :length]

        seqs = seqs.view(np.uint8)

        if pad == "left":
            seqs = gufunc_pad_left(seqs)
        elif pad == "both":
            seqs = gufunc_pad_both(seqs)

        # convert empty character '' to pad_val
        seqs[seqs == 0] = ord(pad_value)

        seqs = cast(NDArray[np.bytes_], seqs.view("S1"))
    else:
        if length is None:
            raise ValueError("Need a length for array input.")

        length_diff = seqs.shape[length_axis] - length

        if length_diff == 0:
            return seqs
        elif length_diff > 0:  # longer than needed, truncate
            if pad == "left":
                seqs = array_slice(seqs, length_axis, slice(-length))
            elif pad == "both":
                seqs = array_slice(
                    seqs, length_axis, slice(length_diff // 2, -length_diff // 2)
                )
            else:
                seqs = array_slice(seqs, length_axis, slice(None, length))
        else:  # shorter than needed, pad
            pad_arr_shape = (
                *seqs.shape[:length_axis],
                -length_diff,
                *seqs.shape[length_axis + 1 :],
            )
            if seqs.dtype == np.uint8:
                pad_arr = np.zeros(pad_arr_shape, np.uint8)
            else:
                if pad_value is None:
                    raise ValueError("Need a pad value for byte array input.")
                pad_arr = np.full(pad_arr_shape, pad_value.encode("ascii"), dtype="S1")
            seqs = np.concatenate([seqs, pad_arr], axis=length_axis)

    return seqs

random_seqs(shape, alphabet, seed=None)

Generate random nucleotide sequences.

Parameters:

Name Type Description Default
shape int | tuple[int, ...]

Shape of sequences to generate

required
alphabet NucleotideAlphabet

Alphabet to sample nucleotides from.

required
seed int | Generator | None

Random seed or generator.

None

Returns:

Type Description
NDArray[bytes_]

Randomly generated sequences of shape shape with S1 dtype.

Source code in python/seqpro/_modifiers.py
def random_seqs(
    shape: int | tuple[int, ...],
    alphabet: NucleotideAlphabet,
    seed: int | np.random.Generator | None = None,
) -> NDArray[np.bytes_]:
    """Generate random nucleotide sequences.

    Parameters
    ----------
    shape
        Shape of sequences to generate
    alphabet
        Alphabet to sample nucleotides from.
    seed
        Random seed or generator.

    Returns
    -------
    NDArray[np.bytes_]
        Randomly generated sequences of shape `shape` with S1 dtype.
    """
    if isinstance(seed, int) or seed is None:
        seed = np.random.default_rng(seed)
    return seed.choice(alphabet.array, size=shape)

reverse_complement(seqs, alphabet, length_axis=None, ohe_axis=None)

Reverse complement a sequence.

Parameters:

Name Type Description Default
seqs SeqType
required
alphabet NucleotideAlphabet
required
length_axis int | None

Needed for array input. Length axis, by default None

None
ohe_axis int | None

Needed for OHE input. One hot encoding axis, by default None

None

Returns:

Type Description
NDArray[bytes_ | uint8]

Reverse-complemented sequences as S1 bytes or uint8 for OHE input.

Source code in python/seqpro/_modifiers.py
def reverse_complement(
    seqs: SeqType,
    alphabet: NucleotideAlphabet,
    length_axis: int | None = None,
    ohe_axis: int | None = None,
) -> NDArray[np.bytes_ | np.uint8]:
    """Reverse complement a sequence.

    Parameters
    ----------
    seqs
    alphabet
    length_axis
        Needed for array input. Length axis, by default None
    ohe_axis
        Needed for OHE input. One hot encoding axis, by default None

    Returns
    -------
    NDArray[np.bytes_ | np.uint8]
        Reverse-complemented sequences as S1 bytes or uint8 for OHE input.
    """
    return alphabet.reverse_complement(seqs, length_axis, ohe_axis)

tokenize(seqs, token_map, unknown_token, out=None)

Tokenize nucleotides. Replaces each nucleotide with its corresponding token, if provided. Otherwise, uses each nucleotide's index in the alphabet. Nucleotides not in the alphabet or list of tokens are replaced with -1.

Parameters:

Name Type Description Default
seqs StrSeqType

Sequences to tokenize.

required
token_map dict[str, int]

Mapping of nucleotides to tokens.

required
unknown_token int

Token to use for unknown values.

required
out NDArray[int32] | None

Output array to store the result in. If not provided, a new array is created.

None

Returns:

Type Description
NDArray[int32]

Integer token IDs with the same shape as the input sequences.

Source code in python/seqpro/_encoders.py
def tokenize(
    seqs: StrSeqType,
    token_map: dict[str, int],
    unknown_token: int,
    out: NDArray[np.int32] | None = None,
) -> NDArray[np.int32]:
    """Tokenize nucleotides. Replaces each nucleotide with its corresponding token, if provided. Otherwise, uses each
    nucleotide's index in the alphabet. Nucleotides not in the alphabet or list of tokens are replaced with -1.

    Parameters
    ----------
    seqs
        Sequences to tokenize.
    token_map
        Mapping of nucleotides to tokens.
    unknown_token
        Token to use for unknown values.
    out
        Output array to store the result in. If not provided, a new array is created.

    Returns
    -------
    NDArray[np.int32]
        Integer token IDs with the same shape as the input sequences.
    """
    seqs = cast_seqs(seqs)
    source = np.array([c.encode("ascii") for c in token_map]).view(np.uint8)
    target = np.array(list(token_map.values()), dtype=np.int32)
    _unknown_token = np.int32(unknown_token)
    return gufunc_tokenize(seqs.view(np.uint8), source, target, _unknown_token, out)