Skip to content

BED

from_pyr(pyr)

Convert a PyRanges object to a BED-like DataFrame.

Parameters:

Name Type Description Default
pyr PyRanges

PyRanges object with at least the columns "Chromosome", "Start", and "End".

required

Returns:

Type Description
DataFrame

BED-like DataFrame with columns renamed to "chrom", "chromStart", and "chromEnd".

Source code in python/seqpro/bed.py
def from_pyr(pyr: pr.PyRanges) -> pl.DataFrame:
    """Build a BED-like polars DataFrame from a PyRanges object.

    Parameters
    ----------
    pyr
        PyRanges object with at least the columns "Chromosome", "Start", and "End".
        A "Strand" column is renamed as well when it is present.

    Returns
    -------
    pl.DataFrame
        BED-like DataFrame whose coordinate columns are named "chrom", "chromStart",
        and "chromEnd" (plus "strand" if the input had "Strand").
    """
    pyranges_to_bed = {
        "Chromosome": "chrom",
        "Start": "chromStart",
        "End": "chromEnd",
        "Strand": "strand",
    }

    frame = pl.from_pandas(pyr.df)
    # strict=False lets the rename skip names that are absent from the frame
    # (e.g. unstranded ranges without a "Strand" column).
    renamed = frame.rename(pyranges_to_bed, strict=False)

    # pyranges stores these as categoricals; hand them back as plain strings.
    # The regex selects whichever of the two columns actually exist.
    as_text = pl.col(r"^(chrom|strand)$").cast(pl.Utf8)
    return renamed.with_columns(as_text)

read(path)

Reads a bed-like (BED3+) file as a polars DataFrame. The file type is inferred from the file extension and supports .bed, .narrowPeak, and .broadPeak.

Parameters:

Name Type Description Default
path PathLike

Path to the bed-like file.

required

Returns:

Type Description
DataFrame

BED-like DataFrame with typed columns and zero-based coordinate metadata.

Source code in python/seqpro/bed.py
def read(path: PathLike) -> pl.DataFrame:
    """Read a bed-like (BED3+) file as a polars DataFrame.

    The file type is inferred from the file extension and supports .bed,
    .narrowPeak, and .broadPeak. Every suffix of the file name is inspected, so
    names with an additional trailing suffix (e.g. ``peaks.bed.gz``) are
    recognized too.

    Parameters
    ----------
    path
        Path to the bed-like file.

    Returns
    -------
    pl.DataFrame
        BED-like DataFrame with typed columns and zero-based coordinate metadata.

    Raises
    ------
    ValueError
        If none of the path's suffixes is .bed, .narrowPeak, or .broadPeak.
    """
    path = Path(path)
    suffixes = path.suffixes

    if ".bed" in suffixes:
        result = _read_bed(path)
    elif ".narrowPeak" in suffixes:
        result = _read_narrowpeak(path)
    elif ".broadPeak" in suffixes:
        result = _read_broadpeak(path)
    else:
        extension = "".join(suffixes)
        # Built from adjacent single-line literals so the message does not pick
        # up a newline and source indentation, as a triple-quoted string would.
        raise ValueError(
            f"Unrecognized file extension: {extension!r}. Expected one of "
            ".bed, .narrowPeak, or .broadPeak (potentially gzipped)."
        )

    # Tag the frame as zero-based on its config_meta metadata.
    result.config_meta.set(coordinate_system_zero_based=True)  # type: ignore[attr-defined]
    return result

set_schema(bed, to, from_=None)

Rename coordinate columns to match a target schema.

Parameters:

Name Type Description Default
bed IntoFrameT

A polars or pandas DataFrame with genomic coordinate columns.

required
to SchemaLike

Target schema: a shorthand string ("bed", "pb", "pr", "gtf") or a tuple of column names (chrom, start, end[, strand]).

required
from_ SchemaLike | None

Source schema hint. Auto-detected if not provided.

None
Source code in python/seqpro/_coords.py
def set_schema(
    bed: IntoFrameT, to: SchemaLike, from_: SchemaLike | None = None
) -> IntoFrameT:
    """Rename the coordinate columns of a frame so they follow a target schema.

    Parameters
    ----------
    bed
        A polars or pandas DataFrame with genomic coordinate columns.
    to
        Target schema: a shorthand string ("bed", "pb", "pr", "gtf") or
        a tuple of column names (chrom, start, end[, strand]).
    from_
        Source schema hint. Auto-detected if not provided.

    Returns
    -------
    IntoFrameT
        Native frame with the coordinate columns renamed. Columns that are
        already named as in the target, or that are missing from the frame,
        are left alone. When the native result is a polars DataFrame its
        ``coordinate_system_zero_based`` metadata is set from the target schema.
    """
    frame = nw.from_native(bed)

    source = detect_schema(frame, hint=from_)
    target = _resolve_schema(to)

    # (current name, wanted name) for every coordinate column to consider.
    pairs = [
        (source.chrom, target.chrom),
        (source.start, target.start),
        (source.end, target.end),
    ]
    # Strand is optional: only map it when both schemas define one.
    if source.strand is not None and target.strand is not None:
        pairs.append((source.strand, target.strand))

    present = frame.columns
    renames: dict[str, str] = {
        old: new for old, new in pairs if old != new and old in present
    }

    renamed = nw.to_native(frame.rename(renames))

    # The metadata is only recorded for polars frames; other backends are
    # returned without it.
    if isinstance(renamed, pl.DataFrame):
        renamed.config_meta.set(coordinate_system_zero_based=target.zero_based)  # type: ignore[attr-defined]

    return renamed

sort(bed)

Sort a BED-like DataFrame by chromosome, start, and end position, using the natural order of chromosome names e.g. 1, 2, ..., 10, ...

Parameters:

Name Type Description Default
bed FrameT

DataFrame with BED-format columns: "chrom", "chromStart", "chromEnd". Accepts polars or pandas DataFrames.

required

Returns:

Type Description
FrameT

Sorted DataFrame of the same type as the input.

Source code in python/seqpro/bed.py
def sort(bed: FrameT) -> FrameT:
    """Order a BED-like DataFrame by chromosome, then start, then end.

    Chromosome names are compared in natural order (1, 2, ..., 10, ...) rather
    than lexicographically.

    Parameters
    ----------
    bed
        DataFrame with BED-format columns: "chrom", "chromStart", "chromEnd".
        Accepts polars or pandas DataFrames.

    Returns
    -------
    FrameT
        Sorted DataFrame of the same type as the input.
    """
    sort_key = "_seqpro_chrom_sort_key_"

    frame = nw.from_native(bed)

    # Natural ordering of the chromosome names that actually occur.
    chrom_order = natsorted(frame["chrom"].unique().to_list())

    # An Enum built from that ordering makes the temporary key column sort
    # chromosomes in natural order instead of by string comparison.
    chrom_rank = nw.col("chrom").cast(nw.Enum(chrom_order)).alias(sort_key)

    keyed = frame.with_columns(chrom_rank)
    ordered = keyed.sort(sort_key, "chromStart", "chromEnd")
    return nw.to_native(ordered.drop(sort_key))

to_pyr(bedlike)

Convert a BED-like DataFrame to a PyRanges object.

Warning

PyRanges automatically sorts the DataFrame by chromosome and start position, so the order of the regions may change after conversion. You can keep track of the original order by adding an index column before converting to a PyRanges object. After converting back to a DataFrame, you can sort the DataFrame by the index to get the original order.

Parameters:

Name Type Description Default
bedlike

BED-like DataFrame (polars or pandas) with at least the columns "chrom", "chromStart", and "chromEnd".

required

Returns:

Type Description
PyRanges

PyRanges object with columns renamed to "Chromosome", "Start", and "End".

Source code in python/seqpro/bed.py
def to_pyr(bedlike) -> pr.PyRanges:
    """Build a PyRanges object from a BED-like DataFrame.

    !!! warning
        PyRanges automatically sorts the DataFrame by chromosome and start position, so the
        order of the regions may change after conversion. To recover the original order,
        [add an index column](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.with_row_index.html)
        before converting to a PyRanges object, and sort by that index after converting
        back to a DataFrame.

    Parameters
    ----------
    bedlike
        BED-like DataFrame (polars or pandas) with at least the columns "chrom", "chromStart", and "chromEnd".
        A "strand" column is renamed as well when it is present.

    Returns
    -------
    pr.PyRanges
        PyRanges object with columns renamed to "Chromosome", "Start", and "End"
        (and "Strand" if the input had "strand").
    """
    bed_to_pyranges = {
        "chrom": "Chromosome",
        "chromStart": "Start",
        "chromEnd": "End",
        "strand": "Strand",
    }

    # PyRanges is constructed from a pandas frame; eager_only=True restricts
    # the input to eager (already materialized) frames.
    pandas_frame = nw.from_native(bedlike, eager_only=True).to_pandas()
    return pr.PyRanges(pandas_frame.rename(columns=bed_to_pyranges))

with_len(bed, length)

Set the length of regions in a BED-like DataFrame to a fixed length by expanding or shrinking relative to the center (or peak) of the window. If the original region size + length is odd, the center will be 1 position closer to the right end.

Parameters:

Name Type Description Default
bed FrameT

BED-like DataFrame with at least the columns "chromStart" and "chromEnd".

required
length int

Desired length of the windows. Must be non-negative.

required

Returns:

Type Description
FrameT

DataFrame of the same type as the input with updated "chromStart" and "chromEnd" columns.

Source code in python/seqpro/bed.py
def with_len(bed: FrameT, length: int) -> FrameT:
    """Resize every region of a BED-like DataFrame to a fixed length.

    Regions are expanded or shrunk around their center. If the frame has a
    "peak" column, rows with a non-null peak are centered on
    ``chromStart + peak`` instead; rows with a null peak fall back to the
    midpoint of the region. When the exact boundaries would land on a half
    position they are rounded down by the floor division, and the resulting
    window is always exactly ``length`` long.

    Parameters
    ----------
    bed
        BED-like DataFrame with at least the columns "chromStart" and "chromEnd".
    length
        Desired length of the windows. Must be non-negative.

    Returns
    -------
    FrameT
        DataFrame of the same type as the input with updated "chromStart" and "chromEnd" columns.

    Raises
    ------
    ValueError
        If ``length`` is negative.
    """
    frame = nw.from_native(bed)

    if length < 0:
        raise ValueError("Length must be non-negative.")

    start = nw.col("chromStart")
    end = nw.col("chromEnd")

    # All arithmetic is done on twice the center so it stays in integers
    # until the final floor division.
    midpoint_x2 = start + end

    if "peak" in frame.columns:
        peak = nw.col("peak")
        # "peak" is added to chromStart, i.e. it is treated as an offset from
        # the region start.
        summit_x2 = 2 * (start + peak)
        center_x2 = nw.when(peak.is_null()).then(midpoint_x2).otherwise(summit_x2)
    else:
        center_x2 = midpoint_x2

    # NOTE(review): the "peak" column itself is not recomputed after
    # chromStart moves - confirm whether callers expect the offset to be updated.
    resized = frame.with_columns(
        chromStart=(center_x2 - length) // 2,
        chromEnd=(center_x2 + length) // 2,
    )
    return nw.to_native(resized)