BED¶

`from_pyr(pyr)` ¶

Convert a PyRanges object to a BED-like DataFrame.

Parameters:

Name	Type	Description	Default
`pyr`	`PyRanges`	PyRanges object with at least the columns "Chromosome", "Start", and "End".	required

Returns:

Type	Description
`DataFrame`	BED-like DataFrame with columns renamed to "chrom", "chromStart", and "chromEnd".

Source code in python/seqpro/bed.py

def from_pyr(pyr: pr.PyRanges) -> pl.DataFrame:
    """Turn a PyRanges object into a BED-like polars DataFrame.

    Parameters
    ----------
    pyr
        PyRanges object with at least the columns "Chromosome", "Start", and "End".

    Returns
    -------
    pl.DataFrame
        BED-like DataFrame in which "Chromosome", "Start", "End" (and "Strand",
        when present) are renamed to "chrom", "chromStart", "chromEnd" (and
        "strand"). The "chrom" and "strand" columns are cast to strings.
    """
    pyranges_to_bed = {
        "Chromosome": "chrom",
        "Start": "chromStart",
        "End": "chromEnd",
        "Strand": "strand",
    }
    # The rename is non-strict so that an absent optional column (e.g. "Strand")
    # does not make the conversion fail.
    renamed = pl.from_pandas(pyr.df).rename(pyranges_to_bed, strict=False)
    # pyranges casts these to categorical, but we want them back as strings.
    # The regex selects whichever of the two columns actually exist.
    return renamed.with_columns(pl.col(r"^(chrom|strand)$").cast(pl.Utf8))

`read(path)` ¶

Reads a BED-like (BED3+) file as a polars DataFrame. The file type is inferred from the file extension and supports .bed, .narrowPeak, and .broadPeak (optionally followed by further suffixes such as .gz).

Parameters:

Name	Type	Description	Default
`path`	`PathLike`	Path to the bed-like file.	required

Returns:

Type	Description
`DataFrame`	BED-like DataFrame with typed columns and zero-based coordinate metadata.

Source code in python/seqpro/bed.py

def read(path: PathLike) -> pl.DataFrame:
    """Read a BED-like (BED3+) file as a polars DataFrame.

    The file type is inferred from the file name's suffixes and supports .bed,
    .narrowPeak, and .broadPeak. Every suffix is inspected, so names with an
    additional trailing suffix (e.g. "peaks.bed.gz") are recognized as well.

    Parameters
    ----------
    path
        Path to the bed-like file.

    Returns
    -------
    pl.DataFrame
        BED-like DataFrame with typed columns and zero-based coordinate metadata.

    Raises
    ------
    ValueError
        If none of the file's suffixes is .bed, .narrowPeak, or .broadPeak.
    """
    path = Path(path)
    suffixes = path.suffixes
    if ".bed" in suffixes:
        result = _read_bed(path)
    elif ".narrowPeak" in suffixes:
        result = _read_narrowpeak(path)
    elif ".broadPeak" in suffixes:
        result = _read_broadpeak(path)
    else:
        extension = "".join(suffixes)
        # Adjacent string literals keep the message on one line; a triple-quoted
        # string would leak the source indentation into the error text.
        raise ValueError(
            f"Unrecognized file extension: {extension}. Expected one of "
            ".bed, .narrowPeak, or .broadPeak (potentially gzipped)."
        )
    # BED-family formats use zero-based starts; record that on the frame's metadata.
    result.config_meta.set(coordinate_system_zero_based=True)  # type: ignore[attr-defined]
    return result

`set_schema(bed, to, from_=None)` ¶

Rename coordinate columns to match a target schema.

Parameters:

Name	Type	Description	Default
`bed`	`IntoFrameT`	A polars or pandas DataFrame with genomic coordinate columns.	required
`to`	`SchemaLike`	Target schema: a shorthand string ("bed", "pb", "pr", "gtf") or a tuple of column names (chrom, start, end[, strand]).	required
`from_`	`SchemaLike \| None`	Source schema hint. Auto-detected if not provided.	`None`

Source code in python/seqpro/_coords.py

def set_schema(
    bed: IntoFrameT, to: SchemaLike, from_: SchemaLike | None = None
) -> IntoFrameT:
    """Rename the coordinate columns of a frame to follow a target schema.

    Parameters
    ----------
    bed
        A polars or pandas DataFrame with genomic coordinate columns.
    to
        Target schema: a shorthand string ("bed", "pb", "pr", "gtf") or
        a tuple of column names (chrom, start, end[, strand]).
    from_
        Source schema hint. Auto-detected if not provided.

    Returns
    -------
    IntoFrameT
        The frame, converted back to its native type, with the coordinate
        columns renamed. When the result is a polars DataFrame, its
        ``coordinate_system_zero_based`` metadata is set from the target schema.
    """
    frame = nw.from_native(bed)

    source = detect_schema(frame, hint=from_)
    target = _resolve_schema(to)

    column_pairs = [
        (source.chrom, target.chrom),
        (source.start, target.start),
        (source.end, target.end),
    ]
    # Strand is optional: it only takes part when both schemas define it.
    if source.strand is not None and target.strand is not None:
        column_pairs.append((source.strand, target.strand))

    present = frame.columns
    # Skip names that already match and source columns missing from the frame.
    renames: dict[str, str] = {
        old: new for old, new in column_pairs if old != new and old in present
    }

    result = nw.to_native(frame.rename(renames))

    # The coordinate-system metadata is only recorded on polars DataFrames.
    if isinstance(result, pl.DataFrame):
        result.config_meta.set(coordinate_system_zero_based=target.zero_based)  # type: ignore[attr-defined]

    return result

`sort(bed)` ¶

Sort a BED-like DataFrame by chromosome, start, and end position, using the natural order of chromosome names e.g. 1, 2, ..., 10, ...

Parameters:

Name	Type	Description	Default
`bed`	`FrameT`	DataFrame with BED-format columns: "chrom", "chromStart", "chromEnd". Accepts polars or pandas DataFrames.	required

Returns:

Type	Description
`FrameT`	Sorted DataFrame of the same type as the input.

Source code in python/seqpro/bed.py

def sort(bed: FrameT) -> FrameT:
    """Order a BED-like DataFrame by chromosome, then start, then end position.

    Chromosome names follow their natural order (e.g. 1, 2, ..., 10, ...)
    rather than plain lexicographic order.

    Parameters
    ----------
    bed
        DataFrame with BED-format columns: "chrom", "chromStart", "chromEnd".
        Accepts polars or pandas DataFrames.

    Returns
    -------
    FrameT
        Sorted DataFrame of the same type as the input.
    """
    frame = nw.from_native(bed)

    # Distinct chromosome names, arranged in natural order.
    chrom_order = natsorted(frame["chrom"].unique().to_list())

    # Temporary key column: "chrom" cast to an Enum whose categories are in the
    # natural order, so that sorting on it is meant to follow that order. It is
    # dropped again before returning.
    key = "_seqpro_chrom_sort_key_"
    chrom_key = nw.col("chrom").cast(nw.Enum(chrom_order)).alias(key)

    keyed = frame.with_columns(chrom_key)
    ordered = keyed.sort(key, "chromStart", "chromEnd")
    return nw.to_native(ordered.drop(key))

`to_pyr(bedlike)` ¶

Convert a BED-like DataFrame to a PyRanges object.

Warning

PyRanges automatically sorts the DataFrame by chromosome and start position, so the order of the regions may change after conversion. You can keep track of the original order by adding an index column before converting to a PyRanges object. After converting back to a DataFrame, you can sort the DataFrame by the index to get the original order.

Parameters:

Name	Type	Description	Default
`bedlike`		BED-like DataFrame (polars or pandas) with at least the columns "chrom", "chromStart", and "chromEnd".	required

Returns:

Type	Description
`PyRanges`	PyRanges object with columns renamed to "Chromosome", "Start", and "End".

Source code in python/seqpro/bed.py

def to_pyr(bedlike) -> pr.PyRanges:
    """Build a PyRanges object from a BED-like DataFrame.

    !!! warning
        PyRanges automatically sorts the DataFrame by chromosome and start position, so the order of
        the regions may change after conversion. You can keep track of the original
        order by [adding an index column](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.with_row_index.html)
        before converting to a PyRanges object. After converting back to a DataFrame, you can sort the DataFrame by the index to
        get the original order.

    Parameters
    ----------
    bedlike
        BED-like DataFrame (polars or pandas) with at least the columns "chrom", "chromStart", and "chromEnd".

    Returns
    -------
    pr.PyRanges
        PyRanges object with columns renamed to "Chromosome", "Start", and "End"
        (and "strand" renamed to "Strand").
    """
    bed_to_pyranges = {
        "chrom": "Chromosome",
        "chromStart": "Start",
        "chromEnd": "End",
        "strand": "Strand",
    }
    # Go through pandas, which is the frame type handed to PyRanges here;
    # eager_only=True restricts the input to eager frames.
    pandas_frame = nw.from_native(bedlike, eager_only=True).to_pandas()
    renamed = pandas_frame.rename(columns=bed_to_pyranges)
    return pr.PyRanges(renamed)

`with_len(bed, length)` ¶

Set the length of regions in a BED-like DataFrame to a fixed length by expanding or shrinking relative to the center (or peak) of the window. If the original region size + length is odd, the center will be 1 position closer to the right end.

Parameters:

Name	Type	Description	Default
`bed`	`FrameT`	BED-like DataFrame with at least the columns "chromStart" and "chromEnd".	required
`length`	`int`	Desired length of the windows. Must be non-negative.	required

Returns:

Type	Description
`FrameT`	DataFrame of the same type as the input with updated "chromStart" and "chromEnd" columns.

Source code in python/seqpro/bed.py

def with_len(bed: FrameT, length: int) -> FrameT:
    """Resize every region of a BED-like DataFrame to a fixed length.

    Regions are expanded or shrunk around their center. When a "peak" column
    exists, rows with a non-null peak are centered on ``chromStart + peak``
    instead. If the original region size + length is odd, the center will be
    1 position closer to the right end.

    Parameters
    ----------
    bed
        BED-like DataFrame with at least the columns "chromStart" and "chromEnd".
    length
        Desired length of the windows. Must be non-negative.

    Returns
    -------
    FrameT
        DataFrame of the same type as the input with updated "chromStart" and "chromEnd" columns.

    Raises
    ------
    ValueError
        If ``length`` is negative.
    """
    frame = nw.from_native(bed)

    if length < 0:
        raise ValueError("Length must be non-negative.")

    start = nw.col("chromStart")
    end = nw.col("chromEnd")

    # Work with twice the center so the midpoint stays an integer until the
    # final floor division.
    twice_midpoint = start + end
    if "peak" not in frame.columns:
        twice_center = twice_midpoint
    else:
        peak = nw.col("peak")
        # Rows without a peak fall back to the midpoint of the region.
        twice_center = (
            nw.when(peak.is_null())
            .then(twice_midpoint)
            .otherwise(2 * (start + peak))
        )

    resized = frame.with_columns(
        chromStart=(twice_center - length) // 2,
        chromEnd=(twice_center + length) // 2,
    )
    return nw.to_native(resized)

BED¶

from_pyr(pyr) ¶

read(path) ¶

set_schema(bed, to, from_=None) ¶

sort(bed) ¶

to_pyr(bedlike) ¶

with_len(bed, length) ¶

`from_pyr(pyr)` ¶

`read(path)` ¶

`set_schema(bed, to, from_=None)` ¶

`sort(bed)` ¶

`to_pyr(bedlike)` ¶

`with_len(bed, length)` ¶