From b720e2c3c2f2847370e7776119e994b8a878c6a3 Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Fri, 19 May 2023 18:20:26 +0200 Subject: [PATCH 01/10] Fix tostring2 --- pyproject.toml | 2 +- pyranges/tostring2.py | 4 ++-- tests/property_based/hypothesis_helper.py | 11 +++-------- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c015f467..cbdb2d59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyranges" -version = "0.0.126" +version = "0.0.127" description = "GenomicRanges for Python." readme = "README.md" authors = [{ name = "Endre Bakken Stovner", email = "endbak@pm.me" }] diff --git a/pyranges/tostring2.py b/pyranges/tostring2.py index cf03444a..de9d939a 100644 --- a/pyranges/tostring2.py +++ b/pyranges/tostring2.py @@ -195,7 +195,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int new_build_df = pd.concat([build_df, df[c]], axis=1) new_str_repr = tabulate( - new_build_df.to_dict(orient="records"), headers=list(new_build_df.columns), tablefmt="psql", showindex=False + new_build_df, headers=list(new_build_df.columns), tablefmt="psql", showindex=False # type: ignore ) table_width = len(new_str_repr.split("\n", 1)[0]) @@ -209,7 +209,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int if i < total_columns: new_build_df = add_hidden_col_dotdot(build_df, len(original_header[i:])) str_repr = tabulate( - new_build_df.to_dict(orient="records"), headers=list(new_build_df.columns), tablefmt="psql", showindex=False + new_build_df, headers=list(new_build_df.columns), tablefmt="psql", showindex=False # type: ignore ) return str_repr, original_header[i:] diff --git a/tests/property_based/hypothesis_helper.py b/tests/property_based/hypothesis_helper.py index f2c14a84..575e3c1d 100644 --- a/tests/property_based/hypothesis_helper.py +++ b/tests/property_based/hypothesis_helper.py 
@@ -8,14 +8,9 @@ import pyranges as pr from pyranges import PyRanges -if environ.get("GITHUB_ACTIONS"): - max_examples = 15 - slow_max_examples = 5 - deadline = None -else: - max_examples = 1000 - slow_max_examples = 100 - deadline = None +max_examples = 15 +slow_max_examples = 5 +deadline = None lengths = st.integers(min_value=1, max_value=int(1e7)) small_lengths = st.integers(min_value=1, max_value=int(1e4)) From 01ab973b180bc2ffc366db5b353a9992ee5a8378 Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Sun, 21 May 2023 16:20:38 +0200 Subject: [PATCH 02/10] Add types to pyranges_main --- pyranges/__init__.py | 24 +- pyranges/helpers.py | 7 +- pyranges/methods/attr.py | 3 +- pyranges/methods/join.py | 2 +- pyranges/multithreaded.py | 44 --- pyranges/pyranges_main.py | 722 ++++++++++++++++++++------------------ 6 files changed, 391 insertions(+), 411 deletions(-) diff --git a/pyranges/__init__.py b/pyranges/__init__.py index c75b4be3..5bf17528 100644 --- a/pyranges/__init__.py +++ b/pyranges/__init__.py @@ -28,7 +28,7 @@ read_gff = read_gtf -def from_dict(d, int64=False): +def from_dict(d): """Create a PyRanges from dict. Parameters @@ -37,10 +37,6 @@ def from_dict(d, int64=False): Dict with data. - int64 : bool, default False. - - Whether to use 64-bit integers for starts and ends. - Warning ------- @@ -69,10 +65,10 @@ def from_dict(d, int64=False): For printing, the PyRanges was sorted on Chromosome and Strand. """ - return PyRanges(pd.DataFrame(d), int64=int64) + return PyRanges(pd.DataFrame(d)) -def from_string(s, int64=False): +def from_string(s): """Create a PyRanges from multiline string. Parameters @@ -81,10 +77,6 @@ def from_string(s, int64=False): String with data. - int64 : bool, default False. - - Whether to use 64-bit integers for starts and ends. 
- See Also -------- @@ -120,7 +112,7 @@ def from_string(s, int64=False): df = pd.read_csv(StringIO(s), sep=r"\s+", index_col=None) - return PyRanges(df, int64=int64) + return PyRanges(df) def itergrs(prs, strand=None, keys=False): @@ -233,7 +225,7 @@ def itergrs(prs, strand=None, keys=False): return iter(natsorted(grs_per_chromosome.items())) -def random(n=1000, length=100, chromsizes=None, strand=True, int64=False, seed=None): +def random(n=1000, length=100, chromsizes=None, strand=True, seed=None): """Return PyRanges with random intervals. Parameters @@ -254,9 +246,9 @@ def random(n=1000, length=100, chromsizes=None, strand=True, int64=False, seed=N Data should have strand. - int64 : bool, default False + seed : int, default None - Use int64 to represent Start and End. + Seed for random number generator. Examples -------- @@ -328,7 +320,7 @@ def random(n=1000, length=100, chromsizes=None, strand=True, int64=False, seed=N s = np.random.choice("+ -".split(), size=n) random_df.insert(3, "Strand", s) - return PyRanges(random_df, int64=int64) + return PyRanges(random_df) """Namespace for statistcal functions. 
diff --git a/pyranges/helpers.py b/pyranges/helpers.py index d9bcbe5d..cb4f099c 100644 --- a/pyranges/helpers.py +++ b/pyranges/helpers.py @@ -1,3 +1,8 @@ +from typing import Tuple, Union + +import pandas as pd + + def get_chromosomes_from_dict(dfs): keys = list(dfs.keys()) if isinstance(keys[0], tuple): @@ -18,7 +23,7 @@ def get_strands_from_dict(dfs): return strands -def get_key_from_df(df): +def get_key_from_df(df: pd.DataFrame) -> Union[str, Tuple[str, str]]: chromosome = df.Chromosome.head(1).iloc[0] if "Strand" in df: strand = df.Strand.head(1).iloc[0] diff --git a/pyranges/methods/attr.py b/pyranges/methods/attr.py index cd9cdc18..ca0f7bac 100644 --- a/pyranges/methods/attr.py +++ b/pyranges/methods/attr.py @@ -53,9 +53,8 @@ def _setattr(self, column_name, column, pos=False): if column_name not in ["Chromosome", "Strand"]: self.__dict__["dfs"] = dfs else: - int64 = True if self.dtypes["Start"] == np.int64 else False # will merge the dfs, then split on keys again to ensure they are correct - self.__dict__["dfs"] = pr.PyRanges(pr.PyRanges(dfs).df, int64=int64).dfs + self.__dict__["dfs"] = pr.PyRanges(pr.PyRanges(dfs).df).dfs def _getattr(self, name): diff --git a/pyranges/methods/join.py b/pyranges/methods/join.py index 3ad5c8cf..1662c352 100644 --- a/pyranges/methods/join.py +++ b/pyranges/methods/join.py @@ -4,7 +4,7 @@ def _both_indexes(scdf, ocdf, how=False, **kwargs): - assert (how in "containment first last outer right left".split() + [False, None]) or isinstance(how, int) + assert (how in "containment first last outer right left".split() + [False, None]) or isinstance(how, int), how starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values diff --git a/pyranges/multithreaded.py b/pyranges/multithreaded.py index ccfee396..7505ea60 100644 --- a/pyranges/multithreaded.py +++ b/pyranges/multithreaded.py @@ -480,47 +480,3 @@ def _extend_grp(df, **kwargs): assert (df.Start < df.End).all(), "Some intervals are negative or zero length after 
applying extend!" return df - - -def pyrange_apply_chunks(function, self, as_pyranges, **kwargs): - nparams = get_n_args(function) - nb_cpu = kwargs.get("nb_cpu", 1) - if nb_cpu > 1: - import ray # type: ignore - - with suppress_stdout_stderr(): - ray.init(num_cpus=nb_cpu, ignore_reinit_error=True) - - function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu) - - keys = [] - lengths = [] - results = [] - for k, v in self.items(): - dfs = np.array_split(v, nb_cpu) - lengths.append(len(dfs)) - results.extend([call_f_single(function, nparams, df, **kwargs) for df in dfs]) - keys.append(k) - - results = get(results) - - _results = [] - start = 0 - for _, length in zip(keys, lengths): - end = start + length - _r = results[start:end] - - if as_pyranges: - _results.append(pd.concat(_r)) - else: - _results.append(_r) - - start = end - - results = _results - if nb_cpu > 1: - ray.shutdown() - - results = process_results(results, keys) - - return results diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py index 4b5e921a..21278540 100644 --- a/pyranges/pyranges_main.py +++ b/pyranges/pyranges_main.py @@ -1,4 +1,6 @@ """Data structure for genomic intervals and their annotation.""" +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -6,21 +8,22 @@ import pyranges as pr from pyranges.methods.intersection import _intersection, _overlap -from pyranges.multithreaded import ( - _extend, - _extend_grp, - _tes, - _tss, - pyrange_apply, - pyrange_apply_chunks, - pyrange_apply_single, -) +from pyranges.multithreaded import _extend, _extend_grp, _tes, _tss, pyrange_apply, pyrange_apply_single from pyranges.tostring2 import tostring +if TYPE_CHECKING: + from pathlib import Path + + from pandas.core.indexes.base import Index + from pyrle.rledict import RleDict # type: ignore + __all__ = ["PyRanges"] -def fill_kwargs(kwargs): 
+ChromosomeLocation = Union[str, Tuple[str, str]] + + +def fill_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]: """Give the kwargs dict default options.""" defaults = { @@ -55,7 +58,7 @@ class PyRanges: Parameters ---------- - df : pandas.DataFrame or dict of pandas.DataFrame, default None + df : DataFrame or dict of DataFrame, default None The data to be stored in the PyRanges. chromosomes : array-like or scalar value, default None @@ -71,7 +74,7 @@ class PyRanges: The strands in the PyRanges. copy_df : bool, default True - Copy input pandas.DataFrame + Copy input DataFrame See Also -------- @@ -87,7 +90,7 @@ class PyRanges: ----- A PyRanges object is represented internally as a dictionary efficiency. The keys are - chromosomes or chromosome/strand tuples and the values are pandas DataFrames. + chromosomes or chromosome/strand tuples and the values are pandas pd.DataFrames. Examples -------- @@ -139,8 +142,8 @@ class PyRanges: For printing, the PyRanges was sorted on Chromosome and Strand. """ - dfs = None - """Dict mapping chromosomes or chromosome/strand pairs to pandas DataFrames.""" + dfs: Union[Dict[str, pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]] + """Dict mapping chromosomes or chromosome/strand pairs to pandas pd.DataFrames.""" features = None """Namespace for genomic-features methods. 
@@ -162,14 +165,13 @@ class PyRanges: def __init__( self, - df=None, - chromosomes=None, - starts=None, - ends=None, - strands=None, - int64=False, - copy_df=True, - ): + df: Optional[Union[pd.DataFrame, Dict[Union[str], pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]] = None, + chromosomes: Optional[str] = None, + starts: Optional[Tuple[int, int]] = None, + ends: Optional[List[int]] = None, + strands: Optional[Tuple[str, str]] = None, + copy_df: bool = True, + ) -> None: from pyranges.methods.init import _init if df is None and chromosomes is None: @@ -177,7 +179,7 @@ def __init__( _init(self, df, chromosomes, starts, ends, strands, copy_df) - def __array_ufunc__(self, *args, **kwargs): + def __array_ufunc__(self, *args, **kwargs) -> "PyRanges": """Apply unary numpy-function. @@ -236,7 +238,7 @@ def __array_ufunc__(self, *args, **kwargs): # self.apply() - def __getattr__(self, name): + def __getattr__(self, name: str) -> pd.Series: """Return column. Parameters @@ -247,7 +249,7 @@ def __getattr__(self, name): Returns ------- - pandas.Series + pandas.pd.Series Example ------- @@ -264,7 +266,7 @@ def __getattr__(self, name): return _getattr(self, name) - def __setattr__(self, column_name, column): + def __setattr__(self, column_name: str, column: Any) -> None: """Insert or update column. Parameters @@ -273,7 +275,7 @@ def __setattr__(self, column_name, column): Name of column to update or insert. - column : list, np.array or pd.Series + column : list, np.array or pd.pd.Series Data to insert. @@ -315,7 +317,7 @@ def __setattr__(self, column_name, column): ) ) - def __getitem__(self, val): + def __getitem__(self, val: Any) -> "PyRanges": """Fetch columns or subset on position. If a list is provided, the column(s) in the list is returned. This subsets on columns. @@ -326,7 +328,7 @@ def __getitem__(self, val): Parameters ---------- - val : bool array/Series, tuple, list, str or slice + val : bool array/pd.Series, tuple, list, str or slice Data to fetch. 
@@ -356,7 +358,7 @@ def __getitem__(self, val): Stranded PyRanges object has 2,446 rows and 7 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. - Create boolean Series and use it to subset: + Create boolean pd.Series and use it to subset: >>> s = (gr.Feature == "gene") | (gr.gene_id == "ENSG00000223972") >>> gr[s] @@ -462,16 +464,16 @@ def __iter__(self): return iter(self.items()) - def __len__(self): + def __len__(self) -> int: """Return the number of intervals in the PyRanges.""" return sum([len(d) for d in self.values()]) - def __str__(self): + def __str__(self) -> str: """Return string representation.""" return tostring(self) - def __repr__(self): + def __repr__(self) -> str: """Return REPL representation.""" return str(self) @@ -481,42 +483,32 @@ def _repr_html_(self): return self.df._repr_html_() - def apply(self, f, strand=None, as_pyranges=True, nb_cpu=1, **kwargs): + def apply(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRanges": """Apply a function to the PyRanges. Parameters ---------- f : function - Function to apply on each DataFrame in a PyRanges + Function to apply on each pd.DataFrame in a PyRanges - strand : bool, default None, i.e. auto + strand : Optional[bool], default None, i.e. auto Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use chromosome/strand pairs if the PyRanges is stranded. - as_pyranges : bool, default True - - Whether to return as a PyRanges or dict. If `f` does not return a DataFrame valid for - PyRanges, `as_pyranges` must be False. - - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. 
- **kwargs Additional keyword arguments to pass as keyword arguments to `f` Returns ------- - PyRanges or dict - Result of applying f to each DataFrame in the PyRanges + PyRanges + Result of applying f to each pd.DataFrame in the PyRanges See also -------- pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges - pyranges.PyRanges.apply_chunks: apply a row-based function to a PyRanges in parallel + pyranges.PyRanges.apply_general: apply a function to a PyRanges and return a Dict[keys, Any] Note ---- @@ -541,12 +533,6 @@ def apply(self, f, strand=None, as_pyranges=True, nb_cpu=1, **kwargs): Stranded PyRanges object has 4 rows and 4 columns from 2 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. - >>> gr.apply(lambda df: len(df), as_pyranges=False) - {('1', '+'): 2, ('2', '+'): 1, ('2', '-'): 1} - - >>> gr.apply(lambda df: len(df), as_pyranges=False, strand=False) - {'1': 2, '2': 2} - >>> def add_to_ends(df, **kwargs): ... df.loc[:, "End"] = kwargs["slack"] + df.End ... return df @@ -573,79 +559,67 @@ def apply(self, f, strand=None, as_pyranges=True, nb_cpu=1, **kwargs): result = pyrange_apply_single(f, self, **kwargs) - if not as_pyranges: - return result - else: - return PyRanges(result) - - def apply_chunks(self, f, as_pyranges=False, nb_cpu=1, **kwargs): - """Apply a row-based function to arbitrary partitions of the PyRanges. + return PyRanges(result) - apply_chunks speeds up the application of functions where the result is not affected by - applying the function to ordered, non-overlapping splits of the data. + def apply_general( + self, f: Callable, strand: Optional[bool] = None, **kwargs + ) -> Union[Dict[str, Any], Dict[Tuple[str, str], Any]]: + """Apply a function to the PyRanges and return a dict of dict. Parameters ---------- f : function - Row-based or associative function to apply on the partitions. 
+ Function to apply on each pd.DataFrame in a PyRanges - as_pyranges : bool, default False - - Whether to return as a PyRanges or dict. - - nb_cpu: int, default 1 + strand : Optional[bool], default None, i.e. auto - How many cpus to use. The data is split into nb_cpu partitions. + Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use + chromosome/strand pairs if the PyRanges is stranded. **kwargs Additional keyword arguments to pass as keyword arguments to `f` Returns ------- - dict of lists - Result of applying f to each partition of the DataFrames in the PyRanges. + PyRanges + Result of applying f to each pd.DataFrame in the PyRanges See also -------- + pyranges.PyRanges.apply: apply a function to a PyRanges and return a PyRanges pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges - pyranges.PyRanges.apply_chunks: apply a row-based function to a PyRanges in parallel Note ---- - apply_chunks will only lead to speedups on large datasets or slow-running functions. Using - it with nb_cpu=1 is pointless; use apply instead. + This is the function used internally to carry out almost all unary PyRanges methods. Examples -------- - >>> gr = pr.from_dict({"Chromosome": [1, 1, 1], "Start": [2, 3, 5], "End": [9, 4, 6]}) - >>> gr - +--------------+-----------+-----------+ - | Chromosome | Start | End | - | (category) | (int64) | (int64) | - |--------------+-----------+-----------| - | 1 | 2 | 9 | - | 1 | 3 | 4 | - | 1 | 5 | 6 | - +--------------+-----------+-----------+ - Unstranded PyRanges object has 3 rows and 3 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome. + >>> gr = pr.from_dict({"Chromosome": [1, 1, 2, 2], "Strand": ["+", "+", "-", "+"], + ... "Start": [1, 4, 2, 9], "End": [2, 27, 13, 10]}) + + >>> gr.apply_general(lambda df: len(df)) + {('1', '+'): 2, ('2', '+'): 1, ('2', '-'): 1} - >>> gr.apply_chunks( - ... 
lambda df, **kwargs: list(df.End + kwargs["add"]), nb_cpu=1, add=1000) - {'1': [[1009, 1004, 1006]]} + >>> gr.apply_general(lambda df: len(df), strand=False) + {'1': 2, '2': 2} """ + if strand is None: + strand = self.stranded + + kwargs.update({"strand": strand}) kwargs.update(kwargs.get("kwargs", {})) kwargs = fill_kwargs(kwargs) - result = pyrange_apply_chunks(f, self, as_pyranges, **kwargs) + return pyrange_apply_single(f, self, **kwargs) - return result - - def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs): + def apply_pair( + self, other: "PyRanges", f: Callable, strandedness: None = None, as_pyranges: bool = True, **kwargs + ) -> Union[Dict[Tuple[str, str], Tuple[int, int]], "PyRanges"]: """Apply a function to a pair of PyRanges. The function is applied to each chromosome or chromosome/strand pair found in at least one @@ -654,7 +628,7 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs): Parameters ---------- f : function - Row-based or associative function to apply on the DataFrames. + Row-based or associative function to apply on the pd.DataFrames. strandedness : {None, "same", "opposite", False}, default None, i.e. auto @@ -664,7 +638,7 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs): as_pyranges : bool, default False - Whether to return as a PyRanges or dict. If `f` does not return a DataFrame valid for + Whether to return as a PyRanges or dict. If `f` does not return a pd.DataFrame valid for PyRanges, `as_pyranges` must be False. nb_cpu: int, default 1 @@ -678,7 +652,7 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs): Returns ------- dict of lists - Result of applying f to each partition of the DataFrames in the PyRanges. + Result of applying f to each partition of the pd.DataFrames in the PyRanges. 
See also -------- @@ -752,20 +726,20 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs): else: return PyRanges(result) - def as_df(self): - """Return PyRanges as DataFrame. + def as_df(self) -> pd.DataFrame: + """Return PyRanges as pd.DataFrame. Returns ------- - DataFrame + pd.DataFrame - A DataFrame natural sorted on Chromosome and Strand. The ordering of rows within + A pd.DataFrame natural sorted on Chromosome and Strand. The ordering of rows within chromosomes and strands is preserved. See also -------- - PyRanges.df : Return PyRanges as DataFrame. + PyRanges.df : Return PyRanges as pd.DataFrame. Examples -------- @@ -800,7 +774,7 @@ def as_df(self): else: return pd.concat(self.values()).reset_index(drop=True) - def assign(self, col, f, strand=None, nb_cpu=1, **kwargs): + def assign(self, col: str, f: Callable, strand: Optional[bool] = None, nb_cpu: int = 1, **kwargs) -> "PyRanges": """Add or replace a column. Does not change the original PyRanges. @@ -815,7 +789,7 @@ def assign(self, col, f, strand=None, nb_cpu=1, **kwargs): f : function Function to create new column. - strand : bool, default None, i.e. auto + strand : Optional[bool], default None, i.e. auto Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use chromosome/strand pairs if the PyRanges is stranded. 
@@ -889,7 +863,7 @@ def assign(self, col, f, strand=None, nb_cpu=1, **kwargs): first_result = next(iter(result.values())) - assert isinstance(first_result, pd.Series), "result of assign function must be Series, but is {}".format( + assert isinstance(first_result, pd.Series), "result of assign function must be pd.Series, but is {}".format( type(first_result) ) @@ -899,7 +873,7 @@ def assign(self, col, f, strand=None, nb_cpu=1, **kwargs): return new_self - def boundaries(self, group_by, agg=None): + def boundaries(self, group_by: str, agg: Optional[Dict[str, Union[str, Callable]]] = None) -> "PyRanges": """Return the boundaries of groups of intervals (e.g. transcripts) Parameters @@ -913,7 +887,7 @@ def boundaries(self, group_by, agg=None): Defines how to aggregate metadata columns. Provided as dictionary of column names -> functions, function names or list of such, - as accepted by the Pandas.DataFrame.agg method. + as accepted by the pd.DataFrame.agg method. Returns @@ -971,7 +945,7 @@ def boundaries(self, group_by, agg=None): result = pyrange_apply_single(_bounds, self, **kwargs) return pr.PyRanges(result) - def calculate_frame(self, by): + def calculate_frame(self, by: Union[str, List[str]]) -> "PyRanges": """Calculate the frame of each genomic interval, assuming all are coding sequences (CDS), and add it as column inplace. After this, the input Pyranges will contain an added "Frame" column, which determines the base of the CDS that is the first base of a codon. @@ -987,17 +961,15 @@ def calculate_frame(self, by): Returns ------- - None - The "Frame" column is added inplace. - + PyRanges Examples -------- - >>> p= pr.from_dict({"Chromosome": [1,1,1,2,2], - ... "Strand": ["+","+","+","-","-"], - ... "Start": [1,31,52,101,201], - ... "End": [10,45,90,130,218], - ... "transcript_id": ["t1","t1","t1","t2","t2"] }) + >>> p = pr.from_dict({"Chromosome": [1,1,1,2,2], + ... "Strand": ["+","+","+","-","-"], + ... "Start": [1,31,52,101,201], + ... 
"End": [10,45,90,130,218], + ... "transcript_id": ["t1","t1","t1","t2","t2"]}) >>> p +--------------+--------------+-----------+-----------+-----------------+ | Chromosome | Strand | Start | End | transcript_id | @@ -1013,7 +985,6 @@ def calculate_frame(self, by): For printing, the PyRanges was sorted on Chromosome and Strand. >>> p.calculate_frame(by=['transcript_id']) - >>> p +--------------+--------------+-----------+-----------+-----------------+-----------+ | Chromosome | Strand | Start | End | transcript_id | Frame | | (category) | (category) | (int64) | (int64) | (object) | (int64) | @@ -1028,22 +999,26 @@ def calculate_frame(self, by): For printing, the PyRanges was sorted on Chromosome and Strand. """ + _self = self.copy() # Column to save the initial index - self.__index__ = np.arange(len(self)) + _self.__index__ = np.arange(len(self)) # Filtering for desired columns - lst = by if type(by) is list else [by] - sorted_p = self[["Strand", "__index__"] + lst] + if isinstance(by, str): + lst = [by] + else: + lst = by + sorted_p = _self[["Strand", "__index__"] + lst] # Sorting by 5' (Intervals on + are sorted by ascending order and - are sorted by descending order) sorted_p = sorted_p.sort(by="5") # Creating a column saving the length for the intervals (for selenoprofiles and ensembl) - sorted_p.__length__ = sorted_p.End - sorted_p.Start + sorted_p.__length__ = sorted_p.lengths() - # Creating a column saving the cummulative length for the intervals - for k, df in sorted_p: - sorted_p.dfs[k]["__cumsum__"] = df.groupby(by=by).__length__.cumsum() + # Creating a column saving the cumulative length for the intervals + for df in sorted_p.values(): + df["__cumsum__"] = df.groupby(by=by).__length__.cumsum() # Creating a frame column sorted_p.Frame = sorted_p.__cumsum__ - sorted_p.__length__ @@ -1051,13 +1026,13 @@ def calculate_frame(self, by): # Appending the Frame of sorted_p by the index of p sorted_p = sorted_p.apply(lambda df: df.sort_values(by="__index__")) - 
self.Frame = sorted_p.Frame + _self.Frame = sorted_p.Frame # Drop __index__ column - self.apply(lambda df: df.drop("__index__", axis=1, inplace=True)) + return _self.apply(lambda df: df.drop("__index__", axis=1)) @property - def chromosomes(self): + def chromosomes(self) -> List[str]: """Return chromosomes in natsorted order.""" if self.stranded: @@ -1065,7 +1040,13 @@ def chromosomes(self): else: return natsorted(set([k for k in self.keys()])) - def cluster(self, strand=None, by=None, slack=0, count=False, nb_cpu=1): + def cluster( + self, + strand: Optional[bool] = None, + by: Optional[Union[List[str], str]] = None, + slack: int = 0, + count: bool = False, + ) -> "PyRanges": """Give overlapping intervals a common id. Parameters @@ -1184,27 +1165,27 @@ def cluster(self, strand=None, by=None, slack=0, count=False, nb_cpu=1): Stranded PyRanges object has 2,446 rows and 7 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. """ - + _self = self.copy() if strand is None: - strand = self.stranded + strand = _self.stranded kwargs = {"strand": strand, "slack": slack, "count": count, "by": by} kwargs = fill_kwargs(kwargs) - _stranded = self.stranded + _stranded = _self.stranded if not strand and _stranded: - self.Strand2 = self.Strand - self = self.unstrand() + _self.__Strand__ = _self.Strand + _self = _self.unstrand() if not by: from pyranges.methods.cluster import _cluster - df = pyrange_apply_single(_cluster, self, **kwargs) + df = pyrange_apply_single(_cluster, _self, **kwargs) else: from pyranges.methods.cluster import _cluster_by kwargs["by"] = by - df = pyrange_apply_single(_cluster_by, self, **kwargs) + df = pyrange_apply_single(_cluster_by, _self, **kwargs) gr = PyRanges(df) @@ -1224,13 +1205,12 @@ def cluster(self, strand=None, by=None, slack=0, count=False, nb_cpu=1): new_dfs[k] = v if not strand and _stranded: - new_dfs = {k: d.rename(columns={"Strand2": "Strand"}) for k, d in new_dfs.items()} - - self = PyRanges(new_dfs) 
- - return self + renamed = [d.rename(columns={"__Strand__": "Strand"}) for d in new_dfs.values()] + return PyRanges._zip_locationkey_and_data(new_dfs.keys(), renamed, strand=True) + else: + return PyRanges._zip_locationkey_and_data(new_dfs.keys(), new_dfs.values(), strand=strand) - def copy(self): + def copy(self) -> "PyRanges": """Make a deep copy of the PyRanges. Notes @@ -1241,7 +1221,7 @@ def copy(self): return self.apply(lambda df: df.copy(deep=True)) @property - def columns(self): + def columns(self) -> "Index": """Return the column labels of the PyRanges. Returns @@ -1284,20 +1264,18 @@ def columns(self): """ if not len(self.values()): - return [] + return pd.Index([]) first = next(iter(self.values())) - columns = first.columns - - return columns + return first.columns def count_overlaps( self, - other, - strandedness=None, - keep_nonoverlapping=True, - overlap_col="NumberOverlaps", - ): + other: "PyRanges", + strandedness: None = None, + keep_nonoverlapping: bool = True, + overlap_col: str = "NumberOverlaps", + ) -> "PyRanges": """Count number of overlaps per interval. Count how many intervals in self overlap with those in other. @@ -1388,13 +1366,13 @@ def count_overlaps( def coverage( self, - other, - strandedness=None, - keep_nonoverlapping=True, - overlap_col="NumberOverlaps", - fraction_col="FractionOverlaps", - nb_cpu=1, - ): + other: "PyRanges", + strandedness: None = None, + keep_nonoverlapping: bool = True, + overlap_col: str = "NumberOverlaps", + fraction_col: str = "FractionOverlaps", + nb_cpu: int = 1, + ) -> "PyRanges": """Count number of overlaps and their fraction per interval. Count how many intervals in self overlap with those in other. @@ -1501,17 +1479,17 @@ def coverage( return counts @property - def df(self): - """Return PyRanges as DataFrame. + def df(self) -> pd.DataFrame: + """Return PyRanges as pd.DataFrame. 
See also -------- - PyRanges.as_df : return PyRanges as DataFrame.""" + PyRanges.as_df : return PyRanges as pd.DataFrame.""" return self.as_df() - def drop(self, drop=None, like=None): + def drop(self, drop: Optional[str] = None, like: Optional[str] = None) -> "PyRanges": """Drop column(s). If no arguments are given, all the columns except Chromosome, Start, End and Strand are @@ -1591,7 +1569,7 @@ def drop(self, drop=None, like=None): return _drop(self, drop, like) - def drop_duplicate_positions(self, strand=None, keep="first"): + def drop_duplicate_positions(self, strand: Optional[bool] = None, keep: Union[bool, str] = "first") -> "PyRanges": """Return PyRanges with duplicate postion rows removed. Parameters @@ -1667,15 +1645,12 @@ def drop_duplicate_positions(self, strand=None, keep="first"): if strand is None: strand = self.stranded - kwargs = {} - kwargs["sparse"] = {"self": False} - kwargs["keep"] = keep + kwargs = {"sparse": {"self": False}, "keep": keep, "strand": strand and self.stranded} kwargs = fill_kwargs(kwargs) - kwargs["strand"] = strand and self.stranded return PyRanges(pyrange_apply_single(_drop_duplicate_positions, self, **kwargs)) @property - def dtypes(self): + def dtypes(self) -> pd.Series: """Return the dtypes of the PyRanges. Examples @@ -1715,12 +1690,12 @@ def dtypes(self): return df.dtypes @property - def empty(self): + def empty(self) -> bool: """Indicate whether PyRanges is empty.""" return len(self) == 0 - def extend(self, ext, group_by=None): + def extend(self, ext: Union[Dict[str, int], int], group_by: None = None) -> "PyRanges": """Extend the intervals from the ends. Parameters @@ -1835,7 +1810,7 @@ def extend(self, ext, group_by=None): # @profile - def five_end(self): + def five_end(self) -> "PyRanges": """Return the five prime end of intervals. The five prime end is the start of a forward strand or the end of a reverse strand. 
@@ -1888,7 +1863,7 @@ def five_end(self): kwargs = fill_kwargs({"strand": self.stranded}) return PyRanges(pyrange_apply_single(_tss, self, **kwargs)) - def head(self, n=8): + def head(self, n: int = 8) -> "PyRanges": """Return the n first rows. Parameters @@ -1949,12 +1924,14 @@ def head(self, n=8): subsetter[:n] = True return self[subsetter] - def insert(self, other, loc=None): + def insert( + self, other: Union[pd.DataFrame, pd.Series, Dict[str, pd.Series]], loc: Optional[int] = None + ) -> "PyRanges": """Add one or more columns to the PyRanges. Parameters ---------- - other : Series, DataFrame or dict + other : pd.Series, pd.DataFrame or dict Data to insert into the PyRanges. `other` must have the same number of rows as the PyRanges. loc : int, default None, i.e. after last column of PyRanges. @@ -1968,7 +1945,7 @@ def insert(self, other, loc=None): Note ---- - If a Series, or a dict of Series is used, the Series must have a name. + If a pd.Series, or a dict of pd.Series is used, the pd.Series must have a name. Examples -------- @@ -2024,8 +2001,8 @@ def insert(self, other, loc=None): Unstranded PyRanges object has 4 rows and 5 columns from 3 chromosomes. For printing, the PyRanges was sorted on Chromosome. - >>> arbitrary_result = gr.apply( - ... lambda df: pd.Series(df.Start + df.End, name="Hi!"), as_pyranges=False) + >>> arbitrary_result = gr.apply_general( + ... lambda df: pd.Series(df.Start + df.End, name="Hi!")) >>> arbitrary_result {'E': 1 9 2 15 @@ -2055,11 +2032,11 @@ def insert(self, other, loc=None): from pyranges.methods.attr import _setattr if isinstance(other, (pd.Series, pd.DataFrame)): - assert len(other) == len(self), "Pandas Series or DataFrame must be same length as PyRanges!" + assert len(other) == len(self), "Pandas pd.Series or pd.DataFrame must be same length as PyRanges!" 
if isinstance(other, pd.Series): if not other.name: - raise Exception("Series must have a name!") + raise Exception("pd.Series must have a name!") _setattr(self, other.name, other, loc) @@ -2072,7 +2049,7 @@ def insert(self, other, loc=None): first = next(iter(other.values())) is_dataframe = isinstance(first, pd.DataFrame) if is_dataframe: - columns = first.columns + columns = [str(c) for c in first.columns] ds = [] for c in columns: @@ -2083,14 +2060,16 @@ def insert(self, other, loc=None): loc += 1 else: if not first.name: - raise Exception("Series must have a name!") + raise Exception("pd.Series must have a name!") d = {k: v for k, v in other.items()} _setattr(self, first.name, d, loc) return self - def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1): + def intersect( + self, other: "PyRanges", strandedness: Optional[bool] = None, how: Optional[str] = None, invert: bool = False + ) -> "PyRanges": """Return overlapping subintervals. Returns the segments of the intervals in self which overlap with those in other. @@ -2116,11 +2095,6 @@ def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1): Whether to return the intervals without overlaps. - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - Returns ------- PyRanges @@ -2197,9 +2171,8 @@ def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1): For printing, the PyRanges was sorted on Chromosome. 
""" - kwargs = {"how": how, "strandedness": strandedness, "nb_cpu": nb_cpu} + kwargs = {"how": how, "strandedness": strandedness, "sparse": {"self": False, "other": True}} kwargs = fill_kwargs(kwargs) - kwargs["sparse"] = {"self": False, "other": True} if len(self) == 0: return self @@ -2212,26 +2185,26 @@ def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1): if invert: found_idxs = getattr(result, "__ix__", []) - result = self[~self.__ix__.isin(found_idxs)] + result = self[~pd.Series(self.__ix__).isin(found_idxs)] result = result.drop("__ix__") return result - def items(self): - """Return the pairs of keys and DataFrames. + def items(self) -> Union[List[Tuple[str, pd.DataFrame]], List[Tuple[Tuple[str, str], pd.DataFrame]]]: + """Return the pairs of keys and pd.DataFrames. Returns ------- dict - The dict mapping keys to DataFrames in the PyRanges. + The dict mapping keys to pd.DataFrames in the PyRanges. See Also -------- PyRanges.chromosomes : return the chromosomes PyRanges.keys : return the keys - PyRanges.values : return the DataFrames in the PyRanges + PyRanges.values : return the pd.DataFrames in the PyRanges Examples -------- @@ -2248,16 +2221,16 @@ def items(self): def join( self, - other, - strandedness=None, - how=None, - report_overlap=False, - slack=0, - suffix="_b", - nb_cpu=1, - apply_strand_suffix=None, - preserve_order=False, - ): + other: "PyRanges", + strandedness: None = None, + how: Optional[str] = None, + report_overlap: bool = False, + slack: int = 0, + suffix: str = "_b", + nb_cpu: int = 1, + apply_strand_suffix: None = None, + preserve_order: bool = False, + ) -> "PyRanges": """Join PyRanges on genomic location. 
Parameters @@ -2401,7 +2374,7 @@ def join( from pyranges.methods.join import _write_both - kwargs = { + kwargs: Dict[str, Any] = { "strandedness": strandedness, "how": how, "report_overlap": report_overlap, @@ -2451,7 +2424,7 @@ def join( return gr - def keys(self): + def keys(self) -> Union[List[str], List[Tuple[str, str]]]: """Return the keys. Returns @@ -2480,16 +2453,16 @@ def keys(self): def k_nearest( self, - other, - k=1, - ties=None, - strandedness=None, - overlap=True, - how=None, - suffix="_b", - nb_cpu=1, - apply_strand_suffix=None, - ): + other: "PyRanges", + k: Union[List[int], int] = 1, + ties: Optional[str] = None, + strandedness: None = None, + overlap: bool = True, + how: Optional[str] = None, + suffix: str = "_b", + nb_cpu: int = 1, + apply_strand_suffix: None = None, + ) -> "PyRanges": """Find k nearest intervals. Parameters @@ -2498,7 +2471,7 @@ def k_nearest( PyRanges to find nearest interval in. - k : int or list/array/Series of int + k : int or list/array/pd.Series of int Number of closest to return. If iterable, must be same length as PyRanges. 
@@ -2718,29 +2691,24 @@ def k_nearest( kwargs = fill_kwargs(kwargs) kwargs["stranded"] = self.stranded and other.stranded - overlap = kwargs.get("overlap", True) - ties = kwargs.get("ties", False) - - self = self.copy() + _self = self.copy() if isinstance(k, pd.Series): k = k.values # how many to nearest to find; might be different for each - self.__k__ = k + _self.__k__ = k # give each their own unique ID - self.__IX__ = np.arange(len(self)) + _self.__IX__ = np.arange(len(_self)) - dfs = pyrange_apply(_nearest, self, other, **kwargs) + dfs = pyrange_apply(_nearest, _self, other, **kwargs) nearest = PyRanges(dfs) if not overlap: result = nearest else: - from collections import defaultdict - - overlap_how = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")] - overlaps = self.join( + overlap_how = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")] # type: ignore + overlaps = _self.join( other, strandedness=strandedness, how=overlap_how, @@ -2805,7 +2773,7 @@ def k_nearest( result = result.drop(like="__IX__|__k__") - self = self.drop(like="__k__|__IX__") + _self = _self.drop(like="__k__|__IX__") def prev_to_neg(df, **kwargs): strand = df.Strand.iloc[0] if "Strand" in df else "+" @@ -2821,7 +2789,7 @@ def prev_to_neg(df, **kwargs): result = result.apply(prev_to_neg, suffix=kwargs["suffix"]) - if not self.stranded and other.stranded: + if not _self.stranded and other.stranded: if apply_strand_suffix is None: import sys @@ -2835,7 +2803,7 @@ def prev_to_neg(df, **kwargs): return result @property - def length(self): + def length(self) -> int: """Return the total length of the intervals. 
See Also @@ -2868,9 +2836,15 @@ def length(self): 5 """ - return int(self.lengths(as_dict=False).sum()) + lengths = self.lengths(as_dict=False) + assert isinstance(lengths, pd.Series) + length = lengths.sum() + assert isinstance(length, (np.int64, int)) + return int(length) - def lengths(self, as_dict=False): + def lengths( + self, as_dict: bool = False + ) -> Union[pd.Series, Dict[Tuple[str, str], pd.Series], Dict[str, pd.Series]]: """Return the length of each interval. Parameters @@ -2878,11 +2852,11 @@ def lengths(self, as_dict=False): as_dict : bool, default False - Whether to return lengths as Series or dict of Series per key. + Whether to return lengths as pd.Series or dict of pd.Series per key. Returns ------- - Series or dict of Series with the lengths of each interval. + pd.Series or dict of pd.Series with the lengths of each interval. See Also -------- @@ -2928,24 +2902,19 @@ def lengths(self, as_dict=False): """ if as_dict: - if not len(self): - return {} - lengths = {} - for k, df in self.items(): - lengths[k] = df.End - df.Start - - return lengths + return {k: df.End - df.Start for k, df in self.items()} # type: ignore else: - _lengths = [] + _lengths: List[pd.Series] = [] if not len(self): - return np.array(_lengths, dtype=int) + return pd.Series([], dtype=np.int64) for _, df in self: - lengths = df.End - df.Start - _lengths.append(lengths) + _lengths.append(df.End - df.Start) - return pd.concat(_lengths).reset_index(drop=True) + ls = pd.concat(_lengths).reset_index(drop=True) + assert isinstance(ls, pd.Series) + return ls - def max_disjoint(self, strand=None, slack=0, **kwargs): + def max_disjoint(self, strand: Optional[bool] = None, slack: int = 0, **kwargs) -> "PyRanges": """Find the maximal disjoint set of intervals. 
Parameters @@ -3003,7 +2972,14 @@ def max_disjoint(self, strand=None, slack=0, **kwargs): return pr.PyRanges(df) - def merge(self, strand=None, count=False, count_col="Count", by=None, slack=0): + def merge( + self, + strand: Optional[bool] = None, + count: bool = False, + count_col: str = "Count", + by: Optional[Union[List[str], str]] = None, + slack: int = 0, + ) -> "PyRanges": """Merge overlapping intervals into one. Parameters @@ -3125,7 +3101,7 @@ def merge(self, strand=None, count=False, count_col="Count", by=None, slack=0): if strand is None: strand = self.stranded - kwargs = { + kwargs: Dict[str, Any] = { "strand": strand, "count": count, "by": by, @@ -3146,7 +3122,7 @@ def merge(self, strand=None, count=False, count_col="Count", by=None, slack=0): return PyRanges(df) - def mp(self, n=8, formatting=None): + def mp(self, n: int = 8, formatting: None = None) -> None: """Merge location and print. See Also @@ -3192,14 +3168,14 @@ def mspc(self, n=30, formatting=None): def nearest( self, - other, - strandedness=None, - overlap=True, - how=None, - suffix="_b", - nb_cpu=1, - apply_strand_suffix=None, - ): + other: "PyRanges", + strandedness: None = None, + overlap: bool = True, + how: Optional[str] = None, + suffix: str = "_b", + nb_cpu: int = 1, + apply_strand_suffix: None = None, + ) -> "PyRanges": """Find closest interval. Parameters @@ -3339,7 +3315,7 @@ def nearest( return gr - def new_position(self, new_pos, columns=None): + def new_position(self, new_pos: str, columns: Optional[Tuple[str, str, str, str]] = None) -> "PyRanges": """Give new position. The operation join produces a PyRanges with two pairs of start coordinates and two pairs of @@ -3351,9 +3327,9 @@ def new_position(self, new_pos, columns=None): Change of coordinates. - columns : tuple of str, default None, i.e. auto + columns : Optional[tuple of str], default None, i.e. auto - The name of the coordinate columns. 
By default uses the two first columns containing + The name of the coordinate columns. By default, uses the two first columns containing "Start" and the two first columns containing "End". See Also @@ -3471,9 +3447,7 @@ def new_position(self, new_pos, columns=None): if self.empty: return self - kwargs = {"strand": None} - kwargs["sparse"] = {"self": False} - kwargs["new_pos"] = new_pos + kwargs: Dict[str, Any] = {"strand": None, "sparse": {"self": False}, "new_pos": new_pos} if columns is None: start1, start2 = self.columns[self.columns.str.contains("Start")][:2] @@ -3488,7 +3462,14 @@ def new_position(self, new_pos, columns=None): return pr.PyRanges(dfs) - def overlap(self, other, strandedness=None, how="first", invert=False, nb_cpu=1): + def overlap( + self, + other: "PyRanges", + strandedness: Optional[Union[bool, str]] = None, + how: Optional[str] = "first", + invert: bool = False, + nb_cpu: int = 1, + ) -> "PyRanges": """Return overlapping intervals. Returns the intervals in self which overlap with those in other. @@ -3605,10 +3586,13 @@ def overlap(self, other, strandedness=None, how="first", invert=False, nb_cpu=1) For printing, the PyRanges was sorted on Chromosome. 
""" - kwargs = {"strandedness": strandedness, "nb_cpu": nb_cpu} - kwargs["sparse"] = {"self": False, "other": True} - kwargs["how"] = how - kwargs["invert"] = invert + kwargs = { + "strandedness": strandedness, + "nb_cpu": nb_cpu, + "sparse": {"self": False, "other": True}, + "how": how, + "invert": invert, + } kwargs = fill_kwargs(kwargs) if len(self) == 0: @@ -3623,7 +3607,7 @@ def overlap(self, other, strandedness=None, how="first", invert=False, nb_cpu=1) if invert: found_idxs = getattr(result, "__ix__", []) - result = self[~self.__ix__.isin(found_idxs)] + result = self[~self.__ix__.isin(found_idxs)] # type: ignore result = result.drop("__ix__") return result @@ -3640,7 +3624,9 @@ def pc(self, n=8, formatting=None): return self - def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=False): + def print( + self, n: int = 8, merge_position: bool = False, sort: bool = False, formatting: Optional[Dict[str, str]] = None + ) -> None: """Print the PyRanges. Parameters @@ -3650,7 +3636,7 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa The number of rows to print. - merge_postion : bool, default False + merge_position : bool, default False Print location in same column to save screen space. @@ -3663,10 +3649,6 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa Formatting options per column. - chain : False - - Return the PyRanges. Useful to print intermediate results in call chains. 
- See Also -------- @@ -3677,7 +3659,7 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa PyRanges.mpc : merge print chain PyRanges.msp : merge sort print PyRanges.mspc : merge sort print chain - PyRanges.rp : raw print dictionary of DataFrames + PyRanges.rp : raw print dictionary of pd.DataFrames Examples -------- @@ -3796,11 +3778,8 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa print(s) - if chain: - return self - def rp(self): - """Print dict of DataFrames. + """Print dict of pd.DataFrames. See Also -------- @@ -3810,7 +3789,7 @@ def rp(self): print(self.dfs) def rpc(self): - """Print dict of DataFrames and return self. + """Print dict of pd.DataFrames and return self. See Also -------- @@ -3821,7 +3800,7 @@ def rpc(self): return self - def sample(self, n=8, replace=False): + def sample(self, n: int = 8, replace: bool = False) -> "PyRanges": """Subsample arbitrary rows of PyRanges. If n is larger than length of PyRanges, replace must be True. @@ -3863,7 +3842,14 @@ def sample(self, n=8, replace=False): subsetter[sample] = True return self[subsetter] - def set_intersect(self, other, strandedness=None, how=None, new_pos=False, nb_cpu=1): + def set_intersect( + self, + other: "PyRanges", + strandedness: None = None, + how: Optional[str] = None, + new_pos: bool = False, + nb_cpu: int = 1, + ) -> "PyRanges": """Return set-theoretical intersection. Like intersect, but both PyRanges are merged first. @@ -3981,7 +3967,7 @@ def set_intersect(self, other, strandedness=None, how=None, new_pos=False, nb_cp return PyRanges(dfs) - def set_union(self, other, strandedness=None, nb_cpu=1): + def set_union(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges": """Return set-theoretical union. 
Parameters @@ -4073,7 +4059,7 @@ def set_union(self, other, strandedness=None, nb_cpu=1): return gr - def sort(self, by=None, nb_cpu=1): + def sort(self, by: Optional[str] = None, nb_cpu: int = 1) -> "PyRanges": """Sort by position or columns. Parameters @@ -4092,7 +4078,7 @@ def sort(self, by=None, nb_cpu=1): Note ---- - Since a PyRanges contains multiple DataFrames, the sorting only happens within dataframes. + Since a PyRanges contains multiple pd.DataFrames, the sorting only happens within dataframes. Returns ------- @@ -4172,8 +4158,7 @@ def sort(self, by=None, nb_cpu=1): from pyranges.methods.sort import _sort - kwargs = {"strand": self.stranded} - kwargs["sparse"] = {"self": False} + kwargs = {"strand": self.stranded, "sparse": {"self": False}} if by: assert "5" not in by or ( ((type(by) is str and by == "5") or (type(by) is not str and "5" in by)) and self.stranded @@ -4209,7 +4194,14 @@ def slack(self, slack): """Deprecated: this function has been moved to Pyranges.extend""" return self.extend(slack) - def spliced_subsequence(self, start=0, end=None, by=None, strand=None, **kwargs): + def spliced_subsequence( + self, + start: int = 0, + end: Optional[int] = None, + by: Optional[str] = None, + strand: Optional[bool] = None, + **kwargs + ) -> "PyRanges": """Get subsequences of the intervals, using coordinates mapping to spliced transcripts (without introns) The returned intervals are subregions of self, cut according to specifications. @@ -4352,12 +4344,12 @@ def spliced_subsequence(self, start=0, end=None, by=None, strand=None, **kwargs) return pr.PyRanges(result) - def split(self, strand=None, between=False, nb_cpu=1): + def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRanges": """Split into non-overlapping intervals. Parameters ---------- - strand : bool, default None, i.e. auto + strand : Optional[bool], default None, i.e. auto Whether to ignore strand information if PyRanges is stranded. 
@@ -4365,11 +4357,6 @@ def split(self, strand=None, between=False, nb_cpu=1): Include lengths between intervals. - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - Returns ------- PyRanges @@ -4470,13 +4457,13 @@ def split(self, strand=None, between=False, nb_cpu=1): split = pr.PyRanges(df) if not between: - strandedness = "same" if strand else False + strandedness: Union[str, bool] = "same" if strand else False split = split.overlap(self, strandedness=strandedness) return split @property - def stranded(self): + def stranded(self) -> bool: """Whether PyRanges has (valid) strand info. Note @@ -4524,7 +4511,7 @@ def stranded(self): return isinstance(key, tuple) @property - def strands(self): + def strands(self) -> List[Union[Any, str]]: """Return strands. Notes @@ -4570,13 +4557,13 @@ def strands(self): return natsorted(set([k[1] for k in self.keys()])) - def subset(self, f, strand=None, **kwargs): + def subset(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRanges": """Return a subset of the rows. Parameters ---------- f : function - Function which returns boolean Series equal to length of df. + Function which returns boolean pd.Series equal to length of df. strand : bool, default None, i.e. auto @@ -4594,7 +4581,7 @@ def subset(self, f, strand=None, **kwargs): Notes ----- - PyRanges can also be subsetted directly with a boolean Series. This function is slightly + PyRanges can also be subsetted directly with a boolean pd.Series. This function is slightly faster, but more cumbersome. 
Returns @@ -4667,7 +4654,14 @@ def subset(self, f, strand=None, **kwargs): return self[result] - def subsequence(self, start=0, end=None, by=None, strand=None, **kwargs): + def subsequence( + self, + start: int = 0, + end: Optional[int] = None, + by: Optional[str] = None, + strand: Optional[bool] = None, + **kwargs + ) -> "PyRanges": """Get subsequences of the intervals. The returned intervals are subregions of self, cut according to specifications. @@ -4800,7 +4794,7 @@ def subsequence(self, start=0, end=None, by=None, strand=None, **kwargs): return pr.PyRanges(result) - def subtract(self, other, strandedness=None, nb_cpu=1): + def subtract(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges": """Subtract intervals. Parameters @@ -4864,8 +4858,7 @@ def subtract(self, other, strandedness=None, nb_cpu=1): from pyranges.methods.subtraction import _subtraction - kwargs = {"strandedness": strandedness} - kwargs["sparse"] = {"self": False, "other": True} + kwargs = {"strandedness": strandedness, "sparse": {"self": False, "other": True}} kwargs = fill_kwargs(kwargs) strand = True if strandedness else False @@ -4879,7 +4872,7 @@ def subtract(self, other, strandedness=None, nb_cpu=1): return PyRanges(result).drop("__num__") - def summary(self, to_stdout=True, return_df=False): + def summary(self, to_stdout: bool = True, return_df: bool = False) -> Optional[pd.DataFrame]: """Return info. Count refers to the number of intervals, the rest to the lengths. @@ -4903,7 +4896,7 @@ def summary(self, to_stdout=True, return_df=False): Returns ------- - None or DataFrame with summary. + None or pd.DataFrame with summary. Examples @@ -4960,7 +4953,7 @@ def summary(self, to_stdout=True, return_df=False): return _summary(self, to_stdout, return_df) - def tail(self, n=8): + def tail(self, n: int = 8) -> "PyRanges": """Return the n last rows. 
Parameters @@ -5021,7 +5014,7 @@ def tail(self, n=8): subsetter[(len(self) - n) :] = True return self[subsetter] - def tile(self, tile_size, overlap=False, strand=None, nb_cpu=1): + def tile(self, tile_size: int, overlap: bool = False, strand: Optional[bool] = None, nb_cpu: int = 1) -> "PyRanges": """Return overlapping genomic tiles. The genome is divided into bookended tiles of length `tile_size` and one is returned per @@ -5124,15 +5117,13 @@ def tile(self, tile_size, overlap=False, strand=None, nb_cpu=1): if strand is None: strand = self.stranded - kwargs = {"strand": strand, "overlap": overlap} - kwargs["sparse"] = {"self": False} - kwargs["tile_size"] = tile_size + kwargs = {"strand": strand, "overlap": overlap, "sparse": {"self": False}, "tile_size": tile_size} df = pyrange_apply_single(_tiles, self, **kwargs) return PyRanges(df) - def to_example(self, n=10): + def to_example(self, n: int = 10) -> Dict[str, List[Union[int, str]]]: """Return as dict. Used for easily creating examples for copy and pasting. @@ -5199,7 +5190,7 @@ def to_example(self, n=10): return d - def three_end(self): + def three_end(self) -> "PyRanges": """Return the 3'-end. The 3'-end is the start of intervals on the reverse strand and the end of intervals on the @@ -5302,7 +5293,9 @@ def three_end(self): # >>> # """ - def to_bed(self, path=None, keep=True, compression="infer", chain=False): + def to_bed( + self, path: Optional[str] = None, keep: bool = True, compression: str = "infer", chain: bool = False + ) -> Union[str, "PyRanges"]: r"""Write to bed. 
Parameters @@ -5379,14 +5372,14 @@ def to_bed(self, path=None, keep=True, compression="infer", chain=False): def to_bigwig( self, - path=None, - chromosome_sizes=None, - rpm=True, - divide=None, - value_col=None, - dryrun=False, - chain=False, - ): + path: None = None, + chromosome_sizes: None = None, + rpm: bool = True, + divide: Optional[bool] = None, + value_col: Optional[str] = None, + dryrun: bool = False, + chain: bool = False, + ) -> Optional["PyRanges"]: """Write regular or value coverage to bigwig. Note @@ -5433,7 +5426,7 @@ def to_bigwig( See Also -------- - pyranges.to_bigwig : write pandas DataFrame to bigwig. + pyranges.to_bigwig : write pandas pd.DataFrame to bigwig. Examples -------- @@ -5511,9 +5504,11 @@ def to_bigwig( if chain: return self else: - pass + return None - def to_csv(self, path=None, sep=",", header=True, compression="infer", chain=False): + def to_csv( + self, path: Optional["Path"] = None, sep: str = ",", header: bool = True, compression: str = "infer" + ) -> Union[str, "PyRanges"]: r"""Write to comma- or other value-separated file. Parameters @@ -5534,10 +5529,6 @@ def to_csv(self, path=None, sep=",", header=True, compression="infer", chain=Fal Which compression to use. Uses file extension to infer by default. - chain: bool, default False - - Whether to return the PyRanges after writing. - Note ---- @@ -5561,13 +5552,15 @@ def to_csv(self, path=None, sep=",", header=True, compression="infer", chain=Fal from pyranges.out import _to_csv - result = _to_csv(self, path, sep=sep, header=header, compression=compression) - if path and chain: - return self - else: - return result + return _to_csv(self, path, sep=sep, header=header, compression=compression) - def to_gff3(self, path=None, compression="infer", chain=False, map_cols=None): + def to_gff3( + self, + path: None = None, + compression: str = "infer", + chain: bool = False, + map_cols: Optional[Dict[str, str]] = None, + ) -> str: """Write to General Feature Format 3. 
The GFF format consists of a tab-separated file without header. @@ -5679,7 +5672,13 @@ def to_gff3(self, path=None, compression="infer", chain=False, map_cols=None): else: return result - def to_gtf(self, path=None, compression="infer", chain=False, map_cols=None): + def to_gtf( + self, + path: None = None, + compression: str = "infer", + chain: bool = False, + map_cols: Optional[Dict[str, str]] = None, + ) -> str: """Write to Gene Transfer Format. The GTF format consists of a tab-separated file without header. @@ -5770,7 +5769,9 @@ def to_gtf(self, path=None, compression="infer", chain=False, map_cols=None): else: return result - def to_rle(self, value_col=None, strand=None, rpm=False, nb_cpu=1): + def to_rle( + self, value_col: Optional[str] = None, strand: Optional[bool] = None, rpm: bool = False, nb_cpu: int = 1 + ) -> "RleDict": """Return as RleDict. Create collection of Rles representing the coverage or other numerical value. @@ -5880,7 +5881,7 @@ def to_rle(self, value_col=None, strand=None, rpm=False, nb_cpu=1): return _to_rle(self, value_col, strand=strand, rpm=rpm, nb_cpu=nb_cpu) - def unstrand(self): + def unstrand(self) -> "PyRanges": """Remove strand. Note @@ -5933,12 +5934,12 @@ def unstrand(self): return pr.PyRanges(gr.dfs) - def values(self): - """Return the underlying DataFrames.""" + def values(self) -> List[pd.DataFrame]: + """Return the underlying pd.DataFrames.""" return [df for k, df in self.items() if not df.empty] - def window(self, window_size, strand=None): + def window(self, window_size: int, strand: Optional[bool] = None) -> "PyRanges": """Return overlapping genomic windows. Windows of length `window_size` are returned. @@ -6057,3 +6058,30 @@ def __getstate__(self): def __setstate__(self, d): self.__dict__["dfs"] = d + + @staticmethod + def _zip_locationkey_and_data(keys: Iterable, dfs: Iterable[pd.DataFrame], strand: bool) -> "PyRanges": + """Zip keys and data into a PyRanges object. 
+ + Helper method because MyPy has difficulty seeing that PyRanges keys are + either list[str] or list[tuple[str, str]]. It considers them to be list[Union[str, tuple[str, str]]] + which results in typecheck errors. + """ + if strand: + for k in keys: + assert isinstance(k, tuple) + return pr.PyRanges(dict(zip(keys, dfs))) + else: + for k in keys: + assert isinstance(k, str) + return pr.PyRanges(dict(zip(keys, dfs))) + + +def _test(): + import doctest + + doctest.testmod() + + +if __name__ == "__main__": + _test() From c5045856ba6a1026803b662a7c0974f8946ee0ae Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Sun, 21 May 2023 16:55:41 +0200 Subject: [PATCH 03/10] Add types to init --- pyranges/__init__.py | 57 +++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/pyranges/__init__.py b/pyranges/__init__.py index 5bf17528..a026829d 100644 --- a/pyranges/__init__.py +++ b/pyranges/__init__.py @@ -1,7 +1,10 @@ from __future__ import print_function +import itertools import sys from collections import defaultdict +from pathlib import Path +from typing import Dict, Iterable, Optional, Set, Tuple, Union import numpy as np import pandas as pd @@ -27,8 +30,10 @@ read_gff = read_gtf +Chromsizes = Union[Dict[str, int], Dict[Tuple[str, str], int]] -def from_dict(d): + +def from_dict(d: Dict[str, Iterable]) -> PyRanges: """Create a PyRanges from dict. Parameters @@ -68,7 +73,7 @@ def from_dict(d): return PyRanges(pd.DataFrame(d)) -def from_string(s): +def from_string(s: str) -> PyRanges: """Create a PyRanges from multiline string. Parameters @@ -115,7 +120,7 @@ def from_string(s): return PyRanges(df) -def itergrs(prs, strand=None, keys=False): +def itergrs(prs: Iterable[PyRanges], strand=None, keys=False): r"""Iterate over multiple PyRanges at once. 
Parameters @@ -209,14 +214,12 @@ def itergrs(prs, strand=None, keys=False): prs = [gr.unstrand() for gr in prs] grs_per_chromosome = defaultdict(list) - set_keys = set() - for gr in prs: - set_keys.update(gr.dfs.keys()) + set_keys: Union[Set[str], Set[Tuple[str, str]]] = set(itertools.chain.from_iterable(*[gr.dfs.keys() for gr in prs])) empty_dfs = [pd.DataFrame(columns=gr.columns) for gr in prs] for gr, empty in zip(prs, empty_dfs): for k in set_keys: - df = gr.dfs.get(k, empty) + df = gr.dfs.get(k, empty) # type: ignore grs_per_chromosome[k].append(df) if not keys: @@ -225,7 +228,13 @@ def itergrs(prs, strand=None, keys=False): return iter(natsorted(grs_per_chromosome.items())) -def random(n=1000, length=100, chromsizes=None, strand=True, seed=None): +def random( + n: int = 1000, + length: int = 100, + chromsizes: Optional[Chromsizes] = None, + strand: bool = True, + seed: Optional[int] = None, +): """Return PyRanges with random intervals. Parameters @@ -296,8 +305,7 @@ def random(n=1000, length=100, chromsizes=None, strand=True, seed=None): """ if chromsizes is None: - chromsizes = data.chromsizes() - df = chromsizes.df + df = data.chromsizes().df elif isinstance(chromsizes, dict): df = pd.DataFrame({"Chromosome": list(chromsizes.keys()), "End": list(chromsizes.values())}) else: @@ -307,7 +315,7 @@ def random(n=1000, length=100, chromsizes=None, strand=True, seed=None): n_per_chrom = pd.Series(np.random.choice(df.index, size=n, p=p)).value_counts(sort=False).to_frame() n_per_chrom.insert(1, "Chromosome", df.loc[n_per_chrom.index].Chromosome) - n_per_chrom.columns = "Count Chromosome".split() + n_per_chrom.columns = pd.Index("Count Chromosome".split()) random_dfs = [] for _, (count, chrom) in n_per_chrom.iterrows(): @@ -330,14 +338,17 @@ def random(n=1000, length=100, chromsizes=None, strand=True, seed=None): pyranges.statistics : statistcal methods for genomics.""" -def to_bigwig(gr, path, chromosome_sizes): +def to_bigwig(gr: PyRanges, path: Path, 
chromosome_sizes=Optional[Chromsizes]): """Write df to bigwig. Must contain the columns Chromosome, Start, End and Score. All others are ignored. Parameters ---------- - path : str + gr: PyRanges + Intervals to write. + + path : Path Where to write bigwig. @@ -492,7 +503,9 @@ def to_bigwig(gr, path, chromosome_sizes): assert ( len(gr.strands) <= 1 ), "Can only write one strand at a time. Use an unstranded PyRanges or subset on strand first." - assert np.sum(gr.lengths()) == gr.merge().length, "Intervals must not overlap." + lengths = gr.lengths() + assert isinstance(lengths, pd.Series) + assert np.sum(lengths) == gr.merge().length, "Intervals must not overlap." df = gr.df @@ -515,16 +528,16 @@ def to_bigwig(gr, path, chromosome_sizes): bw.addEntries(chromosomes, starts, ends=ends, values=values) -def version_info(): +def version_info() -> None: import importlib - def update_version_info(version_info, library): + def update_version_info(_version_info, library) -> None: if importlib.util.find_spec(library): version = importlib.import_module(library).__version__ else: version = "not installed" - version_info[library] = version + _version_info[library] = version version_info = { "pyranges version": pr.__version__, @@ -561,3 +574,13 @@ def update_version_info(version_info, library): "PyRanges", "version_info", ] + + +def _test(): + import doctest + + doctest.testmod() + + +if __name__ == "__main__": + _test() From 01313cfe0c2e1fbfd389be22c3ccb30364708c2e Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Sun, 21 May 2023 17:08:46 +0200 Subject: [PATCH 04/10] add types to tostring --- pyranges/__init__.py | 2 +- pyranges/tostring2.py | 50 +++++++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/pyranges/__init__.py b/pyranges/__init__.py index a026829d..a0ebe0cc 100644 --- a/pyranges/__init__.py +++ b/pyranges/__init__.py @@ -532,7 +532,7 @@ def version_info() -> None: import importlib def 
update_version_info(_version_info, library) -> None: - if importlib.util.find_spec(library): + if importlib.util.find_spec(library): # type: ignore version = importlib.import_module(library).__version__ else: version = "not installed" diff --git a/pyranges/tostring2.py b/pyranges/tostring2.py index de9d939a..bfdebc5a 100644 --- a/pyranges/tostring2.py +++ b/pyranges/tostring2.py @@ -1,29 +1,32 @@ import functools import os import shutil -from typing import Optional +from typing import Any, Dict, List, Optional, Tuple import natsort # type: ignore import pandas as pd +from pandas.core.frame import DataFrame + +from pyranges.pyranges_main import PyRanges sort_cols = "Start End".split() GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", False) -def _get_stranded_f(self, half_entries, f, sort=False): +def _get_stranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame: counter = 0 dfs = [] chromosomes = self.chromosomes if f == "tail": - chromosomes = reversed(chromosomes) + chromosomes = list(reversed(chromosomes)) default = pd.DataFrame(columns=self.columns) for chromosome in chromosomes: - plus = self.dfs.get((chromosome, "+"), default) - minus = self.dfs.get((chromosome, "-"), default) + plus = self.dfs.get((chromosome, "+"), default) # type: ignore + minus = self.dfs.get((chromosome, "-"), default) # type: ignore if sort: plus = plus.sort_values(sort_cols) @@ -54,18 +57,18 @@ def _get_stranded_f(self, half_entries, f, sort=False): return df -def _get_unstranded_f(self, half_entries, f, sort=False): +def _get_unstranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame: chromosomes = self.chromosomes if f == "tail": - chromosomes = reversed(chromosomes) + chromosomes = list(reversed(chromosomes)) default = pd.DataFrame(columns=self.columns) counter = 0 dfs = [] for chromosome in chromosomes: - cdf = self.dfs.get((chromosome), default) + cdf = self.dfs.get(chromosome, default) # type: ignore cdf = getattr(cdf, 
f)(half_entries) if sort: @@ -85,7 +88,7 @@ def _get_unstranded_f(self, half_entries, f, sort=False): return df -def _get_df(self, n, sort): +def _get_df(self: PyRanges, n: int, sort: bool) -> DataFrame: half_entries = int(n / 2) if len(self) <= n: @@ -110,7 +113,7 @@ def _get_df(self, n, sort): return df -def show_pos_merge_position(df): +def show_pos_merge_position(df: DataFrame) -> DataFrame: # all_dots = df.Start == "..." cols_to_drop = "Chromosome Start End".split() @@ -136,7 +139,7 @@ def show_pos_merge_position(df): return df -def get_columns_dtypes(self): +def get_columns_dtypes(self: PyRanges) -> Dict[str, str]: _df = next(iter(self.dfs.values())) dtypes = [ str(d) @@ -149,7 +152,7 @@ def get_columns_dtypes(self): return {c: d for c, d in zip(columns, dtypes)} -def build_header(columns_dtypes): +def build_header(columns_dtypes: Dict[str, str]) -> List[str]: header = [] for c, d in columns_dtypes.items(): cd = "".join([str(c), "\n(", d, ")"]) @@ -166,7 +169,9 @@ def add_hidden_col_dotdot(df, n_hidden_cols): return df -def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int] = None): +def _grow_string_representation( + df: DataFrame, columns_dtypes: Dict[str, str], terminal_width: Optional[int] = None +) -> Tuple[str, List[str]]: from tabulate import tabulate _terminal_width = shutil.get_terminal_size().columns if terminal_width is None else terminal_width @@ -174,7 +179,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int if len(columns_dtypes) < 15: header = build_header(columns_dtypes) - str_repr = tabulate(df, headers=header, tablefmt="psql", showindex=False) + str_repr = tabulate(df, headers=header, tablefmt="psql", showindex=False) # type: ignore table_width = len(str_repr.split("\n", 1)[0]) @@ -183,10 +188,11 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int header = build_header({k: columns_dtypes[k] for k in columns_dtypes}) original_header = 
list(columns_dtypes) - df.columns = header + df.columns = pd.Index(header) # know that any pyrange will have at least three columns build_df = df.get(list(df.columns[:3])) + assert isinstance(build_df, DataFrame) total_columns = len(df.columns) @@ -222,7 +228,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int ) -def untraditional_strand_info(self, str_repr_width): +def untraditional_strand_info(self: PyRanges, str_repr_width: int) -> str: _ustr = "" if "Strand" in self.columns and not self.stranded: strands = [] @@ -249,7 +255,7 @@ def untraditional_strand_info(self, str_repr_width): return _ustr -def hidden_columns_info(hidden_columns, str_repr_width): +def hidden_columns_info(hidden_columns: List[Any], str_repr_width: int) -> str: n_hidden_cols = len(hidden_columns) _hstr = "" if n_hidden_cols: @@ -268,7 +274,7 @@ def hidden_columns_info(hidden_columns, str_repr_width): return _hstr -def add_text_to_str_repr(self, str_repr, hidden_columns, sort): +def add_text_to_str_repr(self: PyRanges, str_repr: str, hidden_columns: List[Any], sort: bool) -> str: n_intervals = len(self) n_chromosomes = len(self.chromosomes) @@ -297,7 +303,13 @@ def add_text_to_str_repr(self, str_repr, hidden_columns, sort): return str_repr -def tostring(self, n=8, merge_position=False, formatting=None, sort=False): +def tostring( + self: PyRanges, + n: int = 8, + merge_position: bool = False, + formatting: Optional[Dict[str, str]] = None, + sort: bool = False, +) -> str: if len(self) == 0: return "Empty PyRanges" From e652433421c5e1ed49cc5cd577da569672623d8f Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Sun, 21 May 2023 17:39:52 +0200 Subject: [PATCH 05/10] Types to multithreaded --- pyranges/multithreaded.py | 154 +++++++++++--------------------------- pyranges/tostring2.py | 19 ++--- 2 files changed, 53 insertions(+), 120 deletions(-) diff --git a/pyranges/multithreaded.py b/pyranges/multithreaded.py index 7505ea60..4a7fb3e0 100644 --- 
a/pyranges/multithreaded.py +++ b/pyranges/multithreaded.py @@ -1,35 +1,24 @@ import os +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Union import numpy as np import pandas as pd from natsort import natsorted # type: ignore +from pandas.core.frame import DataFrame -import pyranges as pr +if TYPE_CHECKING: + from pyranges.pyranges_main import PyRanges ray = None -def get_n_args(f): +def get_n_args(f: Callable) -> int: import inspect nparams = len(inspect.signature(f).parameters) return nparams -def call_f(f, nparams, df, odf, kwargs): - if nparams == 3: - return f.remote(df, odf, **kwargs) - else: - return f.remote(df, odf) - - -def call_f_single(f, nparams, df, **kwargs): - if nparams == 2: - return f.remote(df, **kwargs) - else: - return f.remote(df) - - class suppress_stdout_stderr(object): """ A context manager for doing a "deep suppression" of stdout and stderr in @@ -61,20 +50,20 @@ def __exit__(self, *_): os.close(self.null_fds[1]) -def merge_dfs(df1, df2): +def merge_dfs(df1: DataFrame, df2: DataFrame) -> DataFrame: if not df1.empty and not df2.empty: return pd.concat([df1, df2], sort=False).reset_index(drop=True) elif df1.empty and df2.empty: # can this happen? 
- return None + return pd.DataFrame() elif df1.empty: return df2 else: return df1 -def process_results(results, keys): +def process_results(results: List[Any], keys: Union[List[str], List[Tuple[str, str]]]) -> dict: results_dict = {k: r for k, r in zip(keys, results) if r is not None} try: @@ -103,7 +92,7 @@ def process_results(results, keys): return results_dict -def make_sparse(df): +def make_sparse(df: DataFrame) -> DataFrame: if "Strand" in df: cols = "Chromosome Start End Strand".split() else: @@ -112,7 +101,7 @@ def make_sparse(df): return df[cols] -def make_binary_sparse(kwargs, df, odf): +def make_binary_sparse(kwargs: Dict[str, Any], df: DataFrame, odf: DataFrame) -> Tuple[DataFrame, DataFrame]: sparse = kwargs.get("sparse") if not sparse: @@ -127,13 +116,10 @@ def make_binary_sparse(kwargs, df, odf): return df, odf -def make_unary_sparse(kwargs, df): - sparse = kwargs.get("sparse").get("self") - - if sparse: - df = make_sparse(df) +def make_unary_sparse(kwargs: Dict[str, Any], df: DataFrame) -> DataFrame: + sparse = kwargs.get("sparse", {}).get("self") - return df + return make_sparse(df) if sparse else df def ray_initialized(): @@ -157,40 +143,9 @@ def test_function(): raise e -def get_multithreaded_funcs(function, nb_cpu): - if nb_cpu > 1: - import ray # type: ignore - - _merge_dfs = ray.remote(merge_dfs) - get = ray.get - function = ray.remote(function) - else: - - def _merge_dfs(): - return "dummy value" - - _merge_dfs.remote = merge_dfs - - def get(x): - return x - - function.remote = function - - return function, get, _merge_dfs - - -def pyrange_apply(function, self, other, **kwargs): - nparams = get_n_args(function) - nb_cpu = kwargs.get("nb_cpu", 1) - - if nb_cpu > 1: - import ray # type: ignore - - with suppress_stdout_stderr(): - ray.init(num_cpus=nb_cpu, ignore_reinit_error=True) - - function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu) - +def pyrange_apply( + function: Callable, self: "PyRanges", other: "PyRanges", 
**kwargs +) -> Union[Dict[Tuple[str, str], Any], Dict[str, Any]]: strandedness = kwargs["strandedness"] other_strand = {"+": "-", "-": "+"} @@ -228,7 +183,7 @@ def pyrange_apply(function, self, other, **kwargs): odf = other[c, os].values()[0] df, odf = make_binary_sparse(kwargs, df, odf) - result = call_f(function, nparams, df, odf, kwargs) + result = function(df, odf, **kwargs) results.append(result) @@ -241,7 +196,7 @@ def pyrange_apply(function, self, other, **kwargs): odf = other_dfs[c] df, odf = make_binary_sparse(kwargs, df, odf) - result = call_f(function, nparams, df, odf, kwargs) + result = function(df, odf, **kwargs) results.append(result) elif not self.stranded and other.stranded: @@ -249,28 +204,28 @@ def pyrange_apply(function, self, other, **kwargs): if c not in other_chromosomes: odf = dummy else: - odf1 = other_dfs.get((c, "+"), dummy) - odf2 = other_dfs.get((c, "-"), dummy) + odf1 = other_dfs.get((c, "+"), dummy) # type: ignore + odf2 = other_dfs.get((c, "-"), dummy) # type: ignore - odf = _merge_dfs.remote(odf1, odf2) + odf = merge_dfs(odf1, odf2) df, odf = make_binary_sparse(kwargs, df, odf) - result = call_f(function, nparams, df, odf, kwargs) + result = function(df, odf, **kwargs) results.append(result) elif self.stranded and other.stranded: - for (c, s), df in self.items(): + for (c, s), df in self.items(): # type: ignore if c not in other_chromosomes: - odfs = pr.PyRanges(dummy) + odfs = [dummy] else: - odfp = other_dfs.get((c, "+"), dummy) - odfm = other_dfs.get((c, "-"), dummy) + odfp = other_dfs.get((c, "+"), dummy) # type: ignore + odfm = other_dfs.get((c, "-"), dummy) # type: ignore odfs = [odfp, odfm] if len(odfs) == 2: - odf = _merge_dfs.remote(*odfs) + odf = merge_dfs(*odfs) elif len(odfs) == 1: odf = odfs[0] else: @@ -278,7 +233,7 @@ def pyrange_apply(function, self, other, **kwargs): df, odf = make_binary_sparse(kwargs, df, odf) - result = call_f(function, nparams, df, odf, kwargs) + result = function(df, odf, **kwargs) 
results.append(result) else: @@ -290,62 +245,46 @@ def pyrange_apply(function, self, other, **kwargs): df, odf = make_binary_sparse(kwargs, df, odf) - result = call_f(function, nparams, df, odf, kwargs) + result = function(df, odf, **kwargs) results.append(result) - results = get(results) - - results = process_results(results, keys) - - if nb_cpu > 1: - ray.shutdown() + return process_results(results, keys) - return results - -def pyrange_apply_single(function, self, **kwargs): - nparams = get_n_args(function) - nb_cpu = kwargs.get("nb_cpu", 1) +def pyrange_apply_single(function: Callable, self: "PyRanges", **kwargs) -> Any: strand = kwargs["strand"] - if nb_cpu > 1: - import ray # type: ignore - - with suppress_stdout_stderr(): - ray.init(num_cpus=nb_cpu, ignore_reinit_error=True) - - function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu) - if strand: assert self.stranded, "Can only do stranded operation when PyRange contains strand info" results = [] + keys: Union[List[str], List[Tuple[str, str]]] = [] # type: ignore if strand: - for (c, s), df in self.items(): + for (c, s), df in self.items(): # type: ignore kwargs["chromosome"] = c _strand = s kwargs["strand"] = _strand df = make_unary_sparse(kwargs, df) - result = call_f_single(function, nparams, df, **kwargs) + result = function(df, **kwargs) results.append(result) keys = self.keys() elif not self.stranded: - keys = [] for c, df in self.items(): kwargs["chromosome"] = c + assert isinstance(c, str) df = make_unary_sparse(kwargs, df) - result = call_f_single(function, nparams, df, **kwargs) + result = function(df, **kwargs) results.append(result) keys.append(c) else: - keys = [] for c in self.chromosomes: + assert isinstance(c, str) kwargs["chromosome"] = c dfs = self[c] @@ -353,23 +292,16 @@ def pyrange_apply_single(function, self, **kwargs): if len(dfs.keys()) == 2: df, df2 = dfs.values() # merge strands - df = _merge_dfs.remote(df, df2) + df = merge_dfs(df, df2) else: df = 
dfs.values()[0] df = make_unary_sparse(kwargs, df) - result = call_f_single(function, nparams, df, **kwargs) + result = function(df, **kwargs) results.append(result) keys.append(c) - results = get(results) - - if nb_cpu > 1: - ray.shutdown() - - results = process_results(results, keys) - - return results + return process_results(results, keys) def _lengths(df): @@ -378,7 +310,7 @@ def _lengths(df): return lengths -def _tss(df, **kwargs): +def _tss(df: DataFrame, **kwargs) -> DataFrame: df = df.copy(deep=True) dtype = df.dtypes["Start"] slack = kwargs.get("slack", 0) @@ -394,7 +326,7 @@ def _tss(df, **kwargs): return df -def _tes(df, **kwargs): +def _tes(df: DataFrame, **kwargs) -> DataFrame: df = df.copy(deep=True) dtype = df.dtypes["Start"] slack = kwargs.get("slack", 0) @@ -410,7 +342,7 @@ def _tes(df, **kwargs): return df -def _extend(df, **kwargs): +def _extend(df: DataFrame, **kwargs) -> DataFrame: df = df.copy() dtype = df.Start.dtype slack = kwargs["ext"] diff --git a/pyranges/tostring2.py b/pyranges/tostring2.py index bfdebc5a..72bed06f 100644 --- a/pyranges/tostring2.py +++ b/pyranges/tostring2.py @@ -1,20 +1,21 @@ import functools import os import shutil -from typing import Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import natsort # type: ignore import pandas as pd from pandas.core.frame import DataFrame -from pyranges.pyranges_main import PyRanges +if TYPE_CHECKING: + from pyranges.pyranges_main import PyRanges sort_cols = "Start End".split() GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", False) -def _get_stranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame: +def _get_stranded_f(self: "PyRanges", half_entries: int, f: str, sort: bool = False) -> DataFrame: counter = 0 dfs = [] @@ -57,7 +58,7 @@ def _get_stranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = Fals return df -def _get_unstranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = 
False) -> DataFrame: +def _get_unstranded_f(self: "PyRanges", half_entries: int, f: str, sort: bool = False) -> DataFrame: chromosomes = self.chromosomes if f == "tail": @@ -88,7 +89,7 @@ def _get_unstranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = Fa return df -def _get_df(self: PyRanges, n: int, sort: bool) -> DataFrame: +def _get_df(self: "PyRanges", n: int, sort: bool) -> DataFrame: half_entries = int(n / 2) if len(self) <= n: @@ -139,7 +140,7 @@ def show_pos_merge_position(df: DataFrame) -> DataFrame: return df -def get_columns_dtypes(self: PyRanges) -> Dict[str, str]: +def get_columns_dtypes(self: "PyRanges") -> Dict[str, str]: _df = next(iter(self.dfs.values())) dtypes = [ str(d) @@ -228,7 +229,7 @@ def _grow_string_representation( ) -def untraditional_strand_info(self: PyRanges, str_repr_width: int) -> str: +def untraditional_strand_info(self: "PyRanges", str_repr_width: int) -> str: _ustr = "" if "Strand" in self.columns and not self.stranded: strands = [] @@ -274,7 +275,7 @@ def hidden_columns_info(hidden_columns: List[Any], str_repr_width: int) -> str: return _hstr -def add_text_to_str_repr(self: PyRanges, str_repr: str, hidden_columns: List[Any], sort: bool) -> str: +def add_text_to_str_repr(self: "PyRanges", str_repr: str, hidden_columns: List[Any], sort: bool) -> str: n_intervals = len(self) n_chromosomes = len(self.chromosomes) @@ -304,7 +305,7 @@ def add_text_to_str_repr(self: PyRanges, str_repr: str, hidden_columns: List[Any def tostring( - self: PyRanges, + self: "PyRanges", n: int = 8, merge_position: bool = False, formatting: Optional[Dict[str, str]] = None, From 009abe88aaa54edc4444b79582ebeaa608643d95 Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Fri, 26 May 2023 09:44:38 +0200 Subject: [PATCH 06/10] Fix doctests --- hi | Bin 1890 -> 1880 bytes pyproject.toml | 2 +- pyranges/__init__.py | 79 ++- pyranges/_typing.py | 0 pyranges/genomicfeatures.py | 6 +- pyranges/methods/attr.py | 4 +- pyranges/methods/concat.py | 
34 +- pyranges/methods/getitem.py | 5 +- pyranges/methods/init.py | 100 +-- pyranges/multioverlap.py | 4 +- pyranges/multithreaded.py | 156 ++--- pyranges/pyranges_main.py | 629 ++++++------------- tests/property_based/hypothesis_helper.py | 8 +- tests/unit/df_dict_mismatch/test_mismatch.py | 17 +- tests/unit/join/test_join.py | 4 +- tests/unit/slack/test_slack.py | 2 +- tests/unit/test_count_overlaps.py | 2 +- tests/unit/test_genomicfeatures.py | 16 - 18 files changed, 356 insertions(+), 712 deletions(-) create mode 100644 pyranges/_typing.py diff --git a/hi b/hi index 669e467e2b12ef2392bbb5b01d1b6f356b1eb8c2..9c39f1a4d8ad11b16e9fd62e66f0a291f29c505f 100644 GIT binary patch delta 146 zcmaFFcY}|mfn}=iMizY*LFG=0.0.63", "tabulate", "sorted_nearest>=0.0.33", "natsort"] [project.optional-dependencies] -dev = ["black", "bumpver", "isort", "pip-tools", "pytest"] +dev = ["pyrle", "bamread", "bwread", "fisher"] [tool.setuptools.packages.find] where = ["."] diff --git a/pyranges/__init__.py b/pyranges/__init__.py index a0ebe0cc..4b89d8ac 100644 --- a/pyranges/__init__.py +++ b/pyranges/__init__.py @@ -4,7 +4,7 @@ import sys from collections import defaultdict from pathlib import Path -from typing import Dict, Iterable, Optional, Set, Tuple, Union +from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union import numpy as np import pandas as pd @@ -15,6 +15,7 @@ import pyranges.genomicfeatures as gf # NOQA: F401 from pyranges import data, statistics from pyranges.get_fasta import get_fasta, get_sequence, get_transcript_sequence +from pyranges.helpers import get_key_from_df, single_value_key from pyranges.methods.concat import concat from pyranges.multioverlap import count_overlaps from pyranges.pyranges_main import PyRanges @@ -33,6 +34,49 @@ Chromsizes = Union[Dict[str, int], Dict[Tuple[str, str], int]] +def from_args( + chromosomes: Union[Sequence[str], Sequence[int]], + starts: Sequence[int], + ends: Sequence[int], + strands: Optional[Union[str, 
Sequence[str]]] = None, +) -> "PyRanges": + if isinstance(chromosomes, str) or isinstance(chromosomes, int): + _chromosomes = pd.Series([chromosomes] * len(starts), dtype="category") + else: + _chromosomes = pd.Series(chromosomes, dtype="category") + + columns: List[pd.Series] = [_chromosomes, pd.Series(starts), pd.Series(ends)] + colnames = ["Chromosome", "Start", "End"] + if strands is not None: + if isinstance(strands, str): + _strands = pd.Series([strands] * len(starts), dtype="category") + else: + _strands = pd.Series(strands, dtype="category") + + columns.append(_strands) + colnames.append("Strand") + + lengths = list(str(len(s)) for s in columns) + assert len(set(lengths)) == 1, "[{colnames} must be of equal length. But are {columns}".format( + colnames=", ".join(colnames), columns=", ".join(lengths) + ) + + idx = range(len(starts)) + series_to_concat = [] + for s in columns: + if isinstance(s, pd.Series): + s = pd.Series(s.values, index=idx) + else: + s = pd.Series(s, index=idx) + + series_to_concat.append(s) + + df = pd.concat(series_to_concat, axis=1) + df.columns = pd.Index(colnames) + + return pr.PyRanges(df) + + def from_dict(d: Dict[str, Iterable]) -> PyRanges: """Create a PyRanges from dict. 
@@ -73,6 +117,36 @@ def from_dict(d: Dict[str, Iterable]) -> PyRanges: return PyRanges(pd.DataFrame(d)) +def from_dfs(dfs: Union[Dict[str, pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]) -> "PyRanges": + df: pd.DataFrame + empty_removed = {k: v.copy() for k, v in dfs.items() if not v.empty} + + _strand_valid = True + for key, df in empty_removed.items(): + _key = get_key_from_df(df) + if not single_value_key(df): + raise ValueError("All Chromosome/Strand vals in a df must be the same.") + _key_same = _key == key + + if isinstance(_key, tuple): + _strand_valid = _strand_valid and (_key[1] in ["+", "-"]) + + if _strand_valid and not _key_same: + raise ValueError(f"All keys must be the same, but df has {_key} and dict had {key}.") + + if not _strand_valid: + df = pd.concat(empty_removed.values()).reset_index(drop=True) + + groupby_cols = ["Chromosome"] + + empty_removed = {k[0]: v for k, v in df.groupby(groupby_cols)} # type: ignore + + gr = PyRanges() + gr.__dict__["dfs"] = empty_removed + + return gr # type: ignore + + def from_string(s: str) -> PyRanges: """Create a PyRanges from multiline string. 
@@ -214,7 +288,8 @@ def itergrs(prs: Iterable[PyRanges], strand=None, keys=False): prs = [gr.unstrand() for gr in prs] grs_per_chromosome = defaultdict(list) - set_keys: Union[Set[str], Set[Tuple[str, str]]] = set(itertools.chain.from_iterable(*[gr.dfs.keys() for gr in prs])) + keys = [gr.dfs.keys() for gr in prs] + set_keys: Union[Set[str], Set[Tuple[str, str]]] = set(itertools.chain.from_iterable(keys)) empty_dfs = [pd.DataFrame(columns=gr.columns) for gr in prs] for gr, empty in zip(prs, empty_dfs): diff --git a/pyranges/_typing.py b/pyranges/_typing.py new file mode 100644 index 00000000..e69de29b diff --git a/pyranges/genomicfeatures.py b/pyranges/genomicfeatures.py index 5c4fa328..26145a1b 100644 --- a/pyranges/genomicfeatures.py +++ b/pyranges/genomicfeatures.py @@ -189,7 +189,7 @@ def introns(self, by="gene", nb_cpu=1): >>> gr.features.introns(by="gene") +--------------+------------+-----------+-----------+--------------+-----------------+-----------------+ | Chromosome | Feature | Start | End | Strand | gene_id | transcript_id | - | (object) | (object) | (int64) | (int64) | (category) | (object) | (object) | + | (category) | (object) | (int64) | (int64) | (category) | (object) | (object) | |--------------+------------+-----------+-----------+--------------+-----------------+-----------------| | 1 | intron | 1173926 | 1174265 | + | ENSG00000162571 | nan | | 1 | intron | 1174321 | 1174423 | + | ENSG00000162571 | nan | @@ -207,7 +207,7 @@ def introns(self, by="gene", nb_cpu=1): >>> gr.features.introns(by="transcript") +--------------+------------+-----------+-----------+--------------+-----------------+-----------------+ | Chromosome | Feature | Start | End | Strand | gene_id | transcript_id | - | (object) | (object) | (int64) | (int64) | (category) | (object) | (object) | + | (category) | (object) | (int64) | (int64) | (category) | (object) | (object) | |--------------+------------+-----------+-----------+--------------+-----------------+-----------------| | 
1 | intron | 818202 | 818722 | + | ENSG00000177757 | ENST00000326734 | | 1 | intron | 960800 | 961292 | + | ENSG00000187961 | ENST00000338591 | @@ -241,7 +241,7 @@ def introns(self, by="gene", nb_cpu=1): result = pyrange_apply(_introns2, by_gr, exons, **kwargs) - return pr.PyRanges(result) + return pr.from_dfs(result) def _outside_bounds(df, **kwargs): diff --git a/pyranges/methods/attr.py b/pyranges/methods/attr.py index ca0f7bac..8ad77b7f 100644 --- a/pyranges/methods/attr.py +++ b/pyranges/methods/attr.py @@ -53,8 +53,8 @@ def _setattr(self, column_name, column, pos=False): if column_name not in ["Chromosome", "Strand"]: self.__dict__["dfs"] = dfs else: - # will merge the dfs, then split on keys again to ensure they are correct - self.__dict__["dfs"] = pr.PyRanges(pr.PyRanges(dfs).df).dfs + df = pd.concat(dfs.values()) + self.__dict__["dfs"] = pr.PyRanges(df).dfs def _getattr(self, name): diff --git a/pyranges/methods/concat.py b/pyranges/methods/concat.py index a427194c..12419976 100644 --- a/pyranges/methods/concat.py +++ b/pyranges/methods/concat.py @@ -1,5 +1,3 @@ -from collections import defaultdict - import pandas as pd import pyranges as pr @@ -9,8 +7,7 @@ def concat(pyranges, strand=None): if not pyranges: return None - pyranges = [pr for pr in pyranges if not pr.empty] - grs_per_chromosome = defaultdict(list) + pyranges = [gr for gr in pyranges if not gr.empty] strand_info = [gr.stranded for gr in pyranges] @@ -20,31 +17,4 @@ def concat(pyranges, strand=None): if strand: assert all([gr.stranded for gr in pyranges]), "Cannot do stranded concat, not all pyranges contain strand info." 
- for gr in pyranges: - for k, df in gr.dfs.items(): - # dbg(df) - grs_per_chromosome[k].append(df) - else: - for gr in pyranges: - for chromosome in gr.chromosomes: - df = gr[chromosome].df - grs_per_chromosome[chromosome].append(df) - - new_pyrange = {} - - for k, v in grs_per_chromosome.items(): - new_pyrange[k] = pd.concat(v, sort=False) - - res = pr.multithreaded.process_results(new_pyrange.values(), new_pyrange.keys()) - - if any(strand_info) and not all(strand_info): - new_res = {} - for k, v in res.items(): - v.loc[:, "Strand"] = v.Strand.cat.add_categories(["."]) - new_res[k] = v.assign(Strand=v.Strand.fillna(".")) - res = pr.PyRanges(new_res) - res.Strand = res.Strand - else: - res = pr.PyRanges(res) - - return res + return pr.PyRanges(pd.concat([gr.df for gr in pyranges])) diff --git a/pyranges/methods/getitem.py b/pyranges/methods/getitem.py index cf0041ce..bdcd01b4 100644 --- a/pyranges/methods/getitem.py +++ b/pyranges/methods/getitem.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from pyranges import PyRanges +import pyranges as pr from pyranges.methods.drop import _keep from pyranges.subset import get_booldict, get_slice, get_string, get_tuple @@ -32,5 +32,4 @@ def _getitem(self, val): else: raise Exception("Not a valid subsetter: {}".format(str(val))) - gr = PyRanges(dfs) - return gr + return pr.from_dfs(dfs) diff --git a/pyranges/methods/init.py b/pyranges/methods/init.py index c9a8b58f..6c00f31d 100644 --- a/pyranges/methods/init.py +++ b/pyranges/methods/init.py @@ -3,7 +3,6 @@ from pyranges import PyRanges from pyranges.genomicfeatures import GenomicFeaturesMethods -from pyranges.helpers import get_key_from_df, single_value_key from pyranges.statistics import StatisticsMethods @@ -45,44 +44,6 @@ def create_df_dict(df, stranded): return {k: v for k, v in df.groupby(grpby_key)} -def create_pyranges_df(chromosomes, starts, ends, strands=None): - if isinstance(chromosomes, str) or isinstance(chromosomes, int): - chromosomes = 
pd.Series([chromosomes] * len(starts), dtype="category") - - if strands is not None: - if isinstance(strands, str): - strands = pd.Series([strands] * len(starts), dtype="category") - - columns = [chromosomes, starts, ends, strands] - lengths = list(str(len(s)) for s in columns) - assert ( - len(set(lengths)) == 1 - ), "chromosomes, starts, ends and strands must be of equal length. But are {}".format(", ".join(lengths)) - colnames = "Chromosome Start End Strand".split() - else: - columns = [chromosomes, starts, ends] - lengths = list(str(len(s)) for s in columns) - assert len(set(lengths)) == 1, "chromosomes, starts and ends must be of equal length. But are {}".format( - ", ".join(lengths) - ) - colnames = "Chromosome Start End".split() - - idx = range(len(starts)) - series_to_concat = [] - for s in columns: - if isinstance(s, pd.Series): - s = pd.Series(s.values, index=idx) - else: - s = pd.Series(s, index=idx) - - series_to_concat.append(s) - - df = pd.concat(series_to_concat, axis=1) - df.columns = colnames - - return df - - def check_strandedness(df): """Check whether strand contains '.'""" @@ -102,68 +63,19 @@ def check_strandedness(df): return not contains_more_than_plus_minus_in_strand_col -def _init( - self, - df=None, - chromosomes=None, - starts=None, - ends=None, - strands=None, - copy_df=True, -): - # TODO: add categorize argument with dict of args to categorize? - +def _init(self, df: pd.DataFrame) -> None: if isinstance(df, PyRanges): raise Exception("Object is already a PyRange.") - if isinstance(df, pd.DataFrame): - assert all( - c in df for c in "Chromosome Start End".split() - ), "The dataframe does not have all the columns Chromosome, Start and End." 
- if copy_df: - df = df.copy() - - if df is False or df is None: - df = create_pyranges_df(chromosomes, starts, ends, strands) - - if isinstance(df, pd.DataFrame): - df = df.reset_index(drop=True) - - stranded = check_strandedness(df) - - df = set_dtypes(df) - - self.__dict__["dfs"] = create_df_dict(df, stranded) - - # df is actually dict of dfs - else: - empty_removed = {k: v.copy() for k, v in df.items() if not v.empty} - - _single_value_key = True - _key_same = True - _strand_valid = True - _has_strand = True - for key, df in empty_removed.items(): - _key = get_key_from_df(df) - _single_value_key = single_value_key(df) and _single_value_key - _key_same = (_key == key) and _key_same - - if isinstance(_key, tuple): - _strand_valid = _strand_valid and (_key[1] in ["+", "-"]) - else: - _has_strand = False + df = df.copy() - if not all([_single_value_key, _key_same, _strand_valid]): - df = pd.concat(empty_removed.values()).reset_index(drop=True) + df = df.reset_index(drop=True) - if _has_strand and _strand_valid: - empty_removed = df.groupby(["Chromosome", "Strand"]) - else: - empty_removed = df.groupby("Chromosome") + stranded = check_strandedness(df) - empty_removed = {k: v for (k, v) in empty_removed} + df = set_dtypes(df) - self.__dict__["dfs"] = empty_removed + self.__dict__["dfs"] = create_df_dict(df, stranded) self.__dict__["features"] = GenomicFeaturesMethods(self) self.__dict__["stats"] = StatisticsMethods(self) diff --git a/pyranges/multioverlap.py b/pyranges/multioverlap.py index 3c58d662..0dc5fa8d 100644 --- a/pyranges/multioverlap.py +++ b/pyranges/multioverlap.py @@ -91,7 +91,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1): >>> pr.count_overlaps(grs) +--------------+-----------+-----------+-----------+-----------+-----------+ | Chromosome | Start | End | a | b | c | - | (object) | (int64) | (int64) | (int64) | (int64) | (int64) | + | (category) | (int64) | (int64) | (int64) | (int64) | (int64) | 
|--------------+-----------+-----------+-----------+-----------+-----------| | chr1 | 6 | 8 | 1 | 0 | 0 | | chr1 | 8 | 10 | 1 | 0 | 1 | @@ -106,7 +106,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1): Unstranded PyRanges object has 12 rows and 6 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome. - >>> gr = pr.PyRanges(chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40]) + >>> gr = pr.from_args(chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40]) >>> gr +--------------+-----------+-----------+ | Chromosome | Start | End | diff --git a/pyranges/multithreaded.py b/pyranges/multithreaded.py index 4a7fb3e0..40139ede 100644 --- a/pyranges/multithreaded.py +++ b/pyranges/multithreaded.py @@ -1,4 +1,3 @@ -import os from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Union import numpy as np @@ -9,46 +8,6 @@ if TYPE_CHECKING: from pyranges.pyranges_main import PyRanges -ray = None - - -def get_n_args(f: Callable) -> int: - import inspect - - nparams = len(inspect.signature(f).parameters) - return nparams - - -class suppress_stdout_stderr(object): - """ - A context manager for doing a "deep suppression" of stdout and stderr in - Python, i.e. will suppress all print, even if the print originates in a - compiled C/Fortran sub-function. - This will not suppress raised exceptions, since exceptions are printed - to stderr just before a script exits, and after the context manager has - exited (at least, I think that is why it lets exceptions through). - - """ - - def __init__(self): - # Open a pair of null files - self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] - # Save the actual stdout (1) and stderr (2) file descriptors. - self.save_fds = (os.dup(1), os.dup(2)) - - def __enter__(self): - # Assign the null pointers to stdout and stderr. 
- os.dup2(self.null_fds[0], 1) - os.dup2(self.null_fds[1], 2) - - def __exit__(self, *_): - # Re-assign the real stdout/stderr back to (1) and (2) - os.dup2(self.save_fds[0], 1) - os.dup2(self.save_fds[1], 2) - # Close the null files - os.close(self.null_fds[0]) - os.close(self.null_fds[1]) - def merge_dfs(df1: DataFrame, df2: DataFrame) -> DataFrame: if not df1.empty and not df2.empty: @@ -67,11 +26,21 @@ def process_results(results: List[Any], keys: Union[List[str], List[Tuple[str, s results_dict = {k: r for k, r in zip(keys, results) if r is not None} try: - first_item = next(iter(results_dict.values())) + next(iter(results_dict.values())) except StopIteration: # empty collection return results_dict - if not isinstance(first_item, pd.DataFrame): + # An arbitrary operation might make the keys in the dict and df out of sync. + # This fixes that by having the PyRanges initializer find the correct keys again.. + try: + if all(isinstance(v, pd.DataFrame) for v in results_dict.values()): + df = pd.concat(results_dict.values()) + import pyranges as pr + + _results_dict = pr.PyRanges(df).dfs + else: + return results_dict + except (ValueError, TypeError): return results_dict to_delete = [] @@ -89,7 +58,7 @@ def process_results(results: List[Any], keys: Union[List[str], List[Tuple[str, s for k in to_delete: del results_dict[k] - return results_dict + return _results_dict def make_sparse(df: DataFrame) -> DataFrame: @@ -122,27 +91,6 @@ def make_unary_sparse(kwargs: Dict[str, Any], df: DataFrame) -> DataFrame: return make_sparse(df) if sparse else df -def ray_initialized(): - def test_function(): - pass - - try: - test_function = ray.remote(test_function) - except Exception as e: - if isinstance(e, NameError): - return False - - raise e - - try: - test_function.remote() - except Exception as e: - if "RayConnectionError" in str(type(e)): - return True - else: - raise e - - def pyrange_apply( function: Callable, self: "PyRanges", other: "PyRanges", **kwargs ) -> 
Union[Dict[Tuple[str, str], Any], Dict[str, Any]]: @@ -165,7 +113,6 @@ def pyrange_apply( results = [] - items = natsorted(self.dfs.items()) keys = natsorted(self.dfs.keys()) dummy = pd.DataFrame(columns="Chromosome Start End".split()) @@ -174,7 +121,8 @@ def pyrange_apply( other_dfs = other.dfs if strandedness: - for (c, s), df in items: + for c, s in self.chromosomes_and_strands: + df = self._dfs_with_strand[c, s] os = strand_dict[s] if not (c, os) in other.keys() or len(other[c, os].values()) == 0: @@ -183,39 +131,50 @@ def pyrange_apply( odf = other[c, os].values()[0] df, odf = make_binary_sparse(kwargs, df, odf) - result = function(df, odf, **kwargs) + + try: + result = function(df, odf, **kwargs) + except TypeError: + result = function(df, odf) results.append(result) else: if self.stranded and not other.stranded: - for (c, s), df in items: + for (c, s), df in self._dfs_with_strand.items(): if c not in other_chromosomes: odf = dummy else: - odf = other_dfs[c] + odf = other._dfs_without_strands[c] df, odf = make_binary_sparse(kwargs, df, odf) - result = function(df, odf, **kwargs) + + try: + result = function(df, odf, **kwargs) + except TypeError: + result = function(df, odf) results.append(result) elif not self.stranded and other.stranded: - for c, df in items: + for c, df in self._dfs_without_strand.items(): if c not in other_chromosomes: odf = dummy else: - odf1 = other_dfs.get((c, "+"), dummy) # type: ignore - odf2 = other_dfs.get((c, "-"), dummy) # type: ignore + odf1 = other._dfs_with_strand.get((c, "+"), dummy) + odf2 = other._dfs_with_strand.get((c, "-"), dummy) odf = merge_dfs(odf1, odf2) df, odf = make_binary_sparse(kwargs, df, odf) - result = function(df, odf, **kwargs) + try: + result = function(df, odf, **kwargs) + except TypeError: + result = function(df, odf) results.append(result) elif self.stranded and other.stranded: - for (c, s), df in self.items(): # type: ignore + for (c, s), df in self._dfs_with_strand.items(): if c not in 
other_chromosomes: odfs = [dummy] else: @@ -233,19 +192,25 @@ def pyrange_apply( df, odf = make_binary_sparse(kwargs, df, odf) - result = function(df, odf, **kwargs) + try: + result = function(df, odf, **kwargs) + except TypeError: + result = function(df, odf) results.append(result) else: - for c, df in items: + for c, df in self._dfs_without_strand.items(): if c not in other_chromosomes: odf = dummy else: - odf = other_dfs[c] + odf = other._dfs_without_strand[c] df, odf = make_binary_sparse(kwargs, df, odf) - result = function(df, odf, **kwargs) + try: + result = function(df, odf, **kwargs) + except TypeError: + result = function(df, odf) results.append(result) return process_results(results, keys) @@ -259,26 +224,31 @@ def pyrange_apply_single(function: Callable, self: "PyRanges", **kwargs) -> Any: results = [] - keys: Union[List[str], List[Tuple[str, str]]] = [] # type: ignore + keys: List = [] if strand: - for (c, s), df in self.items(): # type: ignore + for (c, s), df in self._dfs_with_strand.items(): # type: ignore kwargs["chromosome"] = c _strand = s kwargs["strand"] = _strand - df = make_unary_sparse(kwargs, df) - result = function(df, **kwargs) + try: + result = function(df, **kwargs) + except TypeError: + result = function(df) results.append(result) keys = self.keys() elif not self.stranded: - for c, df in self.items(): + for c, df in self._dfs_without_strand.items(): kwargs["chromosome"] = c assert isinstance(c, str) - df = make_unary_sparse(kwargs, df) - result = function(df, **kwargs) + try: + result = function(df, **kwargs) + except TypeError: + result = function(df) + results.append(result) keys.append(c) @@ -296,11 +266,15 @@ def pyrange_apply_single(function: Callable, self: "PyRanges", **kwargs) -> Any: else: df = dfs.values()[0] - df = make_unary_sparse(kwargs, df) - result = function(df, **kwargs) - results.append(result) keys.append(c) + try: + result = function(df, **kwargs) + except TypeError: + result = function(df) + + 
results.append(result) + return process_results(results, keys) @@ -375,7 +349,7 @@ def _extend(df: DataFrame, **kwargs) -> DataFrame: return df -def _extend_grp(df, **kwargs): +def _extend_grp(df: pd.DataFrame, **kwargs): df = df.copy() dtype = df.Start.dtype slack = kwargs["ext"] diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py index 21278540..2b3b4590 100644 --- a/pyranges/pyranges_main.py +++ b/pyranges/pyranges_main.py @@ -1,5 +1,4 @@ """Data structure for genomic intervals and their annotation.""" -from collections import defaultdict from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -98,7 +97,7 @@ class PyRanges: >>> pr.PyRanges() Empty PyRanges - >>> pr.PyRanges(chromosomes="chr1", starts=(1, 5), ends=[3, 149], + >>> pr.from_args(chromosomes="chr1", starts=(1, 5), ends=[3, 149], ... strands=("+", "-")) +--------------+-----------+-----------+--------------+ | Chromosome | Start | End | Strand | @@ -163,21 +162,19 @@ class PyRanges: pyranges.stats.StatisticsMethods : namespace for statistics """ - def __init__( - self, - df: Optional[Union[pd.DataFrame, Dict[Union[str], pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]] = None, - chromosomes: Optional[str] = None, - starts: Optional[Tuple[int, int]] = None, - ends: Optional[List[int]] = None, - strands: Optional[Tuple[str, str]] = None, - copy_df: bool = True, - ) -> None: + def __init__(self, df: Optional[pd.DataFrame] = None) -> None: from pyranges.methods.init import _init - if df is None and chromosomes is None: - df = pd.DataFrame(columns="Chromosome Start End".split()) + if df is None: + _df = pd.DataFrame(columns="Chromosome Start End".split()) + else: + _df = df - _init(self, df, chromosomes, starts, ends, strands, copy_df) + assert all( + c in _df.columns for c in "Chromosome Start End".split() + ), f"The dataframe does not have all the columns Chromosome, Start and End: {_df}" + + _init(self, _df) def 
__array_ufunc__(self, *args, **kwargs) -> "PyRanges": """Apply unary numpy-function. @@ -230,14 +227,10 @@ def __array_ufunc__(self, *args, **kwargs) -> "PyRanges": for chromosome, df in gr: subset = df.head(1)[non_index].select_dtypes(include=np.number).columns _v = getattr(func, call)(df[subset], **kwargs) - # print(_v) - # print(df[_c]) df[subset] = _v return gr - # self.apply() - def __getattr__(self, name: str) -> pd.Series: """Return column. @@ -557,9 +550,7 @@ def apply(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRang kwargs.update(kwargs.get("kwargs", {})) kwargs = fill_kwargs(kwargs) - result = pyrange_apply_single(f, self, **kwargs) - - return PyRanges(result) + return pr.from_dfs(pyrange_apply_single(f, self, **kwargs)) def apply_general( self, f: Callable, strand: Optional[bool] = None, **kwargs @@ -588,7 +579,8 @@ def apply_general( -------- pyranges.PyRanges.apply: apply a function to a PyRanges and return a PyRanges - pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges + pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges and return a PyRanges + pyranges.PyRanges.apply_pair_general: apply a function to a pair of PyRanges and return a dict Note ---- @@ -617,9 +609,7 @@ def apply_general( return pyrange_apply_single(f, self, **kwargs) - def apply_pair( - self, other: "PyRanges", f: Callable, strandedness: None = None, as_pyranges: bool = True, **kwargs - ) -> Union[Dict[Tuple[str, str], Tuple[int, int]], "PyRanges"]: + def apply_pair(self, other: "PyRanges", f: Callable, strandedness: None = None, **kwargs) -> "PyRanges": """Apply a function to a pair of PyRanges. The function is applied to each chromosome or chromosome/strand pair found in at least one @@ -630,6 +620,8 @@ def apply_pair( f : function Row-based or associative function to apply on the pd.DataFrames. + other : PyRanges + strandedness : {None, "same", "opposite", False}, default None, i.e. 
auto Whether to compare PyRanges on the same strand, the opposite or ignore strand @@ -709,10 +701,89 @@ def apply_pair( +--------------+-----------+-----------+------------+-----------+--------------+ Stranded PyRanges object has 2 rows and 6 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. + """ - >>> f1.apply_pair(f2, lambda df, df2: (len(df), len(df2)), as_pyranges=False) - {('chr1', '+'): (2, 2), ('chr1', '-'): (1, 2)} + kwargs.update({"strandedness": strandedness}) + kwargs.update(kwargs.get("kwargs", {})) + kwargs = fill_kwargs(kwargs) + + result = pyrange_apply(f, self, other, **kwargs) + + return pr.from_dfs(result) + + def apply_pair_general( + self, other: "PyRanges", f: Callable, strandedness: None = None, **kwargs + ) -> Union[Dict[str, Any], Dict[Tuple[str, str], Any]]: + """Apply a function to a pair of PyRanges. + + The function is applied to each chromosome or chromosome/strand pair found in at least one + of the PyRanges. + + Parameters + ---------- + f : function + Row-based or associative function to apply on the pd.DataFrames. + + other : PyRanges + + strandedness : {None, "same", "opposite", False}, default None, i.e. auto + + Whether to compare PyRanges on the same strand, the opposite or ignore strand + information. The default, None, means use "same" if both PyRanges are stranded, + otherwise ignore the strand information. + + **kwargs + Additional keyword arguments to pass as keyword arguments to `f` + + Returns + ------- + dict of lists + Result of applying f to each partition of the pd.DataFrames in the PyRanges.
+ + See also + -------- + + pyranges.PyRanges.apply: apply a function to a PyRanges and return a PyRanges + pyranges.PyRanges.apply_general: apply a function to a PyRanges and return a dict of Any + pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges and return a PyRanges + pyranges.iter: iterate over two or more PyRanges + + Note + ---- + + This is the function used internally to carry out almost all comparison functions in + PyRanges. + + Examples + -------- + + >>> f1 = pr.data.f1() + >>> f1 + +--------------+-----------+-----------+------------+-----------+--------------+ + | Chromosome | Start | End | Name | Score | Strand | + | (category) | (int64) | (int64) | (object) | (int64) | (category) | + |--------------+-----------+-----------+------------+-----------+--------------| + | chr1 | 3 | 6 | interval1 | 0 | + | + | chr1 | 8 | 9 | interval3 | 0 | + | + | chr1 | 5 | 7 | interval2 | 0 | - | + +--------------+-----------+-----------+------------+-----------+--------------+ + Stranded PyRanges object has 3 rows and 6 columns from 1 chromosomes. + For printing, the PyRanges was sorted on Chromosome and Strand. + + >>> f2 = pr.data.f2() + >>> f2 + +--------------+-----------+-----------+------------+-----------+--------------+ + | Chromosome | Start | End | Name | Score | Strand | + | (category) | (int64) | (int64) | (object) | (int64) | (category) | + |--------------+-----------+-----------+------------+-----------+--------------| + | chr1 | 1 | 2 | a | 0 | + | + | chr1 | 6 | 7 | b | 0 | - | + +--------------+-----------+-----------+------------+-----------+--------------+ + Stranded PyRanges object has 2 rows and 6 columns from 1 chromosomes. + For printing, the PyRanges was sorted on Chromosome and Strand.
+ >>> f1.apply_pair_general(f2, lambda df, df2: (len(df), len(df2))) + {('chr1', '+'): (2, 2), ('chr1', '-'): (1, 2)} """ kwargs.update({"strandedness": strandedness}) @@ -720,11 +791,7 @@ def apply_pair( kwargs = fill_kwargs(kwargs) result = pyrange_apply(f, self, other, **kwargs) - - if not as_pyranges: - return result - else: - return PyRanges(result) + return result def as_df(self) -> pd.DataFrame: """Return PyRanges as pd.DataFrame. @@ -943,7 +1010,7 @@ def boundaries(self, group_by: str, agg: Optional[Dict[str, Union[str, Callable] kwargs = fill_kwargs(kwargs) result = pyrange_apply_single(_bounds, self, **kwargs) - return pr.PyRanges(result) + return pr.from_dfs(result) def calculate_frame(self, by: Union[str, List[str]]) -> "PyRanges": """Calculate the frame of each genomic interval, assuming all are coding sequences (CDS), and add it as column inplace. @@ -1040,6 +1107,15 @@ def chromosomes(self) -> List[str]: else: return natsorted(set([k for k in self.keys()])) + @property + def chromosomes_and_strands(self) -> List[Tuple[str, str]]: + """Return chromosomes and strands in natsorted order.""" + + if not self.stranded: + raise ValueError("PyRanges is not stranded.") + else: + return natsorted(set(self.keys())) + def cluster( self, strand: Optional[bool] = None, @@ -1187,7 +1263,7 @@ def cluster( kwargs["by"] = by df = pyrange_apply_single(_cluster_by, _self, **kwargs) - gr = PyRanges(df) + gr = pr.from_dfs(df) # each chromosome got overlapping ids (0 to len). Need to make unique! 
new_dfs = {} @@ -1362,7 +1438,7 @@ def count_overlaps( counts = pyrange_apply(_number_overlapping, self, other, **kwargs) - return pr.PyRanges(counts) + return pr.from_dfs(counts) def coverage( self, @@ -1474,7 +1550,7 @@ def coverage( from pyranges.methods.coverage import _coverage - counts = pr.PyRanges(pyrange_apply(_coverage, counts, other, **kwargs)) + counts = pr.from_dfs(pyrange_apply(_coverage, counts, other, **kwargs)) return counts @@ -1647,7 +1723,7 @@ def drop_duplicate_positions(self, strand: Optional[bool] = None, keep: Union[bo kwargs = {"sparse": {"self": False}, "keep": keep, "strand": strand and self.stranded} kwargs = fill_kwargs(kwargs) - return PyRanges(pyrange_apply_single(_drop_duplicate_positions, self, **kwargs)) + return pr.from_dfs(pyrange_apply_single(_drop_duplicate_positions, self, **kwargs)) @property def dtypes(self) -> pd.Series: @@ -1782,15 +1858,11 @@ def extend(self, ext: Union[Dict[str, int], int], group_by: None = None) -> "PyR if isinstance(ext, dict): assert self.stranded, "PyRanges must be stranded to add 5/3-end specific extend." - kwargs = fill_kwargs({"ext": ext, "strand": self.stranded}) - - if group_by is None: - prg = PyRanges(pyrange_apply_single(_extend, self, **kwargs)) - else: - kwargs["group_by"] = group_by - prg = PyRanges(pyrange_apply_single(_extend_grp, self, **kwargs)) + kwargs = fill_kwargs({"ext": ext, "strand": self.stranded, "group_by": group_by}) + func = _extend if group_by is None else _extend_grp + dfs = pyrange_apply_single(func, self, **kwargs) - return prg + return pr.from_dfs(dfs) # # TODO: use subtract code here instead, easier # def no_overlap(self, other, **kwargs): @@ -1861,7 +1933,7 @@ def five_end(self) -> "PyRanges": assert self.stranded, "Need stranded pyrange to find 5'." 
kwargs = fill_kwargs({"strand": self.stranded}) - return PyRanges(pyrange_apply_single(_tss, self, **kwargs)) + return pr.from_dfs(pyrange_apply_single(_tss, self, **kwargs)) def head(self, n: int = 8) -> "PyRanges": """Return the n first rows. @@ -2181,7 +2253,7 @@ def intersect( self.__ix__ = np.arange(len(self)) dfs = pyrange_apply(_intersection, self, other, **kwargs) - result = pr.PyRanges(dfs) + result = pr.from_dfs(dfs) if invert: found_idxs = getattr(result, "__ix__", []) @@ -2404,7 +2476,7 @@ def join( kwargs["example_header_self"] = self.head(1).df dfs = pyrange_apply(_write_both, self, other, **kwargs) - gr = PyRanges(dfs) + gr = pr.from_dfs(dfs) if slack and len(gr) > 0: gr.Start = gr.Start__slack @@ -2451,357 +2523,6 @@ def keys(self) -> Union[List[str], List[Tuple[str, str]]]: return natsorted(self.dfs.keys()) - def k_nearest( - self, - other: "PyRanges", - k: Union[List[int], int] = 1, - ties: Optional[str] = None, - strandedness: None = None, - overlap: bool = True, - how: Optional[str] = None, - suffix: str = "_b", - nb_cpu: int = 1, - apply_strand_suffix: None = None, - ) -> "PyRanges": - """Find k nearest intervals. - - Parameters - ---------- - other : PyRanges - - PyRanges to find nearest interval in. - - k : int or list/array/pd.Series of int - - Number of closest to return. If iterable, must be same length as PyRanges. - - ties : {None, "first", "last", "different"}, default None - - How to resolve ties, i.e. closest intervals with equal distance. None means that the k nearest intervals are kept. - "first" means that the first tie is kept, "last" meanst that the last is kept. - "different" means that all nearest intervals with the k unique nearest distances are kept. - - strandedness : {None, "same", "opposite", False}, default None, i.e. auto - - Whether to compare PyRanges on the same strand, the opposite or ignore strand - information. 
The default, None, means use "same" if both PyRanges are stranded, - otherwise ignore the strand information. - - overlap : bool, default True - - Whether to include overlaps. - - how : {None, "upstream", "downstream"}, default None, i.e. both directions - - Whether to only look for nearest in one direction. Always with respect to the PyRanges - it is called on. - - suffix : str, default "_b" - - Suffix to give columns with shared name in other. - - apply_strand_suffix : bool, default None - - If first pyranges is unstranded, but the second is not, the first will be given a strand column. - apply_strand_suffix makes the added strand column a regular data column instead by adding a suffix. - - - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - - Returns - ------- - PyRanges - - A PyRanges with columns of nearest interval horizontally appended. - - Notes - ----- - - nearest also exists, and is more performant. - - See also - -------- - - PyRanges.new_position : give joined PyRanges new coordinates - PyRanges.nearest : find nearest intervals - - Examples - -------- - - >>> f1 = pr.from_dict({'Chromosome': ['chr1', 'chr1', 'chr1'], 'Start': [3, 8, 5], - ... 'End': [6, 9, 7], 'Strand': ['+', '+', '-']}) - >>> f1 - +--------------+-----------+-----------+--------------+ - | Chromosome | Start | End | Strand | - | (category) | (int64) | (int64) | (category) | - |--------------+-----------+-----------+--------------| - | chr1 | 3 | 6 | + | - | chr1 | 8 | 9 | + | - | chr1 | 5 | 7 | - | - +--------------+-----------+-----------+--------------+ - Stranded PyRanges object has 3 rows and 4 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> f2 = pr.from_dict({'Chromosome': ['chr1', 'chr1'], 'Start': [1, 6], - ... 
'End': [2, 7], 'Strand': ['+', '-']}) - >>> f2 - +--------------+-----------+-----------+--------------+ - | Chromosome | Start | End | Strand | - | (category) | (int64) | (int64) | (category) | - |--------------+-----------+-----------+--------------| - | chr1 | 1 | 2 | + | - | chr1 | 6 | 7 | - | - +--------------+-----------+-----------+--------------+ - Stranded PyRanges object has 2 rows and 4 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> f1.k_nearest(f2, k=2) - +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+ - | Chromosome | Start | End | Strand | Start_b | End_b | Strand_b | Distance | - | (category) | (int64) | (int64) | (category) | (int64) | (int64) | (category) | (int64) | - |--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------| - | chr1 | 3 | 6 | + | 6 | 7 | - | 1 | - | chr1 | 3 | 6 | + | 1 | 2 | + | -2 | - | chr1 | 8 | 9 | + | 6 | 7 | - | -2 | - | chr1 | 8 | 9 | + | 1 | 2 | + | -7 | - | chr1 | 5 | 7 | - | 6 | 7 | - | 0 | - | chr1 | 5 | 7 | - | 1 | 2 | + | 4 | - +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+ - Stranded PyRanges object has 6 rows and 8 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. 
- - >>> f1.k_nearest(f2, how="upstream", k=2) - +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+ - | Chromosome | Start | End | Strand | Start_b | End_b | Strand_b | Distance | - | (category) | (int64) | (int64) | (category) | (int64) | (int64) | (category) | (int64) | - |--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------| - | chr1 | 3 | 6 | + | 1 | 2 | + | -2 | - | chr1 | 8 | 9 | + | 6 | 7 | - | -2 | - | chr1 | 8 | 9 | + | 1 | 2 | + | -7 | - | chr1 | 5 | 7 | - | 6 | 7 | - | 0 | - +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+ - Stranded PyRanges object has 4 rows and 8 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> f1.k_nearest(f2, k=[1, 2, 1]) - +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+ - | Chromosome | Start | End | Strand | Start_b | End_b | Strand_b | Distance | - | (category) | (int64) | (int64) | (category) | (int64) | (int64) | (category) | (int64) | - |--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------| - | chr1 | 3 | 6 | + | 6 | 7 | - | 1 | - | chr1 | 8 | 9 | + | 6 | 7 | - | -2 | - | chr1 | 8 | 9 | + | 1 | 2 | + | -7 | - | chr1 | 5 | 7 | - | 6 | 7 | - | 0 | - +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+ - Stranded PyRanges object has 4 rows and 8 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> d1 = {"Chromosome": [1], "Start": [5], "End": [6]} - >>> d2 = {"Chromosome": 1, "Start": [1] * 2 + [5] * 2 + [9] * 2, - ... 
"End": [3] * 2 + [7] * 2 + [11] * 2, "ID": range(6)} - >>> gr, gr2 = pr.from_dict(d1), pr.from_dict(d2) - - >>> gr - +--------------+-----------+-----------+ - | Chromosome | Start | End | - | (category) | (int64) | (int64) | - |--------------+-----------+-----------| - | 1 | 5 | 6 | - +--------------+-----------+-----------+ - Unstranded PyRanges object has 1 rows and 3 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome. - - >>> gr2 - +--------------+-----------+-----------+-----------+ - | Chromosome | Start | End | ID | - | (category) | (int64) | (int64) | (int64) | - |--------------+-----------+-----------+-----------| - | 1 | 1 | 3 | 0 | - | 1 | 1 | 3 | 1 | - | 1 | 5 | 7 | 2 | - | 1 | 5 | 7 | 3 | - | 1 | 9 | 11 | 4 | - | 1 | 9 | 11 | 5 | - +--------------+-----------+-----------+-----------+ - Unstranded PyRanges object has 6 rows and 4 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome. - - >>> gr.k_nearest(gr2, k=2) - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - | Chromosome | Start | End | Start_b | End_b | ID | Distance | - | (category) | (int64) | (int64) | (int64) | (int64) | (int64) | (int64) | - |--------------+-----------+-----------+-----------+-----------+-----------+------------| - | 1 | 5 | 6 | 5 | 7 | 2 | 0 | - | 1 | 5 | 6 | 5 | 7 | 3 | 0 | - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - Unstranded PyRanges object has 2 rows and 7 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome. 
- - >>> gr.k_nearest(gr2, k=2, ties="different") - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - | Chromosome | Start | End | Start_b | End_b | ID | Distance | - | (category) | (int64) | (int64) | (int64) | (int64) | (int64) | (int64) | - |--------------+-----------+-----------+-----------+-----------+-----------+------------| - | 1 | 5 | 6 | 5 | 7 | 2 | 0 | - | 1 | 5 | 6 | 5 | 7 | 3 | 0 | - | 1 | 5 | 6 | 1 | 3 | 1 | -3 | - | 1 | 5 | 6 | 1 | 3 | 0 | -3 | - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - Unstranded PyRanges object has 4 rows and 7 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome. - - >>> gr.k_nearest(gr2, k=3, ties="first") - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - | Chromosome | Start | End | Start_b | End_b | ID | Distance | - | (category) | (int64) | (int64) | (int64) | (int64) | (int64) | (int64) | - |--------------+-----------+-----------+-----------+-----------+-----------+------------| - | 1 | 5 | 6 | 5 | 7 | 2 | 0 | - | 1 | 5 | 6 | 1 | 3 | 1 | -3 | - | 1 | 5 | 6 | 9 | 11 | 4 | 4 | - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - Unstranded PyRanges object has 3 rows and 7 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome. - - >>> gr.k_nearest(gr2, k=1, overlap=False) - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - | Chromosome | Start | End | Start_b | End_b | ID | Distance | - | (category) | (int64) | (int64) | (int64) | (int64) | (int64) | (int64) | - |--------------+-----------+-----------+-----------+-----------+-----------+------------| - | 1 | 5 | 6 | 1 | 3 | 1 | -3 | - +--------------+-----------+-----------+-----------+-----------+-----------+------------+ - Unstranded PyRanges object has 1 rows and 7 columns from 1 chromosomes. 
- For printing, the PyRanges was sorted on Chromosome. - """ - - from sorted_nearest import get_all_ties, get_different_ties # type: ignore - - from pyranges.methods.k_nearest import _nearest # type: ignore - - kwargs = { - "strandedness": strandedness, - "how": how, - "overlap": overlap, - "nb_cpu": nb_cpu, - "k": k, - "ties": ties, - } - kwargs = fill_kwargs(kwargs) - kwargs["stranded"] = self.stranded and other.stranded - - _self = self.copy() - - if isinstance(k, pd.Series): - k = k.values - - # how many to nearest to find; might be different for each - _self.__k__ = k - # give each their own unique ID - _self.__IX__ = np.arange(len(_self)) - - dfs = pyrange_apply(_nearest, _self, other, **kwargs) - nearest = PyRanges(dfs) - - if not overlap: - result = nearest - else: - overlap_how = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")] # type: ignore - overlaps = _self.join( - other, - strandedness=strandedness, - how=overlap_how, - nb_cpu=nb_cpu, - apply_strand_suffix=apply_strand_suffix, - ) - overlaps.Distance = 0 - result = pr.concat([overlaps, nearest]) - - if not len(result): - return pr.PyRanges() - new_result = {} - if ties in ["first", "last"]: - for c, df in result: - df = df.sort_values(["__IX__", "Distance"]) - grpby = df.groupby("__k__", sort=False) - dfs = [] - for k, kdf in grpby: - grpby2 = kdf.groupby("__IX__", sort=False) - _df = grpby2.head(k) - dfs.append(_df) - - if dfs: - new_result[c] = pd.concat(dfs) - - elif ties == "different" or not ties: - for c, df in result: - if df.empty: - continue - dfs = [] - - df = df.sort_values(["__IX__", "Distance"]) - grpby = df.groupby("__k__", sort=False) - - for k, kdf in grpby: - if ties: - lx = get_different_ties( - kdf.index.values, - kdf.__IX__.values, - kdf.Distance.astype(np.int64).values, - k, - ) - _df = kdf.reindex(lx) - else: - lx = get_all_ties( - kdf.index.values, - kdf.__IX__.values, - kdf.Distance.astype(np.int64).values, - k, - ) - _df = kdf.reindex(lx) - _df 
= _df.groupby("__IX__").head(k) - dfs.append(_df) - - if dfs: - new_result[c] = pd.concat(dfs) - - result = pr.PyRanges(new_result) - - if not result.__IX__.is_monotonic_increasing: - result = result.sort("__IX__") - - result = result.drop(like="__IX__|__k__") - - _self = _self.drop(like="__k__|__IX__") - - def prev_to_neg(df, **kwargs): - strand = df.Strand.iloc[0] if "Strand" in df else "+" - - suffix = kwargs["suffix"] - - bools = df["End" + suffix] < df.Start - if not strand == "+": - bools = ~bools - - df.loc[bools, "Distance"] = -df.loc[bools, "Distance"] - return df - - result = result.apply(prev_to_neg, suffix=kwargs["suffix"]) - - if not _self.stranded and other.stranded: - if apply_strand_suffix is None: - import sys - - print( - "join: Strand data from other will be added as strand data to self.\nIf this is undesired use the flag apply_strand_suffix=False.\nTo turn off the warning set apply_strand_suffix to True or False.", - file=sys.stderr, - ) - elif apply_strand_suffix: - result.columns = result.columns.str.replace("Strand", "Strand" + kwargs["suffix"]) - - return result - @property def length(self) -> int: """Return the total length of the intervals. @@ -2968,9 +2689,9 @@ def max_disjoint(self, strand: Optional[bool] = None, slack: int = 0, **kwargs) from pyranges.methods.max_disjoint import _max_disjoint - df = pyrange_apply_single(_max_disjoint, self, **kwargs) + dfs = pyrange_apply_single(_max_disjoint, self, **kwargs) - return pr.PyRanges(df) + return pr.from_dfs(dfs) def merge( self, @@ -3120,7 +2841,7 @@ def merge( df = pyrange_apply_single(_merge_by, self, **kwargs) - return PyRanges(df) + return pr.from_dfs(df) def mp(self, n: int = 8, formatting: None = None) -> None: """Merge location and print. 
@@ -3300,7 +3021,7 @@ def nearest( assert other.stranded, "If doing upstream or downstream nearest, other pyranges must be stranded" dfs = pyrange_apply(_nearest, self, other, **kwargs) - gr = PyRanges(dfs) + gr = pr.from_dfs(dfs) if not self.stranded and other.stranded: if apply_strand_suffix is None: @@ -3460,7 +3181,7 @@ def new_position(self, new_pos: str, columns: Optional[Tuple[str, str, str, str] dfs = pyrange_apply_single(_new_position, self, **kwargs) - return pr.PyRanges(dfs) + return pr.from_dfs(dfs) def overlap( self, @@ -3603,7 +3324,7 @@ def overlap( self.__ix__ = np.arange(len(self)) dfs = pyrange_apply(_overlap, self, other, **kwargs) - result = pr.PyRanges(dfs) + result = pr.from_dfs(dfs) if invert: found_idxs = getattr(result, "__ix__", []) @@ -3965,7 +3686,7 @@ def set_intersect( other_clusters = other.merge(strand=strand) dfs = pyrange_apply(_intersection, self_clusters, other_clusters, **kwargs) - return PyRanges(dfs) + return pr.from_dfs(dfs) def set_union(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges": """Return set-theoretical union. @@ -4166,7 +3887,7 @@ def sort(self, by: Optional[str] = None, nb_cpu: int = 1) -> "PyRanges": kwargs["by"] = by kwargs = fill_kwargs(kwargs) - return PyRanges(pyrange_apply_single(_sort, self, **kwargs)) + return pr.from_dfs(pyrange_apply_single(_sort, self, **kwargs)) def sp(self, n=30, formatting=None): """Sort on location and print. 
@@ -4200,7 +3921,7 @@ def spliced_subsequence( end: Optional[int] = None, by: Optional[str] = None, strand: Optional[bool] = None, - **kwargs + **kwargs, ) -> "PyRanges": """Get subsequences of the intervals, using coordinates mapping to spliced transcripts (without introns) @@ -4342,7 +4063,7 @@ def spliced_subsequence( result = pyrange_apply_single(_spliced_subseq, sorted_p, **kwargs) - return pr.PyRanges(result) + return pr.from_dfs(result) def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRanges": """Split into non-overlapping intervals. @@ -4388,38 +4109,38 @@ def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRang For printing, the PyRanges was sorted on Chromosome and Strand. >>> gr.split() - +--------------+-----------+-----------+------------+ - | Chromosome | Start | End | Strand | - | (object) | (int64) | (int64) | (object) | - |--------------+-----------+-----------+------------| - | chr1 | 3 | 5 | + | - | chr1 | 5 | 6 | + | - | chr1 | 6 | 9 | + | - | chr1 | 5 | 7 | - | - | chr1 | 11 | 12 | - | - +--------------+-----------+-----------+------------+ + +--------------+-----------+-----------+--------------+ + | Chromosome | Start | End | Strand | + | (category) | (int64) | (int64) | (category) | + |--------------+-----------+-----------+--------------| + | chr1 | 3 | 5 | + | + | chr1 | 5 | 6 | + | + | chr1 | 6 | 9 | + | + | chr1 | 5 | 7 | - | + | chr1 | 11 | 12 | - | + +--------------+-----------+-----------+--------------+ Stranded PyRanges object has 5 rows and 4 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. 
>>> gr.split(between=True) - +--------------+-----------+-----------+------------+ - | Chromosome | Start | End | Strand | - | (object) | (int64) | (int64) | (object) | - |--------------+-----------+-----------+------------| - | chr1 | 3 | 5 | + | - | chr1 | 5 | 6 | + | - | chr1 | 6 | 9 | + | - | chr1 | 5 | 7 | - | - | chr1 | 7 | 11 | - | - | chr1 | 11 | 12 | - | - +--------------+-----------+-----------+------------+ + +--------------+-----------+-----------+--------------+ + | Chromosome | Start | End | Strand | + | (category) | (int64) | (int64) | (category) | + |--------------+-----------+-----------+--------------| + | chr1 | 3 | 5 | + | + | chr1 | 5 | 6 | + | + | chr1 | 6 | 9 | + | + | chr1 | 5 | 7 | - | + | chr1 | 7 | 11 | - | + | chr1 | 11 | 12 | - | + +--------------+-----------+-----------+--------------+ Stranded PyRanges object has 6 rows and 4 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. >>> gr.split(strand=False) +--------------+-----------+-----------+ | Chromosome | Start | End | - | (object) | (int64) | (int64) | + | (category) | (int64) | (int64) | |--------------+-----------+-----------| | chr1 | 3 | 5 | | chr1 | 5 | 6 | @@ -4433,7 +4154,7 @@ def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRang >>> gr.split(strand=False, between=True) +--------------+-----------+-----------+ | Chromosome | Start | End | - | (object) | (int64) | (int64) | + | (category) | (int64) | (int64) | |--------------+-----------+-----------| | chr1 | 3 | 5 | | chr1 | 5 | 6 | @@ -4455,7 +4176,7 @@ def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRang df = pyrange_apply_single(_split, self, **kwargs) - split = pr.PyRanges(df) + split = pr.from_dfs(df) if not between: strandedness: Union[str, bool] = "same" if strand else False split = split.overlap(self, strandedness=strandedness) @@ -4660,7 +4381,7 @@ def subsequence( end: Optional[int] = None, by: Optional[str] = None, 
strand: Optional[bool] = None, - **kwargs + **kwargs, ) -> "PyRanges": """Get subsequences of the intervals. @@ -4792,7 +4513,7 @@ def subsequence( result = pyrange_apply_single(_subseq, self, **kwargs) - return pr.PyRanges(result) + return pr.from_dfs(result) def subtract(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges": """Subtract intervals. @@ -4864,13 +4585,13 @@ def subtract(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1 strand = True if strandedness else False other_clusters = other.merge(strand=strand) - self = self.count_overlaps(other_clusters, strandedness=strandedness, overlap_col="__num__") + _self = self.copy() - result = pyrange_apply(_subtraction, self, other_clusters, **kwargs) + _self = _self.count_overlaps(other_clusters, strandedness=strandedness, overlap_col="__num__") - self = self.drop("__num__") + result = pyrange_apply(_subtraction, _self, other_clusters, **kwargs) - return PyRanges(result).drop("__num__") + return pr.from_dfs(result).drop("__num__") def summary(self, to_stdout: bool = True, return_df: bool = False) -> Optional[pd.DataFrame]: """Return info. @@ -5121,7 +4842,7 @@ def tile(self, tile_size: int, overlap: bool = False, strand: Optional[bool] = N df = pyrange_apply_single(_tiles, self, **kwargs) - return PyRanges(df) + return pr.from_dfs(df) def to_example(self, n: int = 10) -> Dict[str, List[Union[int, str]]]: """Return as dict. @@ -5237,7 +4958,7 @@ def three_end(self) -> "PyRanges": assert self.stranded, "Need stranded pyrange to find 3'." 
kwargs = fill_kwargs({"strand": True}) - return PyRanges(pyrange_apply_single(_tes, self, **kwargs)) + return pr.from_dfs(pyrange_apply_single(_tes, self, **kwargs)) # def to_bam(self, path=None, header=None, chromosome_sizes=None, chain=False): @@ -5930,9 +5651,11 @@ def unstrand(self) -> "PyRanges": gr = pr.concat([self["+"], self["-"]]) - gr = gr.apply(lambda df: df.drop("Strand", axis=1).reset_index(drop=True)) + dfs = [] + for _, df in gr.dfs.items(): + dfs.append(df.drop("Strand", axis=1).reset_index(drop=True)) - return pr.PyRanges(gr.dfs) + return pr.PyRanges(pd.concat(dfs).reset_index(drop=True)) def values(self) -> List[pd.DataFrame]: """Return the underlying pd.DataFrames.""" @@ -6049,9 +5772,9 @@ def window(self, window_size: int, strand: Optional[bool] = None) -> "PyRanges": "window_size": window_size, } - df = pyrange_apply_single(_windows, self, **kwargs) + dfs = pyrange_apply_single(_windows, self, **kwargs) - return PyRanges(df) + return pr.from_dfs(dfs) def __getstate__(self): return self.dfs @@ -6070,11 +5793,23 @@ def _zip_locationkey_and_data(keys: Iterable, dfs: Iterable[pd.DataFrame], stran if strand: for k in keys: assert isinstance(k, tuple) - return pr.PyRanges(dict(zip(keys, dfs))) + return pr.from_dfs(dict(zip(keys, dfs))) else: for k in keys: assert isinstance(k, str) - return pr.PyRanges(dict(zip(keys, dfs))) + return pr.from_dfs(dict(zip(keys, dfs))) + + @property + def _dfs_without_strand(self) -> Dict[str, pd.DataFrame]: + """Return a dictionary of stranded dataframes.""" + assert not self.stranded, "PyRanges object is stranded" + return {k: v for k, v in self.dfs.items() if isinstance(k, str)} + + @property + def _dfs_with_strand(self) -> Dict[Tuple[str, str], pd.DataFrame]: + """Return a dictionary of stranded dataframes.""" + assert self.stranded, "PyRanges object is not stranded" + return {k: v for k, v in self.dfs.items() if isinstance(k, tuple)} def _test(): diff --git a/tests/property_based/hypothesis_helper.py 
b/tests/property_based/hypothesis_helper.py index 575e3c1d..904e292a 100644 --- a/tests/property_based/hypothesis_helper.py +++ b/tests/property_based/hypothesis_helper.py @@ -1,5 +1,3 @@ -from os import environ - import hypothesis.strategies as st import numpy as np import pandas as pd @@ -108,7 +106,7 @@ def dfs_min2(draw): # nosec # if not strand: # df = df.drop("Strand", axis=1) - gr = PyRanges(df, int64=True) + gr = PyRanges(df) # gr = PyRanges(df) # do not sort like this, use pyranges sort @@ -134,7 +132,7 @@ def dfs_min(draw): # nosec # if not strand: # df = df.drop("Strand", axis=1) - gr = PyRanges(df, int64=True) + gr = PyRanges(df) # print(gr) # raise # gr = PyRanges(df) @@ -159,7 +157,7 @@ def dfs_no_min(draw): # nosec # if not strand: # df = df.drop("Strand", axis=1) - gr = PyRanges(df, int64=True) + gr = PyRanges(df) # gr = PyRanges(df) # do not sort like this, use pyranges sort diff --git a/tests/unit/df_dict_mismatch/test_mismatch.py b/tests/unit/df_dict_mismatch/test_mismatch.py index a0ba0506..9e662389 100644 --- a/tests/unit/df_dict_mismatch/test_mismatch.py +++ b/tests/unit/df_dict_mismatch/test_mismatch.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest import pyranges as pr @@ -7,9 +8,8 @@ def test_unstranded_but_has_chrom_key(): df = pd.DataFrame({"Chromosome": "chr1", "Start": 5, "End": 10}, index=[0]) dfs = {("chr1", "+"): df} - gr = pr.PyRanges(dfs) - - assert not gr.stranded + with pytest.raises(ValueError, match=r"All keys must be the same, but df has chr1 and dict had .*"): + pr.from_dfs(dfs) def test_has_bad_strand_and_strand_key(): @@ -17,7 +17,7 @@ def test_has_bad_strand_and_strand_key(): dfs = {("chr1", "+"): df} - gr = pr.PyRanges(dfs) + gr = pr.from_dfs(dfs) assert not gr.stranded @@ -25,10 +25,7 @@ def test_has_bad_strand_and_strand_key(): def test_has_strand_but_is_not_stranded(): df = pd.DataFrame({"Chromosome": "chr1", "Start": 5, "End": 10, "Strand": "+"}, index=[0]) - dfs = {("chr1"): df} - - gr = pr.PyRanges(dfs) - - 
print(gr.dfs) + dfs = {"chr1": df} - assert gr.stranded + with pytest.raises(ValueError, match=r"All keys must be the same, but df has .* and dict had .*"): + pr.from_dfs(dfs) diff --git a/tests/unit/join/test_join.py b/tests/unit/join/test_join.py index 78800b14..f4e7c00f 100644 --- a/tests/unit/join/test_join.py +++ b/tests/unit/join/test_join.py @@ -5,8 +5,8 @@ # with slack def test_join_with_slack(): - gr1 = pr.PyRanges(chromosomes="chr1", starts=[0], ends=[10], strands="+") - gr2 = pr.PyRanges(chromosomes="chr1", starts=[15], ends=[20], strands="+") + gr1 = pr.from_args(chromosomes="chr1", starts=[0], ends=[10], strands="+") + gr2 = pr.from_args(chromosomes="chr1", starts=[15], ends=[20], strands="+") result = gr1.join(gr2, slack=10) df = result.df diff --git a/tests/unit/slack/test_slack.py b/tests/unit/slack/test_slack.py index c3093d6c..fc1c6862 100644 --- a/tests/unit/slack/test_slack.py +++ b/tests/unit/slack/test_slack.py @@ -3,6 +3,6 @@ # 3' and 5' def test_slack(): - gr = pr.PyRanges(chromosomes="chr1", starts=[15, 300], ends=[20, 305], strands="+ -".split()) + gr = pr.from_args(chromosomes="chr1", starts=[15, 300], ends=[20, 305], strands="+ -".split()) print(gr) gr = gr.slack({"5": 10, "3": 5}) diff --git a/tests/unit/test_count_overlaps.py b/tests/unit/test_count_overlaps.py index f013032d..b83f5872 100644 --- a/tests/unit/test_count_overlaps.py +++ b/tests/unit/test_count_overlaps.py @@ -19,7 +19,7 @@ grs = {n: pr.from_string(s) for n, s in zip(["a", "b", "c"], [a, b, c])} unstranded_grs = {n: gr.unstrand() for n, gr in grs.items()} -features = pr.PyRanges( +features = pr.from_args( chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40], diff --git a/tests/unit/test_genomicfeatures.py b/tests/unit/test_genomicfeatures.py index 9b1cbd99..be6a9818 100644 --- a/tests/unit/test_genomicfeatures.py +++ b/tests/unit/test_genomicfeatures.py @@ -15,11 +15,6 @@ def compute_introns_single(df, by): x.Strand = "-" x = x.df - print("g " * 100) - 
print(g) - print("x " * 100) - print(x) - if g.empty or x.empty: return pd.DataFrame() @@ -74,10 +69,6 @@ def _introns_correct(full, genes, exons, introns, by): expected = expected_results[gene_id] exons = pr.PyRanges(based_on[gene_id]).subset(lambda df: df.Feature == "exon").merge(by=id_column) genes = pr.PyRanges(based_on[gene_id]).subset(lambda df: df.Feature == by) - print("exons", exons) - print("based_on", based_on[gene_id]) - print("actual", idf["Chromosome Start End Strand".split()]) - print("expected", expected["Chromosome Start End Strand".split()]) _introns = pr.PyRanges(idf) assert len(exons.intersect(_introns)) == 0 assert len(genes.intersect(_introns)) == len(_introns) @@ -96,17 +87,10 @@ def test_introns_single(): exons.Feature = "exon" exons = exons.df df = pd.concat([gr[gr.Feature == "gene"].df, exons], sort=False) - print(df) for gid, gdf in df.groupby("gene_id"): - print("-------" * 20) - print(gid) - print(gdf) - print("gdf", len(gdf)) expected = compute_introns_single(gdf, by="gene") - print("expected", len(expected)) actual = pr.PyRanges(gdf).features.introns().df - print("actual", len(actual)) if actual.empty: assert expected.empty continue From a0934b8cb24abe8fe1957d546207ae7e9e726587 Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Fri, 26 May 2023 12:08:11 +0200 Subject: [PATCH 07/10] Add types to readers --- pyranges/data.py | 33 +++--- pyranges/genomicfeatures.py | 98 ++++++++---------- pyranges/get_fasta.py | 28 ++++- pyranges/helpers.py | 8 +- pyranges/multioverlap.py | 14 +-- pyranges/pyranges_main.py | 12 +-- pyranges/readers.py | 199 ++++++++++++++++-------------------- tests/unit/test_io.py | 8 ++ 8 files changed, 193 insertions(+), 207 deletions(-) diff --git a/pyranges/data.py b/pyranges/data.py index a20d946a..90df509f 100644 --- a/pyranges/data.py +++ b/pyranges/data.py @@ -20,6 +20,7 @@ Stranded PyRanges object has 3 rows and 6 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. 
""" +from pathlib import Path import pandas as pd import pkg_resources @@ -43,17 +44,17 @@ ] -def get_example_path(basename): +def get_example_path(basename) -> Path: full_path = pkg_resources.resource_filename("pyranges", "example_data/{}".format(basename)) if full_path.endswith(".bam"): # hack to load index too pkg_resources.resource_filename("pyranges", "example_data/{}.bai".format(basename)) - return full_path + return Path(full_path) -def aorta(): +def aorta() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -78,7 +79,7 @@ def aorta(): return pr.read_bed(full_path) -def aorta2(): +def aorta2() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -103,13 +104,13 @@ def aorta2(): return pr.read_bed(full_path) -def bw(): +def bw() -> "pr.PyRanges": full_path = get_example_path("bw.bw") return pr.read_bigwig(full_path) -def chipseq(): +def chipseq() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -134,7 +135,7 @@ def chipseq(): return pr.read_bed(full_path) -def chipseq_background(): +def chipseq_background() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -159,7 +160,7 @@ def chipseq_background(): return pr.read_bed(full_path) -def chromsizes(): +def chromsizes() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+ >>> # | Chromosome | Start | End | @@ -184,7 +185,7 @@ def chromsizes(): return pr.read_bed(full_path) -def control_bam(): +def control_bam() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+--------------+------------+ >>> # | Chromosome | 
Start | End | Strand | Flag | @@ -209,7 +210,7 @@ def control_bam(): return pr.read_bam(full_path) -def cpg(): +def cpg() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+-----------+ >>> # | Chromosome | Start | End | CpG | @@ -236,7 +237,7 @@ def cpg(): return pr.PyRanges(df) -def ensembl_gtf(): +def ensembl_gtf() -> "pr.PyRanges": """ >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+ >>> # | Chromosome | Source | Feature | Start | End | Score | Strand | Frame | gene_biotype | +19 | @@ -262,7 +263,7 @@ def ensembl_gtf(): return pr.read_gtf(full_path) -def exons(): +def exons() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+----------------------------------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -287,7 +288,7 @@ def exons(): return pr.read_bed(full_path) -def f1(): +def f1() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -306,7 +307,7 @@ def f1(): return pr.read_bed(full_path) -def f2(): +def f2() -> "pr.PyRanges": """ >>> # +--------------+-----------+-----------+------------+-----------+--------------+ >>> # | Chromosome | Start | End | Name | Score | Strand | @@ -324,7 +325,7 @@ def f2(): return pr.read_bed(full_path) -def gencode_gtf(): +def gencode_gtf() -> "pr.PyRanges": """ >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------+ >>> # | Chromosome | Source | Feature | Start | End | Score | Strand | Frame | gene_id | +15 | @@ -350,7 +351,7 @@ def gencode_gtf(): return pr.read_gtf(full_path) -def ucsc_bed(): +def ucsc_bed() -> "pr.PyRanges": """ >>> # 
+--------------+-----------+-----------+------------+------------+-----------------+--------------+---------------+-------------------+ >>> # | Chromosome | Start | End | Feature | gene_id | transcript_id | Strand | exon_number | transcript_name | diff --git a/pyranges/genomicfeatures.py b/pyranges/genomicfeatures.py index 26145a1b..bb8d909a 100644 --- a/pyranges/genomicfeatures.py +++ b/pyranges/genomicfeatures.py @@ -4,6 +4,9 @@ import pyranges as pr from pyranges.multithreaded import pyrange_apply +from pandas.core.frame import DataFrame +from pyranges.pyranges_main import PyRanges +from typing import Dict, Optional __all__ = ["genome_bounds", "tile_genome", "GenomicFeaturesMethods"] @@ -14,12 +17,10 @@ class GenomicFeaturesMethods: Accessed through `gr.features`.""" - pr = None - - def __init__(self, pr): + def __init__(self, pr: PyRanges) -> None: self.pr = pr - def tss(self): + def tss(self) -> PyRanges: """Return the transcription start sites. Returns the 5' for every interval with feature "transcript". @@ -83,7 +84,7 @@ def tss(self): return pr - def tes(self, slack=0): + def tes(self) -> PyRanges: """Return the transcription end sites. Returns the 3' for every interval with feature "transcript". @@ -147,7 +148,7 @@ def tes(self, slack=0): return pr - def introns(self, by="gene", nb_cpu=1): + def introns(self, by: str = "gene") -> PyRanges: """Return the introns. Parameters @@ -155,11 +156,6 @@ def introns(self, by="gene", nb_cpu=1): by : str, {"gene", "transcript"}, default "gene" Whether to find introns per gene or transcript. - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - See Also -------- pyranges.genomicfeatures.GenomicFeaturesMethods.tss : return the transcription start sites @@ -223,7 +219,7 @@ def introns(self, by="gene", nb_cpu=1): For printing, the PyRanges was sorted on Chromosome and Strand. 
""" - kwargs = {"by": by, "nb_cpu": nb_cpu} + kwargs = {"by": by} kwargs = pr.pyranges_main.fill_kwargs(kwargs) assert by in ["gene", "transcript"] @@ -244,22 +240,26 @@ def introns(self, by="gene", nb_cpu=1): return pr.from_dfs(result) -def _outside_bounds(df, **kwargs): +def _outside_bounds(df: DataFrame, **kwargs) -> DataFrame: df = df.copy() - chromsizes = kwargs.get("chromsizes") + _chromsizes = kwargs.get("chromsizes") - if not isinstance(chromsizes, dict): - size_df = chromsizes.df + if isinstance(_chromsizes, PyRanges): + size_df = _chromsizes.df + if not size_df.Chromosome.is_unique: + raise ValueError("Chromosomes must be unique in chromsizes.") chromsizes = {k: v for k, v in zip(size_df.Chromosome, size_df.End)} + else: + assert isinstance(_chromsizes, dict) + chromsizes = _chromsizes size = int(chromsizes[df.Chromosome.iloc[0]]) clip = kwargs.get("clip", False) only_right = kwargs.get("only_right", False) ends_outright = df.End > size - if not only_right: - starts_outleft = df.Start < 0 + starts_outleft = df.Start < 0 if not clip: # i.e. remove if only_right: @@ -288,7 +288,7 @@ def _outside_bounds(df, **kwargs): return df -def genome_bounds(gr, chromsizes, clip=False, only_right=False): +def genome_bounds(gr: PyRanges, chromsizes: Dict[str, int], clip: bool = False, only_right: bool = False) -> PyRanges: """Remove or clip intervals outside of genome bounds. 
Parameters @@ -391,18 +391,15 @@ def genome_bounds(gr, chromsizes, clip=False, only_right=False): return gr.apply(_outside_bounds, chromsizes=chromsizes, clip=clip, only_right=only_right) -def _last_tile(df, **kwargs): - # do not need copy, since it is only used internally by - # tile_genome - # df = df.copy() - sizes = kwargs.get("sizes") +def _last_tile(df: DataFrame, sizes: pd.DataFrame, **kwargs) -> DataFrame: + # do not need copy, since it is only used internally by tile_genome size = sizes[df.Chromosome.iloc[0]].End.iloc[0] df.loc[df.tail(1).index, "End"] = size return df -def tile_genome(genome, tile_size, tile_last=False): +def tile_genome(chromsizes: PyRanges, tile_size: int, tile_last: bool = False) -> PyRanges: """Create a tiled genome. Parameters @@ -464,20 +461,20 @@ def tile_genome(genome, tile_size, tile_last=False): For printing, the PyRanges was sorted on Chromosome. """ - if isinstance(genome, dict): - chromosomes, ends = list(genome.keys()), list(genome.values()) + if isinstance(chromsizes, dict): + chromosomes, ends = list(chromsizes.keys()), list(chromsizes.values()) df = pd.DataFrame({"Chromosome": chromosomes, "Start": 0, "End": ends}) - genome = pr.PyRanges(df) + chromsizes = pr.PyRanges(df) - gr = genome.tile(tile_size) + gr = chromsizes.tile(tile_size) if not tile_last: - gr = gr.apply(_last_tile, sizes=genome) + gr = gr.apply(_last_tile, sizes=chromsizes) return gr -def _keep_transcript_with_most_exons(df): +def _keep_transcript_with_most_exons(df: pd.DataFrame) -> DataFrame: transcripts_with_most_exons = [] for _, gdf in df.groupby("gene_id"): @@ -491,13 +488,11 @@ def _keep_transcript_with_most_exons(df): return pd.concat(transcripts_with_most_exons).reset_index(drop=True) -def filter_transcripts(df, keep="most_exons"): +def filter_transcripts(df: pd.DataFrame) -> DataFrame: return _keep_transcript_with_most_exons(df) -def _tss(df, slack=0): - intype = df.Start.dtype - +def _tss(df: DataFrame, slack: int = 0) -> DataFrame: tss_pos = 
df.loc[df.Strand == "+"] tss_neg = df.loc[df.Strand == "-"].copy() @@ -512,17 +507,12 @@ def _tss(df, slack=0): tss.Start = tss.Start - slack tss.loc[tss.Start < 0, "Start"] = 0 - tss.index = range(len(tss)) - - tss[["Start", "End"]] = tss[["Start", "End"]].astype(intype) + tss.index = pd.Index(range(len(tss))) return tss -def _tes(df, slack=0): - intype = df.Start.dtype - # df = self.df - +def _tes(df: DataFrame, slack: int = 0) -> DataFrame: tes_pos = df.loc[df.Strand == "+"] tes_neg = df.loc[df.Strand == "-"].copy() @@ -537,9 +527,7 @@ def _tes(df, slack=0): tes.Start = tes.Start - slack tes.loc[tes.Start < 0, "Start"] = 0 - tes.index = range(len(tes)) - - tes[["Start", "End"]] = tes[["Start", "End"]].astype(intype) + tes.index = pd.Index(range(len(tes))) return tes @@ -547,11 +535,11 @@ def _tes(df, slack=0): by_to_id = {"gene": "gene_id", "transcript": "transcript_id"} -def _introns2(df, exons, **kwargs): +def _introns2(df: DataFrame, exons: DataFrame, **kwargs) -> DataFrame: """TODO: refactor""" if df.empty or exons.empty: - return None + return pd.DataFrame(columns=df.columns) original_order = df.columns by = kwargs["by"] @@ -559,12 +547,12 @@ def _introns2(df, exons, **kwargs): exons = exons[["Start", "End", id_column]] genes = df[["Start", "End", id_column]] - exons.columns = ["Start", "End", "by_id"] - genes.columns = ["Start", "End", "by_id"] + exons.columns = pd.Index(["Start", "End", "by_id"]) + genes.columns = pd.Index(["Start", "End", "by_id"]) intersection = pd.Series(np.intersect1d(exons["by_id"], genes["by_id"])) if len(intersection) == 0: - return None + return pd.DataFrame(columns=df.columns) exons = exons[exons["by_id"].isin(intersection)].reset_index(drop=True).sort_values(["by_id", "Start"]) genes = genes[genes["by_id"].isin(intersection)].reset_index(drop=True).sort_values(["by_id", "Start"]) @@ -604,13 +592,17 @@ def _introns2(df, exons, **kwargs): ) vc = introns["by_id"].value_counts(sort=False).to_frame().reset_index() - vc.columns = 
["by_id", "counts"] + vc.columns = pd.Index(["by_id", "counts"]) - genes_without_introns = pd.DataFrame(data={"by_id": np.setdiff1d(by_ids.values, vc.by_id.values), "counts": 0}) + genes_without_introns = pd.DataFrame(data={"by_id": np.setdiff1d( + np.array(by_ids.values), + np.array(vc.by_id.values)), + "counts": 0} + ) vc = pd.concat([vc, genes_without_introns]).sort_values("by_id") - original_ids = np.repeat(vc.by_id, vc.counts).to_frame() + original_ids = pd.Series(np.repeat(vc.by_id, vc.counts)).to_frame() original_ids = original_ids.merge( df[["__temp__", id_column]], right_on="__temp__", diff --git a/pyranges/get_fasta.py b/pyranges/get_fasta.py index eee4be0b..e1cfa7e9 100644 --- a/pyranges/get_fasta.py +++ b/pyranges/get_fasta.py @@ -1,11 +1,19 @@ import sys +from pathlib import Path +from typing import TYPE_CHECKING, Optional import pandas as pd +from pandas.core.frame import DataFrame +from pandas.core.series import Series import pyranges as pr # noqa: F401 +from pyranges.pyranges_main import PyRanges +if TYPE_CHECKING: + import pyfaidx # type: ignore -def get_sequence(gr, path=None, pyfaidx_fasta=None): + +def get_sequence(gr: PyRanges, path: Optional[Path] = None, pyfaidx_fasta: Optional["pyfaidx.Fasta"] = None) -> Series: """Get the sequence of the intervals from a fasta file Parameters @@ -14,7 +22,7 @@ def get_sequence(gr, path=None, pyfaidx_fasta=None): Coordinates. - path : str + path : Path Path to fasta file. It will be indexed using pyfaidx if an index is not found @@ -128,7 +136,9 @@ def get_fasta(*args, **kwargs): return get_sequence(*args, **kwargs) -def get_transcript_sequence(gr, group_by, path=None, pyfaidx_fasta=None): +def get_transcript_sequence( + gr: PyRanges, group_by: str, path: Optional[Path] = None, pyfaidx_fasta: Optional["pyfaidx.Fasta"] = None +) -> DataFrame: """Get the sequence of mRNAs, e.g. 
joining intervals corresponding to exons of the same transcript Parameters @@ -141,7 +151,7 @@ def get_transcript_sequence(gr, group_by, path=None, pyfaidx_fasta=None): intervals are grouped by this/these ID column(s): these are exons belonging to same transcript - path : str + path : Optional Path Path to fasta file. It will be indexed using pyfaidx if an index is not found @@ -223,3 +233,13 @@ def get_transcript_sequence(gr, group_by, path=None, pyfaidx_fasta=None): z["Sequence"] = get_sequence(gr, path=path, pyfaidx_fasta=pyfaidx_fasta) return z.groupby(group_by, as_index=False).agg({"Sequence": "".join}) + + +def _test(): + import doctest + + doctest.testmod() + + +if __name__ == "__main__": + _test() diff --git a/pyranges/helpers.py b/pyranges/helpers.py index cb4f099c..232e5072 100644 --- a/pyranges/helpers.py +++ b/pyranges/helpers.py @@ -1,9 +1,9 @@ -from typing import Tuple, Union +from typing import Tuple, Union, List import pandas as pd -def get_chromosomes_from_dict(dfs): +def get_chromosomes_from_dict(dfs) -> List[str]: keys = list(dfs.keys()) if isinstance(keys[0], tuple): chromosomes = [k[0] for k in keys] @@ -13,7 +13,7 @@ def get_chromosomes_from_dict(dfs): return chromosomes -def get_strands_from_dict(dfs): +def get_strands_from_dict(dfs) -> Union[List[str], List[Tuple[str, str]]]: keys = list(dfs.keys()) if isinstance(keys[0], tuple): strands = [k[1] for k in keys] @@ -32,7 +32,7 @@ def get_key_from_df(df: pd.DataFrame) -> Union[str, Tuple[str, str]]: return chromosome -def single_value_key(df): +def single_value_key(df: pd.DataFrame) -> bool: if "Strand" in df: return len(df[["Chromosome", "Strand"]].drop_duplicates(["Chromosome", "Strand"])) == 1 else: diff --git a/pyranges/multioverlap.py b/pyranges/multioverlap.py index 0dc5fa8d..d331525b 100644 --- a/pyranges/multioverlap.py +++ b/pyranges/multioverlap.py @@ -1,9 +1,11 @@ import numpy as np import pyranges as pr +from pyranges.pyranges_main import PyRanges +from typing import Dict, Optional 
-def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1): +def count_overlaps(grs: Dict[str, PyRanges], features: Optional[PyRanges] = None, strandedness: Optional[str] = None, how: Optional[str] = None) -> PyRanges: """Count overlaps in multiple pyranges. Parameters @@ -27,11 +29,6 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1): What intervals to report. By default reports all overlapping intervals. "containment" reports intervals where the overlapping is contained within it. - nb_cpu : int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - Examples -------- @@ -136,10 +133,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1): kwargs = { "as_pyranges": False, - "nb_cpu": nb_cpu, - "strandedness": strandedness, "how": how, - "nb_cpu": nb_cpu, } names = list(grs.keys()) @@ -154,7 +148,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1): gr = gr.drop() kwargs["name"] = name - features.apply_pair(gr, _count_overlaps, **kwargs) # count overlaps modifies the ranges in-place + features.apply_pair(gr, _count_overlaps, strandedness, **kwargs) # count overlaps modifies the ranges in-place def to_int(df): df[names] = df[names].astype(np.int64) diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py index 2b3b4590..5cd8e97f 100644 --- a/pyranges/pyranges_main.py +++ b/pyranges/pyranges_main.py @@ -609,7 +609,7 @@ def apply_general( return pyrange_apply_single(f, self, **kwargs) - def apply_pair(self, other: "PyRanges", f: Callable, strandedness: None = None, **kwargs) -> "PyRanges": + def apply_pair(self, other: "PyRanges", f: Callable, strandedness: Optional[str] = None, **kwargs) -> "PyRanges": """Apply a function to a pair of PyRanges. 
The function is applied to each chromosome or chromosome/strand pair found in at least one @@ -633,11 +633,6 @@ def apply_pair(self, other: "PyRanges", f: Callable, strandedness: None = None, Whether to return as a PyRanges or dict. If `f` does not return a pd.DataFrame valid for PyRanges, `as_pyranges` must be False. - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - **kwargs Additional keyword arguments to pass as keyword arguments to `f` @@ -5214,10 +5209,9 @@ def to_bigwig( from pyranges.out import _to_bigwig - if chromosome_sizes is None: - chromosome_sizes = pr.data.chromsizes() + _chromosome_sizes = pr.data.chromsizes() if chromosome_sizes is None else chromosome_sizes - result = _to_bigwig(self, path, chromosome_sizes, rpm, divide, value_col, dryrun) + result = _to_bigwig(self, path, _chromosome_sizes, rpm, divide, value_col, dryrun) if dryrun: return result diff --git a/pyranges/readers.py b/pyranges/readers.py index cbf3cbb7..cc3207bc 100644 --- a/pyranges/readers.py +++ b/pyranges/readers.py @@ -1,6 +1,8 @@ from __future__ import print_function import sys +from pathlib import Path +from typing import Union, Optional, List import pandas as pd from natsort import natsorted # type: ignore @@ -9,7 +11,7 @@ from pyranges.pyranges_main import PyRanges -def read_bed(f, as_df=False, nrows=None): +def read_bed(f: Union[str, Path], /, nrows: Optional[int] = None) -> pr.PyRanges: """Return bed file as PyRanges. This is a reader for files that follow the bed format. They can have from @@ -24,11 +26,7 @@ def read_bed(f, as_df=False, nrows=None): Path to bed file - as_df : bool, default False - - Whether to return as pandas DataFrame instead of PyRanges. - - nrows : int, default None + nrows : Optional int, default None Number of rows to return. 
@@ -55,27 +53,18 @@ def read_bed(f, as_df=False, nrows=None): +--------------+-----------+-----------+------------+-----------+--------------+ Stranded PyRanges object has 5 rows and 6 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> pr.read_bed(path, as_df=True, nrows=5) - Chromosome Start End Name Score Strand - 0 chr1 9916 10115 H3K27me3 5 - - 1 chr1 9939 10138 H3K27me3 7 + - 2 chr1 9951 10150 H3K27me3 8 - - 3 chr1 9953 10152 H3K27me3 5 + - 4 chr1 9978 10177 H3K27me3 7 - - """ columns = ( "Chromosome Start End Name Score Strand ThickStart ThickEnd ItemRGB BlockCount BlockSizes BlockStarts".split() ) - - if f.endswith(".gz"): + path = Path(f) + if path.name.endswith(".gz"): import gzip - first_start = gzip.open(f).readline().split()[1] + first_start = gzip.open(path).readline().decode().split()[1] else: - first_start = open(f).readline().split()[1] + first_start = open(path).readline().split()[1] header = None @@ -85,22 +74,19 @@ def read_bed(f, as_df=False, nrows=None): header = 0 df = pd.read_csv( - f, - dtype={"Chromosome": "category", "Strand": "category"}, + path, + dtype={"Chromosome": "category", "Strand": "category"}, # type: ignore nrows=nrows, header=header, sep="\t", ) - df.columns = columns[: df.shape[1]] + df.columns = pd.Index(columns[: df.shape[1]]) - if not as_df: - return PyRanges(df) - else: - return df + return PyRanges(df) -def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1540): +def read_bam(f: Union[str, Path], /, sparse=True, mapq=0, required_flag=0, filter_flag=1540) -> pr.PyRanges: """Return bam file as PyRanges. Parameters @@ -113,10 +99,6 @@ def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1 Whether to return only. - as_df : bool, default False - - Whether to return as pandas DataFrame instead of PyRanges. - mapq : int, default 0 Minimum mapping quality score. 
@@ -159,7 +141,7 @@ def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1 Stranded PyRanges object has 10,000 rows and 5 columns from 25 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. """ - + path = Path(f) try: import bamread # type: ignore except ImportError: @@ -185,22 +167,17 @@ def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1 sys.exit(1) if sparse: - df = bamread.read_bam(f, mapq, required_flag, filter_flag) + df = bamread.read_bam(path, mapq, required_flag, filter_flag) else: try: - df = bamread.read_bam_full(f, mapq, required_flag, filter_flag) + df = bamread.read_bam_full(path, mapq, required_flag, filter_flag) except AttributeError: print("bamread version 0.0.6 or higher is required to read bam non-sparsely.") - if as_df: - return df - else: - return PyRanges(df) - - # return bamread.read_bam(f, mapq, required_flag, filter_flag) + return PyRanges(df) -def _fetch_gene_transcript_exon_id(attribute, annotation=None): +def _fetch_gene_transcript_exon_id(attribute: pd.Series, annotation: Optional[str] = None) -> pd.DataFrame: no_quotes = attribute.str.replace('"', "").str.replace("'", "") df = no_quotes.str.extract( @@ -208,48 +185,48 @@ def _fetch_gene_transcript_exon_id(attribute, annotation=None): expand=True, ) # .iloc[:, [1, 2, 3]] - df.columns = "gene_id transcript_id exon_number exon_id".split() + df.columns = pd.Index("gene_id transcript_id exon_number exon_id".split()) if annotation == "ensembl": - newdf = [] + newdfs = [] for c in "gene_id transcript_id exon_id".split(): r = df[c].astype(str).str.extract(r"(\d+)").astype(float) - newdf.append(r) + newdfs.append(r) - newdf = pd.concat(newdf, axis=1) + newdf = pd.concat(newdfs, axis=1) newdf.insert(2, "exon_number", df["exon_number"]) df = newdf return df -def skiprows(f): +def skiprows(f: Path) -> int: try: import gzip - fh = gzip.open(f) - for i, l in enumerate(fh): - if l.decode()[0] != "#": + zh = gzip.open(f) + 
for i, zl in enumerate(zh): + if zl.decode()[0] != "#": break + zh.close() except (OSError, TypeError): # not a gzipped file, or StringIO fh = open(f) for i, l in enumerate(fh): if l[0] != "#": break - - fh.close() + fh.close() return i def read_gtf( - f, + f: Union[str, Path], + /, full=True, - as_df=False, nrows=None, duplicate_attr=False, ignore_bad: bool = False, -): +) -> pr.PyRanges: """Read files in the Gene Transfer Format. Parameters @@ -262,10 +239,6 @@ def read_gtf( Whether to read and interpret the annotation column. - as_df : bool, default False - - Whether to return as pandas DataFrame instead of PyRanges. - nrows : int, default None Number of rows to read. Default None, i.e. all. @@ -282,8 +255,7 @@ def read_gtf( ---- The GTF format encodes both Start and End as 1-based included. - PyRanges (and also the DF returned by this function, if as_df=True), instead - encodes intervals as 0-based, Start included and End excluded. + PyRanges encodes intervals as 0-based, Start included and End excluded. See Also -------- @@ -315,31 +287,32 @@ def read_gtf( >>> # 18 hidden columns: gene_name, gene_source, gene_biotype, transcript_id, transcript_version, transcript_name, transcript_source, transcript_biotype, tag, transcript_support_level, ... (+ 8 more.) 
""" - _skiprows = skiprows(f) + path = Path(f) + _skiprows = skiprows(path) if full: - gr = read_gtf_full(f, as_df, nrows, _skiprows, duplicate_attr, ignore_bad=ignore_bad) + gr = read_gtf_full(path, nrows, _skiprows, duplicate_attr, ignore_bad=ignore_bad) else: - gr = read_gtf_restricted(f, _skiprows, as_df=False, nrows=None) + gr = read_gtf_restricted(path, _skiprows, nrows=None) return gr def read_gtf_full( - f, - as_df=False, - nrows=None, - skiprows=0, - duplicate_attr=False, + f: Union[str, Path], + nrows = None, + skiprows = 0, + duplicate_attr = False, ignore_bad: bool = False, chunksize: int = int(1e5), # for unit-testing purposes -): +) -> pr.PyRanges: dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"} names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split() + path = Path(f) df_iter = pd.read_csv( - f, + path, sep="\t", header=None, names=names, @@ -353,7 +326,7 @@ def read_gtf_full( dfs = [] for df in df_iter: - extra = _to_rows(df.Attribute, ignore_bad=ignore_bad) + extra = _to_rows(df.Attribute.astype(str), ignore_bad=ignore_bad) df = df.drop("Attribute", axis=1) extra.set_index(df.index, inplace=True) ndf = pd.concat([df, extra], axis=1, sort=False) @@ -362,30 +335,25 @@ def read_gtf_full( df = pd.concat(dfs, sort=False) df.loc[:, "Start"] = df.Start - 1 - if not as_df: - return PyRanges(df) - else: - return df + return PyRanges(df) -def parse_kv_fields(line): +def parse_kv_fields(line: str) -> List[List[str]]: # rstrip: allows for GFF not having a last ";", or having final spaces return [kv.replace('""', '"NA"').replace('"', "").split(None, 1) for kv in line.rstrip("; ").split("; ")] -def to_rows(anno, ignore_bad: bool = False): - rowdicts = [] +def to_rows(anno: pd.Series, ignore_bad: bool = False) -> pd.DataFrame: try: - line = anno.head(1) - for line in line: - line.replace('"', "").replace(";", "").split() + row = anno.head(1) + for entry in row: + str(entry).replace('"', "").replace(";", 
"").split() except AttributeError: raise Exception( - "Invalid attribute string: {line}. If the file is in GFF3 format, use pr.read_gff3 instead.".format( - line=line - ) + f"Invalid attribute string: {entry}. If the file is in GFF3 format, use pr.read_gff3 instead." ) + rowdicts = [] try: for line in anno: rowdicts.append({k: v for k, v in parse_kv_fields(line)}) @@ -397,7 +365,7 @@ def to_rows(anno, ignore_bad: bool = False): return pd.DataFrame.from_records(rowdicts) -def to_rows_keep_duplicates(anno, ignore_bad: bool = False): +def to_rows_keep_duplicates(anno: pd.Series, ignore_bad: bool = False) -> pd.DataFrame: rowdicts = [] try: for line in anno: @@ -406,11 +374,9 @@ def to_rows_keep_duplicates(anno, ignore_bad: bool = False): # rstrip: allows for GFF not having a last ";", or having final spaces for k, v in tuple(parse_kv_fields(line)): if k not in rowdict: - rowdict[k] = v - elif k in rowdict and isinstance(rowdict[k], list): - rowdict[k].append(v) + rowdict[k] = [v] else: - rowdict[k] = [rowdict[k], v] + rowdict[k].append(v) rowdicts.append({k: ",".join(v) if isinstance(v, list) else v for k, v in rowdict.items()}) except ValueError: @@ -421,7 +387,7 @@ def to_rows_keep_duplicates(anno, ignore_bad: bool = False): return pd.DataFrame.from_records(rowdicts) -def read_gtf_restricted(f, skiprows, as_df=False, nrows=None): +def read_gtf_restricted(f: Union[str, Path], skiprows: Optional[int], nrows: Optional[int] = None) -> pr.PyRanges: """seqname - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix. Important note: the seqname must be one used within Ensembl, i.e. a standard chromosome name or an Ensembl identifier such as a scaffold ID, without any additional content such as species or assembly. See the example GFF output below. # source - name of the program that generated this feature, or the data source (database or project name) feature - feature type name, e.g. 
Gene, Variation, Similarity @@ -433,17 +399,18 @@ def read_gtf_restricted(f, skiprows, as_df=False, nrows=None): attribute - A semicolon-separated list of tag-value pairs, providing additional information about each feature. """ dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"} + path = Path(f) df_iter = pd.read_csv( - f, + path, sep="\t", comment="#", usecols=[0, 2, 3, 4, 5, 6, 8], header=None, names="Chromosome Feature Start End Score Strand Attribute".split(), - dtype=dtypes, + dtype=dtypes, # type: ignore chunksize=int(1e5), - skiprows=skiprows, + skiprows=skiprows if skiprows is not None else False, nrows=nrows, ) @@ -455,7 +422,7 @@ def read_gtf_restricted(f, skiprows, as_df=False, nrows=None): cols_to_concat = "Chromosome Start End Strand Feature Score".split() extract = _fetch_gene_transcript_exon_id(df.Attribute) - extract.columns = "gene_id transcript_id exon_number exon_id".split() + extract.columns = pd.Index("gene_id transcript_id exon_number exon_id".split()) extract.exon_number = extract.exon_number.astype(float) @@ -468,13 +435,10 @@ def read_gtf_restricted(f, skiprows, as_df=False, nrows=None): df.loc[:, "Start"] = df.Start - 1 - if not as_df: - return PyRanges(df) - else: - return df + return PyRanges(df) -def to_rows_gff3(anno): +def to_rows_gff3(anno) -> pd.DataFrame: rowdicts = [] for line in list(anno): @@ -485,7 +449,7 @@ def to_rows_gff3(anno): return pd.DataFrame.from_records(rowdicts).set_index(anno.index) -def read_gff3(f, full=True, annotation=None, as_df=False, nrows=None): +def read_gff3(f: Union[str, Path], full: bool = True, as_df: bool = False, nrows: Optional[int] = None) -> pr.PyRanges: """Read files in the General Feature Format. 
Parameters @@ -519,22 +483,23 @@ def read_gff3(f, full=True, annotation=None, as_df=False, nrows=None): pyranges.read_gtf : read files in the Gene Transfer Format """ - _skiprows = skiprows(f) + path = Path(f) + _skiprows = skiprows(path) if not full: - return read_gtf_restricted(f, _skiprows, as_df=as_df, nrows=nrows) + return read_gtf_restricted(path, _skiprows, nrows=nrows) dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"} names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split() df_iter = pd.read_csv( - f, + path, comment="#", sep="\t", header=None, names=names, - dtype=dtypes, + dtype=dtypes, # type: ignore chunksize=int(1e5), skiprows=_skiprows, nrows=nrows, @@ -552,13 +517,10 @@ def read_gff3(f, full=True, annotation=None, as_df=False, nrows=None): df.loc[:, "Start"] = df.Start - 1 - if not as_df: - return PyRanges(df) - else: - return df + return PyRanges(df) -def read_bigwig(f, as_df=False): +def read_bigwig(f: Union[str, Path]) -> pr.PyRanges: try: import pyBigWig # type: ignore except ModuleNotFoundError: @@ -589,7 +551,8 @@ def read_bigwig(f, as_df=False): >>> gr """ - bw = pyBigWig.open(f) + path = Path(f) + bw = pyBigWig.open(path) size = int(1e5) chromosomes = bw.chroms() @@ -626,4 +589,18 @@ def read_bigwig(f, as_df=False): } ) - return pr.PyRanges(dfs) + return pr.from_dfs(dfs) + + + + + + +def _test(): + import doctest + + doctest.testmod() + + +if __name__ == "__main__": + _test() diff --git a/tests/unit/test_io.py b/tests/unit/test_io.py index 4160571f..b85caf23 100644 --- a/tests/unit/test_io.py +++ b/tests/unit/test_io.py @@ -1,7 +1,9 @@ import numpy as np +import pandas as pd from pandas.testing import assert_frame_equal import pyranges as pr +from pyranges.readers import to_rows_keep_duplicates ensembl_gtf = "tests/unit/test_data/ensembl.gtf" @@ -57,3 +59,9 @@ def test_read_gff3(): def test_read_bed(): pr.read_bed("pyranges/example_data/chipseq.bed") + + +def 
test_to_rows_keep_duplicates(): + anno = pd.Series(["gene DDX11L1; gene sonic; unique hi"]) + result = to_rows_keep_duplicates(anno) + assert result.to_dict(orient="index") == {0: {'gene': 'DDX11L1,sonic', "unique": "hi"}} From 841e1fbe3dab9361c7298d62739e49440b675dcb Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Fri, 26 May 2023 13:35:58 +0200 Subject: [PATCH 08/10] Add types to out.py --- pyranges/__init__.py | 192 ------------------------------------ pyranges/genomicfeatures.py | 11 +-- pyranges/helpers.py | 2 +- pyranges/multioverlap.py | 10 +- pyranges/out.py | 95 +++++++++++------- pyranges/pyranges_main.py | 33 ++----- pyranges/readers.py | 26 +---- 7 files changed, 87 insertions(+), 282 deletions(-) diff --git a/pyranges/__init__.py b/pyranges/__init__.py index 4b89d8ac..d8db666f 100644 --- a/pyranges/__init__.py +++ b/pyranges/__init__.py @@ -3,7 +3,6 @@ import itertools import sys from collections import defaultdict -from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union import numpy as np @@ -413,196 +412,6 @@ def random( pyranges.statistics : statistcal methods for genomics.""" -def to_bigwig(gr: PyRanges, path: Path, chromosome_sizes=Optional[Chromsizes]): - """Write df to bigwig. - - Must contain the columns Chromosome, Start, End and Score. All others are ignored. - - Parameters - ---------- - gr: PyRanges - Intervals to write. - - path : Path - - Where to write bigwig. - - chromosome_sizes : PyRanges or dict - - If dict: map of chromosome names to chromosome length. - - Examples - -------- - - Extended example with how to prepare your data for writing bigwigs: - - >>> d = {'Chromosome': ['chr1', 'chr1', 'chr1'], 'Start': [1, 4, 6], - ... 'End': [7, 8, 10], 'Strand': ['+', '-', '-'], - ... 
'Value': [10, 20, 30]} - >>> import pyranges as pr - >>> gr = pr.from_dict(d) - >>> hg19 = pr.data.chromsizes() - >>> print(hg19) - +--------------+-----------+-----------+ - | Chromosome | Start | End | - | (category) | (int64) | (int64) | - |--------------+-----------+-----------| - | chr1 | 0 | 249250621 | - | chr2 | 0 | 243199373 | - | chr3 | 0 | 198022430 | - | chr4 | 0 | 191154276 | - | ... | ... | ... | - | chr22 | 0 | 51304566 | - | chrM | 0 | 16571 | - | chrX | 0 | 155270560 | - | chrY | 0 | 59373566 | - +--------------+-----------+-----------+ - Unstranded PyRanges object has 25 rows and 3 columns from 25 chromosomes. - For printing, the PyRanges was sorted on Chromosome. - - Overlapping intervals are invalid in bigwigs: - - >>> to_bigwig(gr, "outpath.bw", hg19) - Traceback (most recent call last): - ... - AssertionError: Can only write one strand at a time. Use an unstranded PyRanges or subset on strand first. - - >>> to_bigwig(gr["-"], "outpath.bw", hg19) - Traceback (most recent call last): - ... - AssertionError: Intervals must not overlap. - - >>> gr - +--------------+-----------+-----------+--------------+-----------+ - | Chromosome | Start | End | Strand | Value | - | (category) | (int64) | (int64) | (category) | (int64) | - |--------------+-----------+-----------+--------------+-----------| - | chr1 | 1 | 7 | + | 10 | - | chr1 | 4 | 8 | - | 20 | - | chr1 | 6 | 10 | - | 30 | - +--------------+-----------+-----------+--------------+-----------+ - Stranded PyRanges object has 3 rows and 5 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> value = gr.to_rle(rpm=False, value_col="Value") - >>> value - chr1 + - -- - +--------+-----+------+ - | Runs | 1 | 6 | - |--------+-----+------| - | Values | 0.0 | 10.0 | - +--------+-----+------+ - Rle of length 7 containing 2 elements (avg. 
length 3.5) - - chr1 - - -- - +--------+-----+------+------+------+ - | Runs | 4 | 2 | 2 | 2 | - |--------+-----+------+------+------| - | Values | 0.0 | 20.0 | 50.0 | 30.0 | - +--------+-----+------+------+------+ - Rle of length 10 containing 4 elements (avg. length 2.5) - RleDict object with 2 chromosomes/strand pairs. - - >>> raw = gr.to_rle(rpm=False) - >>> raw - chr1 + - -- - +--------+-----+-----+ - | Runs | 1 | 6 | - |--------+-----+-----| - | Values | 0.0 | 1.0 | - +--------+-----+-----+ - Rle of length 7 containing 2 elements (avg. length 3.5) - - chr1 - - -- - +--------+-----+-----+-----+-----+ - | Runs | 4 | 2 | 2 | 2 | - |--------+-----+-----+-----+-----| - | Values | 0.0 | 1.0 | 2.0 | 1.0 | - +--------+-----+-----+-----+-----+ - Rle of length 10 containing 4 elements (avg. length 2.5) - RleDict object with 2 chromosomes/strand pairs. - - >>> result = (value / raw).apply_values(np.log10) - >>> result - chr1 + - -- - +--------+-----+-----+ - | Runs | 1 | 6 | - |--------+-----+-----| - | Values | nan | 1.0 | - +--------+-----+-----+ - Rle of length 7 containing 2 elements (avg. length 3.5) - - chr1 - - -- - +--------+-----+--------------------+--------------------+--------------------+ - | Runs | 4 | 2 | 2 | 2 | - |--------+-----+--------------------+--------------------+--------------------| - | Values | nan | 1.3010300397872925 | 1.3979400396347046 | 1.4771212339401245 | - +--------+-----+--------------------+--------------------+--------------------+ - Rle of length 10 containing 4 elements (avg. length 2.5) - RleDict object with 2 chromosomes/strand pairs. 
- - >>> out = result.numbers_only().to_ranges() - >>> out - +--------------+-----------+-----------+-------------+--------------+ - | Chromosome | Start | End | Score | Strand | - | (category) | (int64) | (int64) | (float64) | (category) | - |--------------+-----------+-----------+-------------+--------------| - | chr1 | 1 | 7 | 1 | + | - | chr1 | 4 | 6 | 1.30103 | - | - | chr1 | 6 | 8 | 1.39794 | - | - | chr1 | 8 | 10 | 1.47712 | - | - +--------------+-----------+-----------+-------------+--------------+ - Stranded PyRanges object has 4 rows and 5 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. - - >>> to_bigwig(out["-"], "deleteme_reverse.bw", hg19) - >>> to_bigwig(out["+"], "deleteme_forward.bw", hg19) - """ - - try: - import pyBigWig # type: ignore - except ModuleNotFoundError: - print( - "pybigwig must be installed to create bigwigs. Use `conda install -c bioconda pybigwig` or `pip install pybigwig` to install it." - ) - import sys - - sys.exit(1) - - assert ( - len(gr.strands) <= 1 - ), "Can only write one strand at a time. Use an unstranded PyRanges or subset on strand first." - lengths = gr.lengths() - assert isinstance(lengths, pd.Series) - assert np.sum(lengths) == gr.merge().length, "Intervals must not overlap." 
- - df = gr.df - - unique_chromosomes = list(df.Chromosome.drop_duplicates()) - - if not isinstance(chromosome_sizes, dict): - size_df = chromosome_sizes.df - chromosome_sizes = {k: v for k, v in zip(size_df.Chromosome, size_df.End)} - - header = [(c, int(chromosome_sizes[c])) for c in unique_chromosomes] - - bw = pyBigWig.open(path, "w") - bw.addHeader(header) - - chromosomes = df.Chromosome.tolist() - starts = df.Start.tolist() - ends = df.End.tolist() - values = df.Score.tolist() - - bw.addEntries(chromosomes, starts, ends=ends, values=values) - - def version_info() -> None: import importlib @@ -637,7 +446,6 @@ def update_version_info(_version_info, library) -> None: __all__ = [ "from_string", "from_dict", - "to_bigwig", "count_overlaps", "random", "itergrs", diff --git a/pyranges/genomicfeatures.py b/pyranges/genomicfeatures.py index bb8d909a..298f3bcc 100644 --- a/pyranges/genomicfeatures.py +++ b/pyranges/genomicfeatures.py @@ -1,12 +1,13 @@ +from typing import Dict + import numpy as np import pandas as pd +from pandas.core.frame import DataFrame from sorted_nearest.src.introns import find_introns # type: ignore import pyranges as pr from pyranges.multithreaded import pyrange_apply -from pandas.core.frame import DataFrame from pyranges.pyranges_main import PyRanges -from typing import Dict, Optional __all__ = ["genome_bounds", "tile_genome", "GenomicFeaturesMethods"] @@ -594,10 +595,8 @@ def _introns2(df: DataFrame, exons: DataFrame, **kwargs) -> DataFrame: vc = introns["by_id"].value_counts(sort=False).to_frame().reset_index() vc.columns = pd.Index(["by_id", "counts"]) - genes_without_introns = pd.DataFrame(data={"by_id": np.setdiff1d( - np.array(by_ids.values), - np.array(vc.by_id.values)), - "counts": 0} + genes_without_introns = pd.DataFrame( + data={"by_id": np.setdiff1d(np.array(by_ids.values), np.array(vc.by_id.values)), "counts": 0} ) vc = pd.concat([vc, genes_without_introns]).sort_values("by_id") diff --git a/pyranges/helpers.py 
b/pyranges/helpers.py index 232e5072..4f238e8f 100644 --- a/pyranges/helpers.py +++ b/pyranges/helpers.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union, List +from typing import List, Tuple, Union import pandas as pd diff --git a/pyranges/multioverlap.py b/pyranges/multioverlap.py index d331525b..61275b43 100644 --- a/pyranges/multioverlap.py +++ b/pyranges/multioverlap.py @@ -1,11 +1,17 @@ +from typing import Dict, Optional + import numpy as np import pyranges as pr from pyranges.pyranges_main import PyRanges -from typing import Dict, Optional -def count_overlaps(grs: Dict[str, PyRanges], features: Optional[PyRanges] = None, strandedness: Optional[str] = None, how: Optional[str] = None) -> PyRanges: +def count_overlaps( + grs: Dict[str, PyRanges], + features: Optional[PyRanges] = None, + strandedness: Optional[str] = None, + how: Optional[str] = None, +) -> PyRanges: """Count overlaps in multiple pyranges. Parameters diff --git a/pyranges/out.py b/pyranges/out.py index cd682197..72eb8245 100644 --- a/pyranges/out.py +++ b/pyranges/out.py @@ -1,8 +1,13 @@ import csv +from pathlib import Path +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd from natsort import natsorted # type: ignore +from pandas.core.frame import DataFrame + +from pyranges.pyranges_main import PyRanges _gtf_columns = { "seqname": "Chromosome", @@ -13,7 +18,6 @@ "score": "Score", "strand": "Strand", "frame": "Frame", - # "attribute": "Attribute" # filled with all others columns } _gff3_columns = _gtf_columns.copy() @@ -43,39 +47,39 @@ ] -def _fill_missing(df, all_columns): +def _fill_missing(df: DataFrame, all_columns: List[str]) -> DataFrame: columns = list(df.columns) - if not df.get(all_columns) is None: - outdf = df.get(all_columns) + if set(columns).intersection(set(all_columns)) == set(all_columns): + return df[all_columns] else: missing = set(all_columns) - set(columns) missing_idx = {all_columns.index(m): m for m in missing} not_missing = 
set(columns).intersection(set(all_columns)) not_missing_ordered = sorted(not_missing, key=all_columns.index) - outdf = df.get(not_missing_ordered) + outdf = df[not_missing_ordered] - for idx, missing in sorted(missing_idx.items()): - outdf.insert(idx, missing, ".") + for idx, _missing in sorted(missing_idx.items()): + outdf.insert(idx, _missing, ".") - return outdf + return outdf -def _bed(df, keep): +def _bed(df: DataFrame, keep: bool) -> DataFrame: all_columns = "Chromosome Start End Name Score Strand".split() outdf = _fill_missing(df, all_columns) - noncanonical = set(df.columns) - set(all_columns) + noncanonical = list(set(df.columns) - set(all_columns)) noncanonical = [c for c in df.columns if c in noncanonical] if keep: - return pd.concat([outdf, df.get(noncanonical)], axis=1) + return pd.concat([outdf, df[noncanonical]], axis=1) else: return outdf -def _gtf(df, mapping): +def _gtf(df: DataFrame, mapping: Dict[str, str]) -> DataFrame: pr_col2gff_col = {v: k for k, v in mapping.items()} df = df.rename(columns=pr_col2gff_col) # copying here @@ -86,17 +90,17 @@ def _gtf(df, mapping): outdf = _fill_missing(df, all_columns) if "attribute" in df.columns: - attribute = mapping["attribute"] + ' "' + df.attribute + '";' + attribute = pd.Series([mapping["attribute"] + ' "' + df.attribute + '";']) else: # gotten all needed columns, need to join the rest - rest = set(df.columns) - set(all_columns) - rest = sorted(rest, key=columns.index) - rest_df = df.get(rest).copy() + _rest = set(df.columns) - set(all_columns) + rest = sorted(_rest, key=columns.index) + rest_df = df[rest].copy() for c in rest_df: - col = rest_df[c] + col = pd.Series(rest_df[c]) isnull = col.isnull() col = col.astype(str).str.replace("nan", "") - new_val = c + ' "' + col + '";' + new_val = str(c) + ' "' + col + '";' rest_df.loc[:, c] = rest_df[c].astype(str) rest_df.loc[~isnull, c] = new_val rest_df.loc[isnull, c] = "" @@ -107,7 +111,9 @@ def _gtf(df, mapping): return outdf -def _to_gtf(self, 
path=None, compression="infer", map_cols=None): +def _to_gtf( + self: PyRanges, path: Optional[str] = None, compression: str = "infer", map_cols: Optional[Dict[str, str]] = None +) -> Optional[str]: mapping = _gtf_columns.copy() if map_cols: mapping.update(map_cols) @@ -121,19 +127,26 @@ def _to_gtf(self, path=None, compression="infer", map_cols=None): for outdf in outdfs: outdf.to_csv( path, + sep="\t", index=False, header=False, compression=compression, mode=mode, - sep="\t", quoting=csv.QUOTE_NONE, - ) + ) # type: ignore mode = "a" + return None else: return "".join([outdf.to_csv(index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE) for outdf in outdfs]) -def _to_csv(self, path=None, sep=",", header=True, compression="infer"): +def _to_csv( + self: PyRanges, + path: Optional[Union[Path, str]] = None, + sep: str = ",", + header: bool = True, + compression: str = "infer", +) -> Optional[str]: gr = self if path: @@ -150,6 +163,7 @@ def _to_csv(self, path=None, sep=",", header=True, compression="infer"): ) mode = "a" header = False + return None else: return "".join( [ @@ -159,7 +173,9 @@ def _to_csv(self, path=None, sep=",", header=True, compression="infer"): ) -def _to_bed(self, path=None, sep="\t", keep=True, compression="infer"): +def _to_bed( + self: PyRanges, path: Optional[str] = None, sep: str = "\t", keep: bool = True, compression: str = "infer" +) -> Optional[str]: gr = self outdfs = natsorted(gr.dfs.items()) @@ -176,15 +192,23 @@ def _to_bed(self, path=None, sep="\t", keep=True, compression="infer"): mode=mode, sep="\t", quoting=csv.QUOTE_NONE, - ) + ) # type: ignore mode = "a" - + return None else: res = "".join([outdf.to_csv(index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE) for outdf in outdfs]) return res -def _to_bigwig(self, path, chromosome_sizes, rpm=True, divide=False, value_col=None, dryrun=False): +def _to_bigwig( + self: PyRanges, + path: None, + chromosome_sizes: Union[PyRanges, dict], + rpm: bool = True, + divide: 
Optional[bool] = False, + value_col: Optional[str] = None, + dryrun: bool = False, +) -> Optional[PyRanges]: try: import pyBigWig # type: ignore except ModuleNotFoundError: @@ -237,8 +261,12 @@ def _to_bigwig(self, path, chromosome_sizes, rpm=True, divide=False, value_col=N bw.addEntries(chromosomes, starts, ends=ends, values=values) + return None + -def _to_gff3(self, path=None, compression="infer", map_cols=None): +def _to_gff3( + self: PyRanges, path: None = None, compression: str = "infer", map_cols: Optional[Dict[str, str]] = None +) -> str: mapping = _gff3_columns.copy() if map_cols: mapping.update(map_cols) @@ -266,7 +294,7 @@ def _to_gff3(self, path=None, compression="infer", map_cols=None): ) -def _gff3(df, mapping): +def _gff3(df: DataFrame, mapping: Dict[str, str]) -> DataFrame: pr_col2gff_col = {v: k for k, v in mapping.items()} df = df.rename(columns=pr_col2gff_col) # copying here @@ -279,14 +307,15 @@ def _gff3(df, mapping): if "attribute" in mapping: attribute_name = mapping["attribute"] attribute_value = df.attribute.iloc[0] - attribute = f"{attribute_name}={attribute_value}" + attribute = pd.Series([f"{attribute_name}={attribute_value}"]) else: # gotten all needed columns, need to join the rest - rest = set(df.columns) - set(all_columns) - rest = sorted(rest, key=columns.index) - rest_df = df.get(rest).copy() + _rest = set(df.columns) - set(all_columns) + rest = sorted(_rest, key=columns.index) + rest_df = df[rest].copy() total_cols = rest_df.shape[1] - for i, c in enumerate(rest_df, 1): + for i, _c in enumerate(rest_df, 1): + c = str(_c) col = rest_df[c] isnull = col.isnull() col = col.astype(str).str.replace("nan", "") diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py index 5cd8e97f..02055748 100644 --- a/pyranges/pyranges_main.py +++ b/pyranges/pyranges_main.py @@ -5009,9 +5009,7 @@ def three_end(self) -> "PyRanges": # >>> # """ - def to_bed( - self, path: Optional[str] = None, keep: bool = True, compression: str = "infer", 
chain: bool = False - ) -> Union[str, "PyRanges"]: + def to_bed(self, path: Optional[str] = None, keep: bool = True, compression: str = "infer") -> Optional[str]: r"""Write to bed. Parameters @@ -5027,9 +5025,6 @@ def to_bed( compression : str, compression type to use, by default infer based on extension. See pandas.DataFree.to_csv for more info. - chain : bool, default False - Whether to return the PyRanges after writing. - Examples -------- @@ -5063,16 +5058,7 @@ def to_bed( chr1 1 5 . . + chr1 6 8 . . - - >>> gr.to_bed("test.bed", chain=True) - +--------------+-----------+-----------+--------------+-----------+ - | Chromosome | Start | End | Strand | Gene | - | (category) | (int64) | (int64) | (category) | (int64) | - |--------------+-----------+-----------+--------------+-----------| - | chr1 | 1 | 5 | + | 1 | - | chr1 | 6 | 8 | - | 2 | - +--------------+-----------+-----------+--------------+-----------+ - Stranded PyRanges object has 2 rows and 5 columns from 1 chromosomes. - For printing, the PyRanges was sorted on Chromosome and Strand. + >>> gr.to_bed("test.bed") >>> open("test.bed").readlines() ['chr1\t1\t5\t.\t.\t+\t1\n', 'chr1\t6\t8\t.\t.\t-\t2\n'] @@ -5081,10 +5067,7 @@ def to_bed( result = _to_bed(self, path, keep=keep, compression=compression) - if path and chain: - return self - else: - return result + return result def to_bigwig( self, @@ -5223,7 +5206,7 @@ def to_bigwig( def to_csv( self, path: Optional["Path"] = None, sep: str = ",", header: bool = True, compression: str = "infer" - ) -> Union[str, "PyRanges"]: + ) -> Optional[str]: r"""Write to comma- or other value-separated file. Parameters @@ -5391,9 +5374,8 @@ def to_gtf( self, path: None = None, compression: str = "infer", - chain: bool = False, map_cols: Optional[Dict[str, str]] = None, - ) -> str: + ) -> Optional[str]: """Write to Gene Transfer Format. The GTF format consists of a tab-separated file without header. 
@@ -5479,10 +5461,7 @@ def to_gtf( result = _to_gtf(self, path, compression=compression, map_cols=map_cols) - if path and chain: - return self - else: - return result + return result def to_rle( self, value_col: Optional[str] = None, strand: Optional[bool] = None, rpm: bool = False, nb_cpu: int = 1 diff --git a/pyranges/readers.py b/pyranges/readers.py index cc3207bc..d90f8a0c 100644 --- a/pyranges/readers.py +++ b/pyranges/readers.py @@ -2,7 +2,7 @@ import sys from pathlib import Path -from typing import Union, Optional, List +from typing import List, Optional, Union import pandas as pd from natsort import natsorted # type: ignore @@ -300,9 +300,9 @@ def read_gtf( def read_gtf_full( f: Union[str, Path], - nrows = None, - skiprows = 0, - duplicate_attr = False, + nrows=None, + skiprows=0, + duplicate_attr=False, ignore_bad: bool = False, chunksize: int = int(1e5), # for unit-testing purposes ) -> pr.PyRanges: @@ -349,9 +349,7 @@ def to_rows(anno: pd.Series, ignore_bad: bool = False) -> pd.DataFrame: for entry in row: str(entry).replace('"', "").replace(";", "").split() except AttributeError: - raise Exception( - f"Invalid attribute string: {entry}. If the file is in GFF3 format, use pr.read_gff3 instead." - ) + raise Exception(f"Invalid attribute string: {entry}. 
If the file is in GFF3 format, use pr.read_gff3 instead.") rowdicts = [] try: @@ -590,17 +588,3 @@ def read_bigwig(f: Union[str, Path]) -> pr.PyRanges: ) return pr.from_dfs(dfs) - - - - - - -def _test(): - import doctest - - doctest.testmod() - - -if __name__ == "__main__": - _test() From cc0eb202d346e6be9106cb771e0742d5c98a6a58 Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Fri, 26 May 2023 14:30:41 +0200 Subject: [PATCH 09/10] Add types to statistics --- pyranges/__init__.py | 10 --- pyranges/pyranges_main.py | 29 +++---- pyranges/statistics.py | 173 ++++++++++++++++++++------------------ 3 files changed, 105 insertions(+), 107 deletions(-) diff --git a/pyranges/__init__.py b/pyranges/__init__.py index d8db666f..fb914b2c 100644 --- a/pyranges/__init__.py +++ b/pyranges/__init__.py @@ -457,13 +457,3 @@ def update_version_info(_version_info, library) -> None: "PyRanges", "version_info", ] - - -def _test(): - import doctest - - doctest.testmod() - - -if __name__ == "__main__": - _test() diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py index 02055748..cbb70bd7 100644 --- a/pyranges/pyranges_main.py +++ b/pyranges/pyranges_main.py @@ -2289,7 +2289,7 @@ def items(self) -> Union[List[Tuple[str, pd.DataFrame]], List[Tuple[Tuple[str, s def join( self, other: "PyRanges", - strandedness: None = None, + strandedness: Optional[str] = None, how: Optional[str] = None, report_overlap: bool = False, slack: int = 0, @@ -2552,15 +2552,15 @@ def length(self) -> int: 5 """ - lengths = self.lengths(as_dict=False) + lengths = self.lengths() assert isinstance(lengths, pd.Series) length = lengths.sum() assert isinstance(length, (np.int64, int)) return int(length) def lengths( - self, as_dict: bool = False - ) -> Union[pd.Series, Dict[Tuple[str, str], pd.Series], Dict[str, pd.Series]]: + self + ) -> pd.Series: """Return the length of each interval. Parameters @@ -2617,18 +2617,15 @@ def lengths( For printing, the PyRanges was sorted on Chromosome and Strand. 
""" - if as_dict: - return {k: df.End - df.Start for k, df in self.items()} # type: ignore - else: - _lengths: List[pd.Series] = [] - if not len(self): - return pd.Series([], dtype=np.int64) - for _, df in self: - _lengths.append(df.End - df.Start) + _lengths: List[pd.Series] = [] + if not len(self): + return pd.Series([], dtype=np.int64) + for _, df in self: + _lengths.append(df.End - df.Start) - ls = pd.concat(_lengths).reset_index(drop=True) - assert isinstance(ls, pd.Series) - return ls + ls = pd.concat(_lengths).reset_index(drop=True) + assert isinstance(ls, pd.Series) + return ls def max_disjoint(self, strand: Optional[bool] = None, slack: int = 0, **kwargs) -> "PyRanges": """Find the maximal disjoint set of intervals. @@ -3561,7 +3558,7 @@ def sample(self, n: int = 8, replace: bool = False) -> "PyRanges": def set_intersect( self, other: "PyRanges", - strandedness: None = None, + strandedness: Optional[str] = None, how: Optional[str] = None, new_pos: bool = False, nb_cpu: int = 1, diff --git a/pyranges/statistics.py b/pyranges/statistics.py index b2361acd..e6d5c2aa 100644 --- a/pyranges/statistics.py +++ b/pyranges/statistics.py @@ -9,6 +9,11 @@ import pyranges as pr from pyranges.methods.statistics import _relative_distance from pyranges.multithreaded import pyrange_apply +from numpy import float64, int64, ndarray +from pandas.core.frame import DataFrame +from pandas.core.series import Series +from pyranges.pyranges_main import PyRanges +from typing import Dict, List, Optional, Union, Any __all__ = [ "simes", @@ -22,7 +27,7 @@ ] -def fdr(p_vals): +def fdr(p_vals: Series) -> Series: """Adjust p-values with Benjamini-Hochberg. Parameters @@ -76,7 +81,7 @@ def fdr(p_vals): return fdr -def fisher_exact(tp, fp, fn, tn, pseudocount=0): +def fisher_exact(tp: Series, fp: Series, fn: Series, tn: Series, pseudocount: int = 0) -> DataFrame: """Fisher's exact for contingency tables. Computes the hypotheses two-sided, less and greater at the same time. 
@@ -149,10 +154,10 @@ def fisher_exact(tp, fp, fn, tn, pseudocount=0): ) sys.exit(-1) - tp = np.array(tp, dtype=np.uint) - fp = np.array(fp, dtype=np.uint) - fn = np.array(fn, dtype=np.uint) - tn = np.array(tn, dtype=np.uint) + tp = pd.Series(np.array(tp, dtype=np.uint)) + fp = pd.Series(np.array(fp, dtype=np.uint)) + fn = pd.Series(np.array(fn, dtype=np.uint)) + tn = pd.Series(np.array(tn, dtype=np.uint)) left, right, twosided = pvalue_npy(tp, fp, fn, tn) @@ -163,7 +168,7 @@ def fisher_exact(tp, fp, fn, tn, pseudocount=0): return df -def mcc(grs, genome=None, labels=None, strand=False, verbose=False): +def mcc(grs: List[PyRanges], genome: Optional[Union[pr.PyRanges, pd.DataFrame, Dict[str, int]]] = None, labels: Optional[str] = None, strand: bool = False, verbose: bool = False) -> DataFrame: """Compute Matthew's correlation coefficient for PyRanges overlaps. Parameters @@ -218,9 +223,28 @@ def mcc(grs, genome=None, labels=None, strand=False, verbose=False): import sys from itertools import chain, combinations_with_replacement + if genome is None: + genome = defaultdict(int) + for gr in grs: + for k, v in gr: + genome[k] = max(genome[k], v.End.max()) + + + if not isinstance(genome, dict): + _genome = genome + genome_length = int(_genome.End.sum()) + else: + _genome = pd.DataFrame( + { + "Chromosome": list(genome.keys()), + "Start": 0, + "End": list(genome.values()) + } + ) + genome_length = sum(genome.values()) + if labels is None: - _labels = list(range(len(grs))) - _labels = combinations_with_replacement(_labels, r=2) + _labels = combinations_with_replacement(np.arange(len(grs)), r=2) else: assert len(labels) == len(grs) _labels = combinations_with_replacement(labels, r=2) @@ -228,18 +252,15 @@ def mcc(grs, genome=None, labels=None, strand=False, verbose=False): # remove all non-loc columns before computation grs = [gr.merge(strand=strand) for gr in grs] - if genome is not None: - if isinstance(genome, (pd.DataFrame, pr.PyRanges)): - genome_length = 
int(genome.End.sum()) - else: - genome_length = sum(genome.values()) + if _genome is not None: + genome_length = int(_genome.End.sum()) if verbose: # check that genome definition does not have many more # chromosomes than datafiles - gr_cs = set(chain(*[gr.chromosomes for gr in grs])) + gr_cs = set(chain(*[gr.Chromosome for gr in grs])) - g_cs = set(genome.chromosomes) + g_cs = set(_genome.keys()) surplus = g_cs - gr_cs if len(surplus): print( @@ -257,15 +278,7 @@ def make_stranded(df): df2.insert(df2.shape[1], "Strand", "-") return pd.concat([df, df2]) - genome = genome.apply(make_stranded) - - else: - d = defaultdict(int) - for gr in grs: - for k, v in gr: - d[k] = max(d[k], v.End.max()) - - genome_length = sum(d.values()) + _genome = _genome.apply(make_stranded) strandedness = "same" if strand else None @@ -282,7 +295,7 @@ def make_stranded(df): fp = 0 rowdicts.append({"T": lt, "F": lf, "TP": tp, "FP": fp, "TN": tn, "FN": fn, "MCC": 1}) else: - for strand in "+ -".split(): + for _strand in "+ -".split(): tp = t[strand].length fn = 0 tn = genome_length - tp @@ -291,7 +304,7 @@ def make_stranded(df): { "T": lt, "F": lf, - "Strand": strand, + "Strand": _strand, "TP": tp, "FP": fp, "TN": tn, @@ -305,17 +318,17 @@ def make_stranded(df): j = t.join(f, strandedness=strandedness) tp_gr = j.new_position("intersection").merge(strand=strand) if strand: - for strand in "+ -".split(): - tp = tp_gr[strand].length - fp = f[strand].length - tp - fn = t[strand].length - tp + for _strand in "+ -".split(): + tp = tp_gr[_strand].length + fp = f[_strand].length - tp + fn = t[_strand].length - tp tn = genome_length - (tp + fp + fn) mcc = _mcc(tp, fp, tn, fn) rowdicts.append( { "T": lt, "F": lf, - "Strand": strand, + "Strand": _strand, "TP": tp, "FP": fp, "TN": tn, @@ -327,7 +340,7 @@ def make_stranded(df): { "T": lf, "F": lt, - "Strand": strand, + "Strand": _strand, "TP": tp, "FP": fn, "TN": tn, @@ -365,12 +378,12 @@ def make_stranded(df): } ) - df = 
pd.DataFrame.from_dict(rowdicts).sort_values(["T", "F"]) + df = pd.DataFrame.from_records(rowdicts).sort_values(["T", "F"]) return df -def rowbased_spearman(x, y): +def rowbased_spearman(x: ndarray, y: ndarray) -> ndarray: """Fast row-based Spearman's correlation. Parameters @@ -416,7 +429,7 @@ def rowbased_spearman(x, y): return rowbased_pearson(rx, ry) -def rowbased_pearson(x, y): +def rowbased_pearson(x: Union[ndarray, DataFrame], y: Union[ndarray, DataFrame]) -> ndarray: """Fast row-based Pearson's correlation. Parameters @@ -475,7 +488,7 @@ def ss(a, axis): return r -def rowbased_rankdata(data): +def rowbased_rankdata(data: ndarray) -> DataFrame: """Rank order of entries in each row. Same as SciPy rankdata with method=mean. @@ -519,17 +532,14 @@ def rowbased_rankdata(data): obs = np.column_stack([np.ones(len(res), dtype=bool), res]) - dense = np.take_along_axis(np.apply_along_axis(np.cumsum, 1, obs), inv, 1) + dense = pd.DataFrame(np.take_along_axis(np.apply_along_axis(np.cumsum, 1, obs), inv, 1)) len_r = obs.shape[1] nonzero = np.count_nonzero(obs, axis=1) - obs = pd.DataFrame(obs) - nonzero = pd.Series(nonzero) - dense = pd.DataFrame(dense) - ranks = [] - for _nonzero, nzdf in obs.groupby(nonzero, sort=False): + _ranks = [] + for _nonzero, nzdf in pd.DataFrame(obs).groupby(pd.Series(nonzero), sort=False): nz = np.apply_along_axis(lambda r: np.nonzero(r)[0], 1, nzdf) _count = np.column_stack([nz, np.ones(len(nz)) * len_r]) @@ -538,14 +548,14 @@ def rowbased_rankdata(data): _result = 0.5 * (np.take_along_axis(_count, _dense, 1) + np.take_along_axis(_count, _dense - 1, 1) + 1) result = pd.DataFrame(_result, index=nzdf.index) - ranks.append(result) + _ranks.append(result) - final = pd.concat(ranks).sort_index(kind="mergesort") + final = pd.concat(_ranks).sort_index(kind="mergesort") return final -def simes(df, groupby, pcol, keep_position=False): +def simes(df: DataFrame, groupby: Union[str, List[str]], pcol: str, keep_position: bool = False) -> DataFrame: 
"""Apply Simes method for giving dependent events a p-value. Parameters @@ -634,9 +644,9 @@ def simes(df, groupby, pcol, keep_position=False): sdf = df[positions + sorter].sort_values(sorter) g = sdf.groupby(positions + groupby) - ranks = g.cumcount().values + 1 - size = g.size().values - size = np.repeat(size, size) + ranks = pd.Series(g.cumcount().values) + 1 + _size = np.array(g.size().values) + size = np.repeat(a=_size, repeats=_size) multiplied = sdf[pcol].values * size simes = multiplied / ranks @@ -665,15 +675,15 @@ def simes(df, groupby, pcol, keep_position=False): return simes -def chromsizes_as_int(chromsizes): - if isinstance(chromsizes, int): - pass - elif isinstance(chromsizes, dict): - chromsizes = sum(chromsizes.values()) +def chromsizes_as_int(chromsizes: Union[PyRanges, DataFrame, Dict[Any, int]]) -> int: + if isinstance(chromsizes, dict): + _chromsizes = sum(chromsizes.values()) elif isinstance(chromsizes, (pd.DataFrame, pr.PyRanges)): - chromsizes = chromsizes.End.sum() + _chromsizes = chromsizes.End.sum() + else: + raise TypeError("chromsizes must be dict, DataFrame or PyRanges, was {}".format(type(chromsizes))) - return chromsizes + return _chromsizes class StatisticsMethods: @@ -682,12 +692,10 @@ class StatisticsMethods: Accessed with gr.stats.""" - pr = None - - def __init__(self, pr): + def __init__(self, pr: PyRanges) -> None: self.pr = pr - def forbes(self, other, chromsizes, strandedness=None): + def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[str] = None) -> float64: """Compute Forbes coefficient. Ratio which represents observed versus expected co-occurence. 
@@ -728,27 +736,24 @@ def forbes(self, other, chromsizes, strandedness=None): >>> gr.stats.forbes(gr2, chromsizes=chromsizes) 1.7168314674978278""" - chromsizes = chromsizes_as_int(chromsizes) + _chromsizes = chromsizes_as_int(chromsizes) - self = self.pr - - kwargs = {} - kwargs["sparse"] = {"self": True, "other": True} + kwargs = {"sparse": {"self": True, "other": True}} kwargs = pr.pyranges_main.fill_kwargs(kwargs) strand = True if kwargs.get("strandedness") else False - reference_length = self.merge(strand=strand).length + reference_length = self.pr.merge(strand=strand).length query_length = other.merge(strand=strand).length intersection_sum = sum( - v.sum() for v in self.set_intersect(other, strandedness=strandedness).lengths(as_dict=True).values() + v.sum() for v in self.pr.set_intersect(other, strandedness=strandedness).lengths() ) - forbes = chromsizes * intersection_sum / (reference_length * query_length) + forbes = _chromsizes * intersection_sum / (reference_length * query_length) return forbes - def jaccard(self, other, **kwargs): + def jaccard(self, other: PyRanges, **kwargs) -> float: """Compute Jaccards coefficient. Ratio of the intersection and union of two sets. 
@@ -787,27 +792,25 @@ def jaccard(self, other, **kwargs): >>> gr.stats.jaccard(gr2, chromsizes=chromsizes) 6.657941988519211e-05""" - self = self.pr - kwargs["sparse"] = {"self": True, "other": True} kwargs = pr.pyranges_main.fill_kwargs(kwargs) strand = True if kwargs.get("strandedness") else False - intersection_sum = sum(v.sum() for v in self.set_intersect(other).lengths(as_dict=True).values()) + intersection_sum = sum(v.sum() for v in self.pr.set_intersect(other).lengths()) union_sum = 0 - for gr in [self, other]: - union_sum += sum(v.sum() for v in gr.merge(strand=strand).lengths(as_dict=True).values()) + for gr in [self.pr, other]: + union_sum += sum(v.sum() for v in gr.merge(strand=strand).lengths()) denominator = union_sum - intersection_sum if denominator == 0: - return 1 + return 1.0 else: jc = intersection_sum / denominator return jc - def relative_distance(self, other, **kwargs): + def relative_distance(self, other: PyRanges, **kwargs) -> DataFrame: """Compute spatial correllation between two sets. 
Metric which describes relative distance between each interval in one @@ -899,14 +902,12 @@ def relative_distance(self, other, **kwargs): 49 0.49 194 9956 0.019486 """ - self = self.pr - kwargs["sparse"] = {"self": True, "other": True} kwargs = pr.pyranges_main.fill_kwargs(kwargs) - result = pyrange_apply(_relative_distance, self, other, **kwargs) # pylint: disable=E1132 + dfs = pyrange_apply(_relative_distance, self.pr, other, **kwargs) - result = pd.Series(np.concatenate(list(result.values()))) + result = pd.Series(np.concatenate(list(dfs.values()))) not_nan = ~np.isnan(result) result.loc[not_nan] = np.floor(result[not_nan] * 100) / 100 @@ -920,7 +921,7 @@ def relative_distance(self, other, **kwargs): return vc -def _mcc(tp, fp, tn, fn): +def _mcc(tp: int, fp: int, tn: int, fn: int) -> float: # https://stackoverflow.com/a/56875660/992687 x = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) return ((tp * tn) - (fp * fn)) / sqrt(x) @@ -950,3 +951,13 @@ def _mcc(tp, fp, tn, fn): # _tetrachoric = cos(180/(1 + sqrt((b * c) / (a * d)))) # return _tetrachoric + + +def _test(): + import doctest + + doctest.testmod() + + +if __name__ == "__main__": + _test() From 02b458f1d5991fda1cdcc11e685f20fc4830d69c Mon Sep 17 00:00:00 2001 From: endre bakken stovner Date: Fri, 26 May 2023 17:44:47 +0200 Subject: [PATCH 10/10] Add types to getitem/subset --- pyranges/methods/getitem.py | 9 ++- pyranges/methods/summary.py | 13 ++-- pyranges/out.py | 15 ++-- pyranges/pyranges_main.py | 17 +---- pyranges/statistics.py | 65 ++++++++--------- pyranges/subset.py | 135 +++++++++++++++++++----------------- 6 files changed, 126 insertions(+), 128 deletions(-) diff --git a/pyranges/methods/getitem.py b/pyranges/methods/getitem.py index bdcd01b4..8d9d941a 100644 --- a/pyranges/methods/getitem.py +++ b/pyranges/methods/getitem.py @@ -3,7 +3,7 @@ import pyranges as pr from pyranges.methods.drop import _keep -from pyranges.subset import get_booldict, get_slice, get_string, get_tuple +from 
pyranges.subset import get_2_tuple, get_booldict, get_chromosome_strand_loc, get_slice, get_string def _getitem(self, val): @@ -12,7 +12,12 @@ def _getitem(self, val): elif isinstance(val, str): dfs = get_string(self, val) elif isinstance(val, tuple): - dfs = get_tuple(self, val) + if len(val) == 2: + dfs = get_2_tuple(self, val[0], val[1]) + elif len(val) == 3: + dfs = get_chromosome_strand_loc(self, val[0], val[1], val[2]) + else: + raise ValueError("Indexing tuple must be of length 2 or 3. Tuple was: {}".format(str(val))) elif isinstance(val, slice): dfs = get_slice(self, val) elif isinstance(val, dict): diff --git a/pyranges/methods/summary.py b/pyranges/methods/summary.py index 166f6f57..4b6c6e57 100644 --- a/pyranges/methods/summary.py +++ b/pyranges/methods/summary.py @@ -7,28 +7,27 @@ def _summary(self, to_stdout=True, return_df=False): lengths = {} total_lengths = {} - lengths["pyrange"] = self.lengths(as_dict=True) + lengths["pyrange"] = self.lengths() total_lengths["pyrange"] = [self.length] if self.stranded: c = self.merge(strand=True) - lengths["coverage_forward"] = c["+"].lengths(as_dict=True) - lengths["coverage_reverse"] = c["-"].lengths(as_dict=True) + lengths["coverage_forward"] = c["+"].lengths() + lengths["coverage_reverse"] = c["-"].lengths() total_lengths["coverage_forward"] = [c["+"].length] total_lengths["coverage_reverse"] = [c["-"].length] else: c = self c = c.merge(strand=False) - lengths["coverage_unstranded"] = c.lengths(as_dict=True) + lengths["coverage_unstranded"] = c.lengths() total_lengths["coverage_unstranded"] = [c.length] summaries = OrderedDict() # statistics for lengths - for summary, d in lengths.items(): - if d: - summaries[summary] = pd.concat(d.values()).describe() + for summary, s in lengths.items(): + summaries[summary] = s.describe() summary = pd.concat(summaries.values(), axis=1) summary.columns = list(summaries) diff --git a/pyranges/out.py b/pyranges/out.py index 72eb8245..7e7872a2 100644 --- a/pyranges/out.py +++ 
b/pyranges/out.py @@ -90,7 +90,7 @@ def _gtf(df: DataFrame, mapping: Dict[str, str]) -> DataFrame: outdf = _fill_missing(df, all_columns) if "attribute" in df.columns: - attribute = pd.Series([mapping["attribute"] + ' "' + df.attribute + '";']) + attribute = mapping["attribute"] + ' "' + df.attribute + '";' else: # gotten all needed columns, need to join the rest _rest = set(df.columns) - set(all_columns) @@ -294,7 +294,7 @@ def _to_gff3( ) -def _gff3(df: DataFrame, mapping: Dict[str, str]) -> DataFrame: +def _gff3(df, mapping) -> pd.DataFrame: pr_col2gff_col = {v: k for k, v in mapping.items()} df = df.rename(columns=pr_col2gff_col) # copying here @@ -307,15 +307,14 @@ def _gff3(df: DataFrame, mapping: Dict[str, str]) -> DataFrame: if "attribute" in mapping: attribute_name = mapping["attribute"] attribute_value = df.attribute.iloc[0] - attribute = pd.Series([f"{attribute_name}={attribute_value}"]) + attribute = f"{attribute_name}={attribute_value}" else: # gotten all needed columns, need to join the rest - _rest = set(df.columns) - set(all_columns) - rest = sorted(_rest, key=columns.index) - rest_df = df[rest].copy() + rest = set(df.columns) - set(all_columns) + _rest = sorted(rest, key=columns.index) + rest_df = df.get(_rest).copy() total_cols = rest_df.shape[1] - for i, _c in enumerate(rest_df, 1): - c = str(_c) + for i, c in enumerate(rest_df, 1): col = rest_df[c] isnull = col.isnull() col = col.astype(str).str.replace("nan", "") diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py index cbb70bd7..6708d20a 100644 --- a/pyranges/pyranges_main.py +++ b/pyranges/pyranges_main.py @@ -2558,18 +2558,12 @@ def length(self) -> int: assert isinstance(length, (np.int64, int)) return int(length) - def lengths( - self - ) -> pd.Series: + def lengths(self) -> pd.Series: """Return the length of each interval. Parameters ---------- - as_dict : bool, default False - - Whether to return lengths as pd.Series or dict of pd.Series per key. 
- Returns ------- pd.Series or dict of pd.Series with the lengths of each interval. @@ -4283,11 +4277,6 @@ def subset(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRan Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use chromosome/strand pairs if the PyRanges is stranded. - nb_cpu : int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. - **kwargs Additional keyword arguments to pass as keyword arguments to `f` @@ -5402,10 +5391,6 @@ def to_gtf( Which compression to use. Uses file extension to infer by default. - chain: bool, default False - - Whether to return the PyRanges after writing. - map_cols: dict, default None Override mapping between GTF and PyRanges fields for any number of columns. diff --git a/pyranges/statistics.py b/pyranges/statistics.py index e6d5c2aa..d7336779 100644 --- a/pyranges/statistics.py +++ b/pyranges/statistics.py @@ -2,18 +2,18 @@ from collections import defaultdict from math import sqrt +from typing import Any, Dict, List, Optional, Union import numpy as np import pandas as pd +from numpy import ndarray +from pandas.core.frame import DataFrame +from pandas.core.series import Series import pyranges as pr from pyranges.methods.statistics import _relative_distance from pyranges.multithreaded import pyrange_apply -from numpy import float64, int64, ndarray -from pandas.core.frame import DataFrame -from pandas.core.series import Series from pyranges.pyranges_main import PyRanges -from typing import Dict, List, Optional, Union, Any __all__ = [ "simes", @@ -154,21 +154,27 @@ def fisher_exact(tp: Series, fp: Series, fn: Series, tn: Series, pseudocount: in ) sys.exit(-1) - tp = pd.Series(np.array(tp, dtype=np.uint)) - fp = pd.Series(np.array(fp, dtype=np.uint)) - fn = pd.Series(np.array(fn, dtype=np.uint)) - tn = pd.Series(np.array(tn, dtype=np.uint)) + _tp = np.array(tp, dtype=np.uint) + _fp = 
np.array(fp, dtype=np.uint) + _fn = np.array(fn, dtype=np.uint) + _tn = np.array(tn, dtype=np.uint) - left, right, twosided = pvalue_npy(tp, fp, fn, tn) + left, right, twosided = pvalue_npy(_tp, _fp, _fn, _tn) - OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount)) + OR = ((_tp + pseudocount) / (_fp + pseudocount)) / ((_fn + pseudocount) / (_tn + pseudocount)) df = pd.DataFrame({"OR": OR, "P": twosided, "PLeft": left, "PRight": right}) return df -def mcc(grs: List[PyRanges], genome: Optional[Union[pr.PyRanges, pd.DataFrame, Dict[str, int]]] = None, labels: Optional[str] = None, strand: bool = False, verbose: bool = False) -> DataFrame: +def mcc( + grs: List["PyRanges"], + genome: Optional[Union["PyRanges", pd.DataFrame, Dict[str, int]]] = None, + labels: Optional[str] = None, + strand: bool = False, + verbose: bool = False, +) -> DataFrame: """Compute Matthew's correlation coefficient for PyRanges overlaps. Parameters @@ -229,18 +235,11 @@ def mcc(grs: List[PyRanges], genome: Optional[Union[pr.PyRanges, pd.DataFrame, D for k, v in gr: genome[k] = max(genome[k], v.End.max()) - if not isinstance(genome, dict): _genome = genome genome_length = int(_genome.End.sum()) else: - _genome = pd.DataFrame( - { - "Chromosome": list(genome.keys()), - "Start": 0, - "End": list(genome.values()) - } - ) + _genome = pd.DataFrame({"Chromosome": list(genome.keys()), "Start": 0, "End": list(genome.values())}) genome_length = sum(genome.values()) if labels is None: @@ -555,7 +554,7 @@ def rowbased_rankdata(data: ndarray) -> DataFrame: return final -def simes(df: DataFrame, groupby: Union[str, List[str]], pcol: str, keep_position: bool = False) -> DataFrame: +def simes(df, groupby, pcol, keep_position=False): """Apply Simes method for giving dependent events a p-value. 
Parameters @@ -644,9 +643,9 @@ def simes(df: DataFrame, groupby: Union[str, List[str]], pcol: str, keep_positio sdf = df[positions + sorter].sort_values(sorter) g = sdf.groupby(positions + groupby) - ranks = pd.Series(g.cumcount().values) + 1 - _size = np.array(g.size().values) - size = np.repeat(a=_size, repeats=_size) + ranks = g.cumcount().values + 1 + size = g.size().values + size = np.repeat(size, size) multiplied = sdf[pcol].values * size simes = multiplied / ranks @@ -692,10 +691,15 @@ class StatisticsMethods: Accessed with gr.stats.""" - def __init__(self, pr: PyRanges) -> None: + def __init__(self, pr: "PyRanges") -> None: self.pr = pr - def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[str] = None) -> float64: + def forbes( + self, + other: "PyRanges", + chromsizes: Union["PyRanges", DataFrame, Dict[Any, int]], + strandedness: Optional[str] = None, + ) -> float: """Compute Forbes coefficient. Ratio which represents observed versus expected co-occurence. 
@@ -734,7 +738,8 @@ def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[s >>> gr, gr2 = pr.data.chipseq(), pr.data.chipseq_background() >>> chromsizes = pr.data.chromsizes() >>> gr.stats.forbes(gr2, chromsizes=chromsizes) - 1.7168314674978278""" + 1.7168314674978278 + """ _chromsizes = chromsizes_as_int(chromsizes) @@ -745,9 +750,7 @@ def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[s reference_length = self.pr.merge(strand=strand).length query_length = other.merge(strand=strand).length - intersection_sum = sum( - v.sum() for v in self.pr.set_intersect(other, strandedness=strandedness).lengths() - ) + intersection_sum = self.pr.set_intersect(other, strandedness=strandedness).lengths().sum() forbes = _chromsizes * intersection_sum / (reference_length * query_length) @@ -796,11 +799,11 @@ def jaccard(self, other: PyRanges, **kwargs) -> float: kwargs = pr.pyranges_main.fill_kwargs(kwargs) strand = True if kwargs.get("strandedness") else False - intersection_sum = sum(v.sum() for v in self.pr.set_intersect(other).lengths()) + intersection_sum = self.pr.set_intersect(other).lengths().sum() union_sum = 0 for gr in [self.pr, other]: - union_sum += sum(v.sum() for v in gr.merge(strand=strand).lengths()) + union_sum += gr.merge(strand=strand).lengths().sum() denominator = union_sum - intersection_sum if denominator == 0: diff --git a/pyranges/subset.py b/pyranges/subset.py index 11a9ba42..4ab1544c 100644 --- a/pyranges/subset.py +++ b/pyranges/subset.py @@ -1,12 +1,18 @@ +from typing import Any, Dict, List, Tuple, Union + import pandas as pd from ncls import NCLS # type: ignore +from numpy import int64 +from pandas.core.frame import DataFrame + +from pyranges.pyranges_main import PyRanges -def create_ncls(df): +def create_ncls(df: DataFrame) -> NCLS: return NCLS(df.Start.values, df.End.values, df.index.values) -def find_overlaps(df, start, end): +def find_overlaps(df: DataFrame, start: int, end: Union[int64, int]) -> 
List[Union[int, Any]]: n = create_ncls(df) idxes = [] @@ -16,106 +22,107 @@ def find_overlaps(df, start, end): return idxes -def get_slice(self, val): +def get_slice(self: PyRanges, val: slice) -> Union[Dict[str, DataFrame], Dict[Tuple[str, str], DataFrame]]: # 100:999 - d = {} - - for k, df in self.items(): - start = val.start or 0 - stop = val.stop or max(df.End.max(), start) - idxes = find_overlaps(df, start, stop) - d[k] = df.reindex(idxes) - - return d + if self.stranded: + sd = {} + for sk, sdf in self._dfs_with_strand.items(): + start = val.start or 0 + stop = val.stop or max(sdf.End.max(), start) + idxes = find_overlaps(sdf, start, stop) + sd[sk] = sdf.reindex(idxes) + return sd + else: + d = {} + for k, df in self._dfs_without_strand.items(): + start = val.start or 0 + stop = val.stop or max(df.End.max(), start) + idxes = find_overlaps(df, start, stop) + d[k] = df.reindex(idxes) + return d -def get_string(self, val): +def get_string(self: PyRanges, val: str) -> Union[Dict[Tuple[str, str], DataFrame], Dict[str, DataFrame]]: if val in self.chromosomes: if self.stranded: - return {k: self.dfs[k] for k in self.keys() if k[0] == val} + return {k: df for k, df in self._dfs_with_strand.items() if k[0] == val} else: - return {val: self.dfs[val]} - + return {val: df for k, df in self._dfs_without_strand.items() if k == val} elif val in "+ -".split(): - return {k: v for k, v in self.items() if k[1] == val} + return {k: v for k, v in self._dfs_with_strand.items() if k[1] == val} else: - return {} - - -def get_tuple(self, val): - if len(val) == 2: - dfs = get_double(self, val) - elif len(val) == 3: - dfs = get_triple(self, val) - - return dfs + d: Dict[str, DataFrame] = {} + return d + + +def get_2_tuple( + self: PyRanges, first: str, second: Union[str, slice] +) -> Union[Dict[str, DataFrame], Dict[Tuple[str, str], DataFrame]]: + if isinstance(first, str) and first in "+-" and isinstance(second, slice): + return get_strand_and_slice(self, strand=first, loc=second) + 
if isinstance(first, (int, str)) and isinstance(second, str): + return get_chromosome_and_strand(self, chromosome=first, strand=second) + if isinstance(first, (int, str)) and isinstance(second, slice): + return get_chromosome_and_slice(self, chromosome=first, loc=second) + else: + raise TypeError(f"Incorrect types: {type(first)}, {type(second)}") -def get_double(self, val): - if len(val) == 2 and val[0] in self.chromosomes and isinstance(val[1], slice): - chromosome, loc = val +def get_chromosome_and_slice( + self: PyRanges, chromosome: str, loc: slice +) -> Union[Dict[str, DataFrame], Dict[Tuple[str, str], DataFrame]]: + if chromosome in self.chromosomes: start = loc.start or 0 if self.stranded: - dfs = {k: df for k, df in self.items() if k[0] == chromosome} - max_end = max([df.End.max() for df in dfs.values()]) + dfs = [df for (c, _), df in self._dfs_with_strand.items() if c == chromosome] else: - dfs = {val[0]: self.dfs[val[0]]} - max_end = list(dfs.values())[0].End.max() + dfs = [df for c, df in self._dfs_without_strand.items() if c == chromosome] + max_end = max([df.End.max() for df in dfs]) # in case 1:None stop = loc.stop or max(max_end, start) - dfs2 = {} - for k, df in dfs.items(): - idxes = find_overlaps(df, start, stop) - if idxes: - dfs2[k] = df.loc[idxes] + out_dfs = [df[find_overlaps(df, start, stop)] for df in dfs] + + return PyRanges(pd.concat(out_dfs)).dfs - return dfs2 +def get_strand_and_slice(self: PyRanges, strand: str, loc: slice) -> Dict[Tuple[str, str], DataFrame]: # "+", 5:10 - if len(val) == 2 and val[0] in "+ -".split() and isinstance(val[1], slice): - strand, loc = val - start = loc.start or 0 + start = loc.start or 0 - dfs = {k: df for k, df in self.items() if k[1] == strand} - max_end = max([df.End.max() for df in dfs.values()]) + dfs = [df for (c, s), df in self._dfs_with_strand.items() if s == strand] + max_end = max([df.End.max() for df in dfs]) - stop = loc.stop or max(max_end, start) + stop = loc.stop or max(max_end, start) - dfs2 
= {} - for k, df in dfs.items(): - idxes = find_overlaps(df, start, stop) - if idxes: - dfs2[k] = df.loc[idxes] + out_dfs = [df[find_overlaps(df, start, stop)] for df in dfs] - return dfs2 + return {k: v for k, v in PyRanges(pd.concat(out_dfs))._dfs_with_strand.items()} - # "chr1", "+" - if len(val) == 2 and val[1] in "+ -".split(): - chromosome, strand = val - if (chromosome, strand) in self.dfs: - return {(chromosome, strand): self.dfs[chromosome, strand]} - else: - return {} +# "chr1", "+" +def get_chromosome_and_strand( + self: PyRanges, chromosome: Union[int, str], strand: str +) -> Dict[Tuple[str, str], DataFrame]: + return {k: df for k, df in self._dfs_with_strand.items() if k == (chromosome, strand)} -def get_triple(self, val): +def get_chromosome_strand_loc( + self: PyRanges, chromosome: str, strand: str, loc: slice +) -> Dict[Tuple[str, str], DataFrame]: # "chr1", "+", 5:10 - chromosome, strand, loc = val start = loc.start or 0 if strand not in "+ -".split(): - raise Exception("Strand '{}' invalid.".format(val)) + raise Exception("Strand '{}' invalid.".format(strand)) r = self[chromosome, strand].values() if len(r): df = r[0] else: - df = pd.DataFrame(columns="Chromosome Start End".split()) - return df + return {} max_end = df.End.max()