From b720e2c3c2f2847370e7776119e994b8a878c6a3 Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Fri, 19 May 2023 18:20:26 +0200
Subject: [PATCH 01/10] Fix tostring2

---
 pyproject.toml                            |  2 +-
 pyranges/tostring2.py                     |  4 ++--
 tests/property_based/hypothesis_helper.py | 11 +++--------
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c015f467..cbdb2d59 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pyranges"
-version = "0.0.126"
+version = "0.0.127"
 description = "GenomicRanges for Python."
 readme = "README.md"
 authors = [{ name = "Endre Bakken Stovner", email = "endbak@pm.me" }]
diff --git a/pyranges/tostring2.py b/pyranges/tostring2.py
index cf03444a..de9d939a 100644
--- a/pyranges/tostring2.py
+++ b/pyranges/tostring2.py
@@ -195,7 +195,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int
         new_build_df = pd.concat([build_df, df[c]], axis=1)
 
         new_str_repr = tabulate(
-            new_build_df.to_dict(orient="records"), headers=list(new_build_df.columns), tablefmt="psql", showindex=False
+            new_build_df, headers=list(new_build_df.columns), tablefmt="psql", showindex=False  # type: ignore
         )
 
         table_width = len(new_str_repr.split("\n", 1)[0])
@@ -209,7 +209,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int
     if i < total_columns:
         new_build_df = add_hidden_col_dotdot(build_df, len(original_header[i:]))
         str_repr = tabulate(
-            new_build_df.to_dict(orient="records"), headers=list(new_build_df.columns), tablefmt="psql", showindex=False
+            new_build_df, headers=list(new_build_df.columns), tablefmt="psql", showindex=False  # type: ignore
         )
 
     return str_repr, original_header[i:]
diff --git a/tests/property_based/hypothesis_helper.py b/tests/property_based/hypothesis_helper.py
index f2c14a84..575e3c1d 100644
--- a/tests/property_based/hypothesis_helper.py
+++ b/tests/property_based/hypothesis_helper.py
@@ -8,14 +8,9 @@
 import pyranges as pr
 from pyranges import PyRanges
 
-if environ.get("GITHUB_ACTIONS"):
-    max_examples = 15
-    slow_max_examples = 5
-    deadline = None
-else:
-    max_examples = 1000
-    slow_max_examples = 100
-    deadline = None
+max_examples = 15
+slow_max_examples = 5
+deadline = None
 
 lengths = st.integers(min_value=1, max_value=int(1e7))
 small_lengths = st.integers(min_value=1, max_value=int(1e4))

From 01ab973b180bc2ffc366db5b353a9992ee5a8378 Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Sun, 21 May 2023 16:20:38 +0200
Subject: [PATCH 02/10] Add types to pyranges_main

---
 pyranges/__init__.py      |  24 +-
 pyranges/helpers.py       |   7 +-
 pyranges/methods/attr.py  |   3 +-
 pyranges/methods/join.py  |   2 +-
 pyranges/multithreaded.py |  44 ---
 pyranges/pyranges_main.py | 722 ++++++++++++++++++++------------------
 6 files changed, 391 insertions(+), 411 deletions(-)

diff --git a/pyranges/__init__.py b/pyranges/__init__.py
index c75b4be3..5bf17528 100644
--- a/pyranges/__init__.py
+++ b/pyranges/__init__.py
@@ -28,7 +28,7 @@
 read_gff = read_gtf
 
 
-def from_dict(d, int64=False):
+def from_dict(d):
     """Create a PyRanges from dict.
 
     Parameters
@@ -37,10 +37,6 @@ def from_dict(d, int64=False):
 
         Dict with data.
 
-    int64 : bool, default False.
-
-        Whether to use 64-bit integers for starts and ends.
-
     Warning
     -------
 
@@ -69,10 +65,10 @@ def from_dict(d, int64=False):
     For printing, the PyRanges was sorted on Chromosome and Strand.
     """
 
-    return PyRanges(pd.DataFrame(d), int64=int64)
+    return PyRanges(pd.DataFrame(d))
 
 
-def from_string(s, int64=False):
+def from_string(s):
     """Create a PyRanges from multiline string.
 
     Parameters
@@ -81,10 +77,6 @@ def from_string(s, int64=False):
 
         String with data.
 
-    int64 : bool, default False.
-
-        Whether to use 64-bit integers for starts and ends.
-
     See Also
     --------
 
@@ -120,7 +112,7 @@ def from_string(s, int64=False):
 
     df = pd.read_csv(StringIO(s), sep=r"\s+", index_col=None)
 
-    return PyRanges(df, int64=int64)
+    return PyRanges(df)
 
 
 def itergrs(prs, strand=None, keys=False):
@@ -233,7 +225,7 @@ def itergrs(prs, strand=None, keys=False):
         return iter(natsorted(grs_per_chromosome.items()))
 
 
-def random(n=1000, length=100, chromsizes=None, strand=True, int64=False, seed=None):
+def random(n=1000, length=100, chromsizes=None, strand=True, seed=None):
     """Return PyRanges with random intervals.
 
     Parameters
@@ -254,9 +246,9 @@ def random(n=1000, length=100, chromsizes=None, strand=True, int64=False, seed=N
 
         Data should have strand.
 
-    int64 : bool, default False
+    seed : int, default None
 
-        Use int64 to represent Start and End.
+        Seed for random number generator.
 
     Examples
     --------
@@ -328,7 +320,7 @@ def random(n=1000, length=100, chromsizes=None, strand=True, int64=False, seed=N
         s = np.random.choice("+ -".split(), size=n)
         random_df.insert(3, "Strand", s)
 
-    return PyRanges(random_df, int64=int64)
+    return PyRanges(random_df)
 
 
 """Namespace for statistcal functions.
diff --git a/pyranges/helpers.py b/pyranges/helpers.py
index d9bcbe5d..cb4f099c 100644
--- a/pyranges/helpers.py
+++ b/pyranges/helpers.py
@@ -1,3 +1,8 @@
+from typing import Tuple, Union
+
+import pandas as pd
+
+
 def get_chromosomes_from_dict(dfs):
     keys = list(dfs.keys())
     if isinstance(keys[0], tuple):
@@ -18,7 +23,7 @@ def get_strands_from_dict(dfs):
     return strands
 
 
-def get_key_from_df(df):
+def get_key_from_df(df: pd.DataFrame) -> Union[str, Tuple[str, str]]:
     chromosome = df.Chromosome.head(1).iloc[0]
     if "Strand" in df:
         strand = df.Strand.head(1).iloc[0]
diff --git a/pyranges/methods/attr.py b/pyranges/methods/attr.py
index cd9cdc18..ca0f7bac 100644
--- a/pyranges/methods/attr.py
+++ b/pyranges/methods/attr.py
@@ -53,9 +53,8 @@ def _setattr(self, column_name, column, pos=False):
     if column_name not in ["Chromosome", "Strand"]:
         self.__dict__["dfs"] = dfs
     else:
-        int64 = True if self.dtypes["Start"] == np.int64 else False
         # will merge the dfs, then split on keys again to ensure they are correct
-        self.__dict__["dfs"] = pr.PyRanges(pr.PyRanges(dfs).df, int64=int64).dfs
+        self.__dict__["dfs"] = pr.PyRanges(pr.PyRanges(dfs).df).dfs
 
 
 def _getattr(self, name):
diff --git a/pyranges/methods/join.py b/pyranges/methods/join.py
index 3ad5c8cf..1662c352 100644
--- a/pyranges/methods/join.py
+++ b/pyranges/methods/join.py
@@ -4,7 +4,7 @@
 
 
 def _both_indexes(scdf, ocdf, how=False, **kwargs):
-    assert (how in "containment first last outer right left".split() + [False, None]) or isinstance(how, int)
+    assert (how in "containment first last outer right left".split() + [False, None]) or isinstance(how, int), how
     starts = scdf.Start.values
     ends = scdf.End.values
     indexes = scdf.index.values
diff --git a/pyranges/multithreaded.py b/pyranges/multithreaded.py
index ccfee396..7505ea60 100644
--- a/pyranges/multithreaded.py
+++ b/pyranges/multithreaded.py
@@ -480,47 +480,3 @@ def _extend_grp(df, **kwargs):
     assert (df.Start < df.End).all(), "Some intervals are negative or zero length after applying extend!"
 
     return df
-
-
-def pyrange_apply_chunks(function, self, as_pyranges, **kwargs):
-    nparams = get_n_args(function)
-    nb_cpu = kwargs.get("nb_cpu", 1)
-    if nb_cpu > 1:
-        import ray  # type: ignore
-
-        with suppress_stdout_stderr():
-            ray.init(num_cpus=nb_cpu, ignore_reinit_error=True)
-
-    function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu)
-
-    keys = []
-    lengths = []
-    results = []
-    for k, v in self.items():
-        dfs = np.array_split(v, nb_cpu)
-        lengths.append(len(dfs))
-        results.extend([call_f_single(function, nparams, df, **kwargs) for df in dfs])
-        keys.append(k)
-
-    results = get(results)
-
-    _results = []
-    start = 0
-    for _, length in zip(keys, lengths):
-        end = start + length
-        _r = results[start:end]
-
-        if as_pyranges:
-            _results.append(pd.concat(_r))
-        else:
-            _results.append(_r)
-
-        start = end
-
-    results = _results
-    if nb_cpu > 1:
-        ray.shutdown()
-
-    results = process_results(results, keys)
-
-    return results
diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py
index 4b5e921a..21278540 100644
--- a/pyranges/pyranges_main.py
+++ b/pyranges/pyranges_main.py
@@ -1,4 +1,6 @@
 """Data structure for genomic intervals and their annotation."""
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -6,21 +8,22 @@
 
 import pyranges as pr
 from pyranges.methods.intersection import _intersection, _overlap
-from pyranges.multithreaded import (
-    _extend,
-    _extend_grp,
-    _tes,
-    _tss,
-    pyrange_apply,
-    pyrange_apply_chunks,
-    pyrange_apply_single,
-)
+from pyranges.multithreaded import _extend, _extend_grp, _tes, _tss, pyrange_apply, pyrange_apply_single
 from pyranges.tostring2 import tostring
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from pandas.core.indexes.base import Index
+    from pyrle.rledict import RleDict  # type: ignore
+
 __all__ = ["PyRanges"]
 
 
-def fill_kwargs(kwargs):
+ChromosomeLocation = Union[str, Tuple[str, str]]
+
+
+def fill_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
     """Give the kwargs dict default options."""
 
     defaults = {
@@ -55,7 +58,7 @@ class PyRanges:
 
     Parameters
     ----------
-    df : pandas.DataFrame or dict of pandas.DataFrame, default None
+    df : DataFrame or dict of DataFrame, default None
         The data to be stored in the PyRanges.
 
     chromosomes : array-like or scalar value, default None
@@ -71,7 +74,7 @@ class PyRanges:
         The strands in the PyRanges.
 
     copy_df : bool, default True
-        Copy input pandas.DataFrame
+        Copy input DataFrame
 
     See Also
     --------
@@ -87,7 +90,7 @@ class PyRanges:
     -----
 
     A PyRanges object is represented internally as a dictionary efficiency. The keys are
-    chromosomes or chromosome/strand tuples and the values are pandas DataFrames.
+    chromosomes or chromosome/strand tuples and the values are pandas DataFrames.
 
     Examples
     --------
@@ -139,8 +142,8 @@ class PyRanges:
     For printing, the PyRanges was sorted on Chromosome and Strand.
     """
 
-    dfs = None
-    """Dict mapping chromosomes or chromosome/strand pairs to pandas DataFrames."""
+    dfs: Union[Dict[str, pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]
+    """Dict mapping chromosomes or chromosome/strand pairs to pandas DataFrames."""
 
     features = None
     """Namespace for genomic-features methods.
@@ -162,14 +165,13 @@ class PyRanges:
 
     def __init__(
         self,
-        df=None,
-        chromosomes=None,
-        starts=None,
-        ends=None,
-        strands=None,
-        int64=False,
-        copy_df=True,
-    ):
+        df: Optional[Union[pd.DataFrame, Dict[Union[str], pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]] = None,
+        chromosomes: Optional[str] = None,
+        starts: Optional[Tuple[int, int]] = None,
+        ends: Optional[List[int]] = None,
+        strands: Optional[Tuple[str, str]] = None,
+        copy_df: bool = True,
+    ) -> None:
         from pyranges.methods.init import _init
 
         if df is None and chromosomes is None:
@@ -177,7 +179,7 @@ def __init__(
 
         _init(self, df, chromosomes, starts, ends, strands, copy_df)
 
-    def __array_ufunc__(self, *args, **kwargs):
+    def __array_ufunc__(self, *args, **kwargs) -> "PyRanges":
         """Apply unary numpy-function.
 
 
@@ -236,7 +238,7 @@ def __array_ufunc__(self, *args, **kwargs):
 
         # self.apply()
 
-    def __getattr__(self, name):
+    def __getattr__(self, name: str) -> pd.Series:
         """Return column.
 
         Parameters
@@ -247,7 +249,7 @@ def __getattr__(self, name):
 
         Returns
         -------
-        pandas.Series
+        pandas.Series
 
         Example
         -------
@@ -264,7 +266,7 @@ def __getattr__(self, name):
 
         return _getattr(self, name)
 
-    def __setattr__(self, column_name, column):
+    def __setattr__(self, column_name: str, column: Any) -> None:
         """Insert or update column.
 
         Parameters
@@ -273,7 +275,7 @@ def __setattr__(self, column_name, column):
 
             Name of column to update or insert.
 
-        column : list, np.array or pd.Series
+        column : list, np.array or pd.Series
 
             Data to insert.
 
@@ -315,7 +317,7 @@ def __setattr__(self, column_name, column):
                         )
                     )
 
-    def __getitem__(self, val):
+    def __getitem__(self, val: Any) -> "PyRanges":
         """Fetch columns or subset on position.
 
         If a list is provided, the column(s) in the list is returned. This subsets on columns.
@@ -326,7 +328,7 @@ def __getitem__(self, val):
 
         Parameters
         ----------
-        val : bool array/Series, tuple, list, str or slice
+        val : bool array/pd.Series, tuple, list, str or slice
 
             Data to fetch.
 
@@ -356,7 +358,7 @@ def __getitem__(self, val):
         Stranded PyRanges object has 2,446 rows and 7 columns from 1 chromosomes.
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
-        Create boolean Series and use it to subset:
+        Create boolean pd.Series and use it to subset:
 
         >>> s = (gr.Feature == "gene") | (gr.gene_id == "ENSG00000223972")
         >>> gr[s]
@@ -462,16 +464,16 @@ def __iter__(self):
 
         return iter(self.items())
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of intervals in the PyRanges."""
         return sum([len(d) for d in self.values()])
 
-    def __str__(self):
+    def __str__(self) -> str:
         """Return string representation."""
 
         return tostring(self)
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         """Return REPL representation."""
 
         return str(self)
@@ -481,42 +483,32 @@ def _repr_html_(self):
 
         return self.df._repr_html_()
 
-    def apply(self, f, strand=None, as_pyranges=True, nb_cpu=1, **kwargs):
+    def apply(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRanges":
         """Apply a function to the PyRanges.
 
         Parameters
         ----------
         f : function
-            Function to apply on each DataFrame in a PyRanges
+            Function to apply on each pd.DataFrame in a PyRanges
 
-        strand : bool, default None, i.e. auto
+        strand : Optional[bool], default None, i.e. auto
 
             Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use
             chromosome/strand pairs if the PyRanges is stranded.
 
-        as_pyranges : bool, default True
-
-            Whether to return as a PyRanges or dict. If `f` does not return a DataFrame valid for
-            PyRanges, `as_pyranges` must be False.
-
-        nb_cpu: int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
         **kwargs
             Additional keyword arguments to pass as keyword arguments to `f`
 
         Returns
         -------
-        PyRanges or dict
-            Result of applying f to each DataFrame in the PyRanges
+        PyRanges
+            Result of applying f to each pd.DataFrame in the PyRanges
 
         See also
         --------
 
         pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges
-        pyranges.PyRanges.apply_chunks: apply a row-based function to a PyRanges in parallel
+        pyranges.PyRanges.apply_general: apply a function to a PyRanges and return a Dict[keys, Any]
 
         Note
         ----
@@ -541,12 +533,6 @@ def apply(self, f, strand=None, as_pyranges=True, nb_cpu=1, **kwargs):
         Stranded PyRanges object has 4 rows and 4 columns from 2 chromosomes.
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
-        >>> gr.apply(lambda df: len(df), as_pyranges=False)
-        {('1', '+'): 2, ('2', '+'): 1, ('2', '-'): 1}
-
-        >>> gr.apply(lambda df: len(df), as_pyranges=False, strand=False)
-        {'1': 2, '2': 2}
-
         >>> def add_to_ends(df, **kwargs):
         ...     df.loc[:, "End"] = kwargs["slack"] + df.End
         ...     return df
@@ -573,79 +559,67 @@ def apply(self, f, strand=None, as_pyranges=True, nb_cpu=1, **kwargs):
 
         result = pyrange_apply_single(f, self, **kwargs)
 
-        if not as_pyranges:
-            return result
-        else:
-            return PyRanges(result)
-
-    def apply_chunks(self, f, as_pyranges=False, nb_cpu=1, **kwargs):
-        """Apply a row-based function to arbitrary partitions of the PyRanges.
+        return PyRanges(result)
 
-        apply_chunks speeds up the application of functions where the result is not affected by
-        applying the function to ordered, non-overlapping splits of the data.
+    def apply_general(
+        self, f: Callable, strand: Optional[bool] = None, **kwargs
+    ) -> Union[Dict[str, Any], Dict[Tuple[str, str], Any]]:
+        """Apply a function to the PyRanges and return a dict of the results.
 
         Parameters
         ----------
         f : function
-            Row-based or associative function to apply on the partitions.
+            Function to apply on each pd.DataFrame in a PyRanges
 
-        as_pyranges : bool, default False
-
-            Whether to return as a PyRanges or dict.
-
-        nb_cpu: int, default 1
+        strand : Optional[bool], default None, i.e. auto
 
-            How many cpus to use. The data is split into nb_cpu partitions.
+            Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use
+            chromosome/strand pairs if the PyRanges is stranded.
 
         **kwargs
             Additional keyword arguments to pass as keyword arguments to `f`
 
         Returns
         -------
-        dict of lists
-            Result of applying f to each partition of the DataFrames in the PyRanges.
+        dict
+            Result of applying f to each pd.DataFrame, keyed by chromosome or chromosome/strand tuple
 
         See also
         --------
 
+        pyranges.PyRanges.apply: apply a function to a PyRanges and return a PyRanges
         pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges
-        pyranges.PyRanges.apply_chunks: apply a row-based function to a PyRanges in parallel
 
         Note
         ----
 
-        apply_chunks will only lead to speedups on large datasets or slow-running functions. Using
-        it with nb_cpu=1 is pointless; use apply instead.
+        This is the function used internally to carry out almost all unary PyRanges methods.
 
         Examples
         --------
 
-        >>> gr = pr.from_dict({"Chromosome": [1, 1, 1], "Start": [2, 3, 5], "End": [9, 4, 6]})
-        >>> gr
-        +--------------+-----------+-----------+
-        |   Chromosome |     Start |       End |
-        |   (category) |   (int64) |   (int64) |
-        |--------------+-----------+-----------|
-        |            1 |         2 |         9 |
-        |            1 |         3 |         4 |
-        |            1 |         5 |         6 |
-        +--------------+-----------+-----------+
-        Unstranded PyRanges object has 3 rows and 3 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
+        >>> gr = pr.from_dict({"Chromosome": [1, 1, 2, 2], "Strand": ["+", "+", "-", "+"],
+        ...                    "Start": [1, 4, 2, 9], "End": [2, 27, 13, 10]})
+
+        >>> gr.apply_general(lambda df: len(df))
+        {('1', '+'): 2, ('2', '+'): 1, ('2', '-'): 1}
 
-        >>> gr.apply_chunks(
-        ... lambda df, **kwargs: list(df.End + kwargs["add"]), nb_cpu=1, add=1000)
-        {'1': [[1009, 1004, 1006]]}
+        >>> gr.apply_general(lambda df: len(df), strand=False)
+        {'1': 2, '2': 2}
         """
 
+        if strand is None:
+            strand = self.stranded
+
+        kwargs.update({"strand": strand})
         kwargs.update(kwargs.get("kwargs", {}))
         kwargs = fill_kwargs(kwargs)
 
-        result = pyrange_apply_chunks(f, self, as_pyranges, **kwargs)
+        return pyrange_apply_single(f, self, **kwargs)
 
-        return result
-
-    def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs):
+    def apply_pair(
+        self, other: "PyRanges", f: Callable, strandedness: None = None, as_pyranges: bool = True, **kwargs
+    ) -> Union[Dict[Tuple[str, str], Tuple[int, int]], "PyRanges"]:
         """Apply a function to a pair of PyRanges.
 
         The function is applied to each chromosome or chromosome/strand pair found in at least one
@@ -654,7 +628,7 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs):
         Parameters
         ----------
         f : function
-            Row-based or associative function to apply on the DataFrames.
+            Row-based or associative function to apply on the pd.DataFrames.
 
         strandedness : {None, "same", "opposite", False}, default None, i.e. auto
 
@@ -664,7 +638,7 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs):
 
         as_pyranges : bool, default False
 
-            Whether to return as a PyRanges or dict. If `f` does not return a DataFrame valid for
+            Whether to return as a PyRanges or dict. If `f` does not return a pd.DataFrame valid for
             PyRanges, `as_pyranges` must be False.
 
         nb_cpu: int, default 1
@@ -678,7 +652,7 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs):
         Returns
         -------
         dict of lists
-            Result of applying f to each partition of the DataFrames in the PyRanges.
+            Result of applying f to each partition of the pd.DataFrames in the PyRanges.
 
         See also
         --------
@@ -752,20 +726,20 @@ def apply_pair(self, other, f, strandedness=None, as_pyranges=True, **kwargs):
         else:
             return PyRanges(result)
 
-    def as_df(self):
-        """Return PyRanges as DataFrame.
+    def as_df(self) -> pd.DataFrame:
+        """Return PyRanges as pd.DataFrame.
 
         Returns
         -------
-        DataFrame
+        pd.DataFrame
 
-            A DataFrame natural sorted on Chromosome and Strand. The ordering of rows within
+            A pd.DataFrame natural sorted on Chromosome and Strand. The ordering of rows within
             chromosomes and strands is preserved.
 
         See also
         --------
 
-        PyRanges.df : Return PyRanges as DataFrame.
+        PyRanges.df : Return PyRanges as pd.DataFrame.
 
         Examples
         --------
@@ -800,7 +774,7 @@ def as_df(self):
         else:
             return pd.concat(self.values()).reset_index(drop=True)
 
-    def assign(self, col, f, strand=None, nb_cpu=1, **kwargs):
+    def assign(self, col: str, f: Callable, strand: Optional[bool] = None, nb_cpu: int = 1, **kwargs) -> "PyRanges":
         """Add or replace a column.
 
         Does not change the original PyRanges.
@@ -815,7 +789,7 @@ def assign(self, col, f, strand=None, nb_cpu=1, **kwargs):
         f : function
             Function to create new column.
 
-        strand : bool, default None, i.e. auto
+        strand : Optional[bool], default None, i.e. auto
 
             Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use
             chromosome/strand pairs if the PyRanges is stranded.
@@ -889,7 +863,7 @@ def assign(self, col, f, strand=None, nb_cpu=1, **kwargs):
 
         first_result = next(iter(result.values()))
 
-        assert isinstance(first_result, pd.Series), "result of assign function must be Series, but is {}".format(
+        assert isinstance(first_result, pd.Series), "result of assign function must be pd.Series, but is {}".format(
             type(first_result)
         )
 
@@ -899,7 +873,7 @@ def assign(self, col, f, strand=None, nb_cpu=1, **kwargs):
 
         return new_self
 
-    def boundaries(self, group_by, agg=None):
+    def boundaries(self, group_by: str, agg: Optional[Dict[str, Union[str, Callable]]] = None) -> "PyRanges":
         """Return the boundaries of groups of intervals (e.g. transcripts)
 
         Parameters
@@ -913,7 +887,7 @@ def boundaries(self, group_by, agg=None):
 
             Defines how to aggregate metadata columns. Provided as
             dictionary of column names -> functions, function names or list of such,
-            as accepted by the Pandas.DataFrame.agg method.
+            as accepted by the pd.DataFrame.agg method.
 
 
         Returns
@@ -971,7 +945,7 @@ def boundaries(self, group_by, agg=None):
         result = pyrange_apply_single(_bounds, self, **kwargs)
         return pr.PyRanges(result)
 
-    def calculate_frame(self, by):
+    def calculate_frame(self, by: Union[str, List[str]]) -> "PyRanges":
         """Calculate the frame of each genomic interval, assuming all are coding sequences (CDS), and add it as column inplace.
 
         After this, the input Pyranges will contain an added "Frame" column, which determines the base of the CDS that is the first base of a codon.
@@ -987,17 +961,15 @@ def calculate_frame(self, by):
 
         Returns
         -------
-        None
-            The "Frame" column is added inplace.
-
+        PyRanges
 
         Examples
         --------
-        >>> p= pr.from_dict({"Chromosome": [1,1,1,2,2],
-        ...                  "Strand": ["+","+","+","-","-"],
-        ...                  "Start": [1,31,52,101,201],
-        ...                  "End": [10,45,90,130,218],
-        ...                  "transcript_id": ["t1","t1","t1","t2","t2"] })
+        >>> p = pr.from_dict({"Chromosome": [1,1,1,2,2],
+        ...                   "Strand": ["+","+","+","-","-"],
+        ...                   "Start": [1,31,52,101,201],
+        ...                   "End": [10,45,90,130,218],
+        ...                   "transcript_id": ["t1","t1","t1","t2","t2"]})
         >>> p
         +--------------+--------------+-----------+-----------+-----------------+
         |   Chromosome | Strand       |     Start |       End | transcript_id   |
@@ -1013,7 +985,6 @@ def calculate_frame(self, by):
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
         >>> p.calculate_frame(by=['transcript_id'])
-        >>> p
         +--------------+--------------+-----------+-----------+-----------------+-----------+
         |   Chromosome | Strand       |     Start |       End | transcript_id   |     Frame |
         |   (category) | (category)   |   (int64) |   (int64) | (object)        |   (int64) |
@@ -1028,22 +999,26 @@ def calculate_frame(self, by):
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
         """
+        _self = self.copy()
         # Column to save the initial index
-        self.__index__ = np.arange(len(self))
+        _self.__index__ = np.arange(len(self))
 
         # Filtering for desired columns
-        lst = by if type(by) is list else [by]
-        sorted_p = self[["Strand", "__index__"] + lst]
+        if isinstance(by, str):
+            lst = [by]
+        else:
+            lst = by
+        sorted_p = _self[["Strand", "__index__"] + lst]
 
         # Sorting by 5' (Intervals on + are sorted by ascending order and - are sorted by descending order)
         sorted_p = sorted_p.sort(by="5")
 
         # Creating a column saving the length for the intervals (for selenoprofiles and ensembl)
-        sorted_p.__length__ = sorted_p.End - sorted_p.Start
+        sorted_p.__length__ = sorted_p.lengths()
 
-        # Creating a column saving the cummulative length for the intervals
-        for k, df in sorted_p:
-            sorted_p.dfs[k]["__cumsum__"] = df.groupby(by=by).__length__.cumsum()
+        # Creating a column saving the cumulative length for the intervals
+        for df in sorted_p.values():
+            df["__cumsum__"] = df.groupby(by=by).__length__.cumsum()
 
         # Creating a frame column
         sorted_p.Frame = sorted_p.__cumsum__ - sorted_p.__length__
@@ -1051,13 +1026,13 @@ def calculate_frame(self, by):
         # Appending the Frame of sorted_p by the index of p
         sorted_p = sorted_p.apply(lambda df: df.sort_values(by="__index__"))
 
-        self.Frame = sorted_p.Frame
+        _self.Frame = sorted_p.Frame
 
         # Drop __index__ column
-        self.apply(lambda df: df.drop("__index__", axis=1, inplace=True))
+        return _self.apply(lambda df: df.drop("__index__", axis=1))
 
     @property
-    def chromosomes(self):
+    def chromosomes(self) -> List[str]:
         """Return chromosomes in natsorted order."""
 
         if self.stranded:
@@ -1065,7 +1040,13 @@ def chromosomes(self):
         else:
             return natsorted(set([k for k in self.keys()]))
 
-    def cluster(self, strand=None, by=None, slack=0, count=False, nb_cpu=1):
+    def cluster(
+        self,
+        strand: Optional[bool] = None,
+        by: Optional[Union[List[str], str]] = None,
+        slack: int = 0,
+        count: bool = False,
+    ) -> "PyRanges":
         """Give overlapping intervals a common id.
 
         Parameters
@@ -1184,27 +1165,27 @@ def cluster(self, strand=None, by=None, slack=0, count=False, nb_cpu=1):
         Stranded PyRanges object has 2,446 rows and 7 columns from 1 chromosomes.
         For printing, the PyRanges was sorted on Chromosome and Strand.
         """
-
+        _self = self.copy()
         if strand is None:
-            strand = self.stranded
+            strand = _self.stranded
 
         kwargs = {"strand": strand, "slack": slack, "count": count, "by": by}
         kwargs = fill_kwargs(kwargs)
 
-        _stranded = self.stranded
+        _stranded = _self.stranded
         if not strand and _stranded:
-            self.Strand2 = self.Strand
-            self = self.unstrand()
+            _self.__Strand__ = _self.Strand
+            _self = _self.unstrand()
 
         if not by:
             from pyranges.methods.cluster import _cluster
 
-            df = pyrange_apply_single(_cluster, self, **kwargs)
+            df = pyrange_apply_single(_cluster, _self, **kwargs)
         else:
             from pyranges.methods.cluster import _cluster_by
 
             kwargs["by"] = by
-            df = pyrange_apply_single(_cluster_by, self, **kwargs)
+            df = pyrange_apply_single(_cluster_by, _self, **kwargs)
 
         gr = PyRanges(df)
 
@@ -1224,13 +1205,12 @@ def cluster(self, strand=None, by=None, slack=0, count=False, nb_cpu=1):
             new_dfs[k] = v
 
         if not strand and _stranded:
-            new_dfs = {k: d.rename(columns={"Strand2": "Strand"}) for k, d in new_dfs.items()}
-
-        self = PyRanges(new_dfs)
-
-        return self
+            renamed = [d.rename(columns={"__Strand__": "Strand"}) for d in new_dfs.values()]
+            return PyRanges._zip_locationkey_and_data(new_dfs.keys(), renamed, strand=True)
+        else:
+            return PyRanges._zip_locationkey_and_data(new_dfs.keys(), new_dfs.values(), strand=strand)
 
-    def copy(self):
+    def copy(self) -> "PyRanges":
         """Make a deep copy of the PyRanges.
 
         Notes
@@ -1241,7 +1221,7 @@ def copy(self):
         return self.apply(lambda df: df.copy(deep=True))
 
     @property
-    def columns(self):
+    def columns(self) -> "Index":
         """Return the column labels of the PyRanges.
 
         Returns
@@ -1284,20 +1264,18 @@ def columns(self):
         """
 
         if not len(self.values()):
-            return []
+            return pd.Index([])
 
         first = next(iter(self.values()))
-        columns = first.columns
-
-        return columns
+        return first.columns
 
     def count_overlaps(
         self,
-        other,
-        strandedness=None,
-        keep_nonoverlapping=True,
-        overlap_col="NumberOverlaps",
-    ):
+        other: "PyRanges",
+        strandedness: None = None,
+        keep_nonoverlapping: bool = True,
+        overlap_col: str = "NumberOverlaps",
+    ) -> "PyRanges":
         """Count number of overlaps per interval.
 
         Count how many intervals in self overlap with those in other.
@@ -1388,13 +1366,13 @@ def count_overlaps(
 
     def coverage(
         self,
-        other,
-        strandedness=None,
-        keep_nonoverlapping=True,
-        overlap_col="NumberOverlaps",
-        fraction_col="FractionOverlaps",
-        nb_cpu=1,
-    ):
+        other: "PyRanges",
+        strandedness: None = None,
+        keep_nonoverlapping: bool = True,
+        overlap_col: str = "NumberOverlaps",
+        fraction_col: str = "FractionOverlaps",
+        nb_cpu: int = 1,
+    ) -> "PyRanges":
         """Count number of overlaps and their fraction per interval.
 
         Count how many intervals in self overlap with those in other.
@@ -1501,17 +1479,17 @@ def coverage(
         return counts
 
     @property
-    def df(self):
-        """Return PyRanges as DataFrame.
+    def df(self) -> pd.DataFrame:
+        """Return PyRanges as pd.DataFrame.
 
         See also
         --------
 
-        PyRanges.as_df : return PyRanges as DataFrame."""
+        PyRanges.as_df : return PyRanges as pd.DataFrame."""
 
         return self.as_df()
 
-    def drop(self, drop=None, like=None):
+    def drop(self, drop: Optional[str] = None, like: Optional[str] = None) -> "PyRanges":
         """Drop column(s).
 
         If no arguments are given, all the columns except Chromosome, Start, End and Strand are
@@ -1591,7 +1569,7 @@ def drop(self, drop=None, like=None):
 
         return _drop(self, drop, like)
 
-    def drop_duplicate_positions(self, strand=None, keep="first"):
+    def drop_duplicate_positions(self, strand: Optional[bool] = None, keep: Union[bool, str] = "first") -> "PyRanges":
         """Return PyRanges with duplicate postion rows removed.
 
         Parameters
@@ -1667,15 +1645,12 @@ def drop_duplicate_positions(self, strand=None, keep="first"):
         if strand is None:
             strand = self.stranded
 
-        kwargs = {}
-        kwargs["sparse"] = {"self": False}
-        kwargs["keep"] = keep
+        kwargs = {"sparse": {"self": False}, "keep": keep, "strand": strand and self.stranded}
         kwargs = fill_kwargs(kwargs)
-        kwargs["strand"] = strand and self.stranded
         return PyRanges(pyrange_apply_single(_drop_duplicate_positions, self, **kwargs))
 
     @property
-    def dtypes(self):
+    def dtypes(self) -> pd.Series:
         """Return the dtypes of the PyRanges.
 
         Examples
@@ -1715,12 +1690,12 @@ def dtypes(self):
         return df.dtypes
 
     @property
-    def empty(self):
+    def empty(self) -> bool:
         """Indicate whether PyRanges is empty."""
 
         return len(self) == 0
 
-    def extend(self, ext, group_by=None):
+    def extend(self, ext: Union[Dict[str, int], int], group_by: None = None) -> "PyRanges":
         """Extend the intervals from the ends.
 
         Parameters
@@ -1835,7 +1810,7 @@ def extend(self, ext, group_by=None):
 
     # @profile
 
-    def five_end(self):
+    def five_end(self) -> "PyRanges":
         """Return the five prime end of intervals.
 
         The five prime end is the start of a forward strand or the end of a reverse strand.
@@ -1888,7 +1863,7 @@ def five_end(self):
         kwargs = fill_kwargs({"strand": self.stranded})
         return PyRanges(pyrange_apply_single(_tss, self, **kwargs))
 
-    def head(self, n=8):
+    def head(self, n: int = 8) -> "PyRanges":
         """Return the n first rows.
 
         Parameters
@@ -1949,12 +1924,14 @@ def head(self, n=8):
         subsetter[:n] = True
         return self[subsetter]
 
-    def insert(self, other, loc=None):
+    def insert(
+        self, other: Union[pd.DataFrame, pd.Series, Dict[str, pd.Series]], loc: Optional[int] = None
+    ) -> "PyRanges":
         """Add one or more columns to the PyRanges.
 
         Parameters
         ----------
-        other : Series, DataFrame or dict
+        other : pd.Series, pd.DataFrame or dict
             Data to insert into the PyRanges. `other` must have the same number of rows as the PyRanges.
 
         loc : int, default None, i.e. after last column of PyRanges.
@@ -1968,7 +1945,7 @@ def insert(self, other, loc=None):
         Note
         ----
 
-        If a Series, or a dict of Series is used, the Series must have a name.
+        If a pd.Series, or a dict of pd.Series is used, the pd.Series must have a name.
 
         Examples
         --------
@@ -2024,8 +2001,8 @@ def insert(self, other, loc=None):
         Unstranded PyRanges object has 4 rows and 5 columns from 3 chromosomes.
         For printing, the PyRanges was sorted on Chromosome.
 
-        >>> arbitrary_result = gr.apply(
-        ... lambda df: pd.Series(df.Start + df.End, name="Hi!"), as_pyranges=False)
+        >>> arbitrary_result = gr.apply_general(
+        ... lambda df: pd.Series(df.Start + df.End, name="Hi!"))
         >>> arbitrary_result
         {'E': 1     9
         2    15
@@ -2055,11 +2032,11 @@ def insert(self, other, loc=None):
         from pyranges.methods.attr import _setattr
 
         if isinstance(other, (pd.Series, pd.DataFrame)):
-            assert len(other) == len(self), "Pandas Series or DataFrame must be same length as PyRanges!"
+            assert len(other) == len(self), "Pandas pd.Series or pd.DataFrame must be same length as PyRanges!"
 
             if isinstance(other, pd.Series):
                 if not other.name:
-                    raise Exception("Series must have a name!")
+                    raise Exception("pd.Series must have a name!")
 
                 _setattr(self, other.name, other, loc)
 
@@ -2072,7 +2049,7 @@ def insert(self, other, loc=None):
             first = next(iter(other.values()))
             is_dataframe = isinstance(first, pd.DataFrame)
             if is_dataframe:
-                columns = first.columns
+                columns = [str(c) for c in first.columns]
 
                 ds = []
                 for c in columns:
@@ -2083,14 +2060,16 @@ def insert(self, other, loc=None):
                     loc += 1
             else:
                 if not first.name:
-                    raise Exception("Series must have a name!")
+                    raise Exception("pd.Series must have a name!")
 
                 d = {k: v for k, v in other.items()}
                 _setattr(self, first.name, d, loc)
 
         return self
 
-    def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1):
+    def intersect(
+        self, other: "PyRanges", strandedness: Optional[bool] = None, how: Optional[str] = None, invert: bool = False
+    ) -> "PyRanges":
         """Return overlapping subintervals.
 
         Returns the segments of the intervals in self which overlap with those in other.
@@ -2116,11 +2095,6 @@ def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1):
 
             Whether to return the intervals without overlaps.
 
-        nb_cpu: int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
         Returns
         -------
         PyRanges
@@ -2197,9 +2171,8 @@ def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1):
         For printing, the PyRanges was sorted on Chromosome.
         """
 
-        kwargs = {"how": how, "strandedness": strandedness, "nb_cpu": nb_cpu}
+        kwargs = {"how": how, "strandedness": strandedness, "sparse": {"self": False, "other": True}}
         kwargs = fill_kwargs(kwargs)
-        kwargs["sparse"] = {"self": False, "other": True}
 
         if len(self) == 0:
             return self
@@ -2212,26 +2185,26 @@ def intersect(self, other, strandedness=None, how=None, invert=False, nb_cpu=1):
 
         if invert:
             found_idxs = getattr(result, "__ix__", [])
-            result = self[~self.__ix__.isin(found_idxs)]
+            result = self[~pd.Series(self.__ix__).isin(found_idxs)]
             result = result.drop("__ix__")
 
         return result
 
-    def items(self):
-        """Return the pairs of keys and DataFrames.
+    def items(self) -> Union[List[Tuple[str, pd.DataFrame]], List[Tuple[Tuple[str, str], pd.DataFrame]]]:
+        """Return the pairs of keys and pd.DataFrames.
 
         Returns
         -------
         dict
 
-            The dict mapping keys to DataFrames in the PyRanges.
+            The dict mapping keys to pd.DataFrames in the PyRanges.
 
         See Also
         --------
 
         PyRanges.chromosomes : return the chromosomes
         PyRanges.keys : return the keys
-        PyRanges.values : return the DataFrames in the PyRanges
+        PyRanges.values : return the pd.DataFrames in the PyRanges
 
         Examples
         --------
@@ -2248,16 +2221,16 @@ def items(self):
 
     def join(
         self,
-        other,
-        strandedness=None,
-        how=None,
-        report_overlap=False,
-        slack=0,
-        suffix="_b",
-        nb_cpu=1,
-        apply_strand_suffix=None,
-        preserve_order=False,
-    ):
+        other: "PyRanges",
+        strandedness: None = None,
+        how: Optional[str] = None,
+        report_overlap: bool = False,
+        slack: int = 0,
+        suffix: str = "_b",
+        nb_cpu: int = 1,
+        apply_strand_suffix: None = None,
+        preserve_order: bool = False,
+    ) -> "PyRanges":
         """Join PyRanges on genomic location.
 
         Parameters
@@ -2401,7 +2374,7 @@ def join(
 
         from pyranges.methods.join import _write_both
 
-        kwargs = {
+        kwargs: Dict[str, Any] = {
             "strandedness": strandedness,
             "how": how,
             "report_overlap": report_overlap,
@@ -2451,7 +2424,7 @@ def join(
 
         return gr
 
-    def keys(self):
+    def keys(self) -> Union[List[str], List[Tuple[str, str]]]:
         """Return the keys.
 
         Returns
@@ -2480,16 +2453,16 @@ def keys(self):
 
     def k_nearest(
         self,
-        other,
-        k=1,
-        ties=None,
-        strandedness=None,
-        overlap=True,
-        how=None,
-        suffix="_b",
-        nb_cpu=1,
-        apply_strand_suffix=None,
-    ):
+        other: "PyRanges",
+        k: Union[List[int], int] = 1,
+        ties: Optional[str] = None,
+        strandedness: None = None,
+        overlap: bool = True,
+        how: Optional[str] = None,
+        suffix: str = "_b",
+        nb_cpu: int = 1,
+        apply_strand_suffix: None = None,
+    ) -> "PyRanges":
         """Find k nearest intervals.
 
         Parameters
@@ -2498,7 +2471,7 @@ def k_nearest(
 
             PyRanges to find nearest interval in.
 
-        k : int or list/array/Series of int
+        k : int or list/array/pd.Series of int
 
             Number of closest to return. If iterable, must be same length as PyRanges.
 
@@ -2718,29 +2691,24 @@ def k_nearest(
         kwargs = fill_kwargs(kwargs)
         kwargs["stranded"] = self.stranded and other.stranded
 
-        overlap = kwargs.get("overlap", True)
-        ties = kwargs.get("ties", False)
-
-        self = self.copy()
+        _self = self.copy()
 
         if isinstance(k, pd.Series):
             k = k.values
 
         # how many to nearest to find; might be different for each
-        self.__k__ = k
+        _self.__k__ = k
         # give each their own unique ID
-        self.__IX__ = np.arange(len(self))
+        _self.__IX__ = np.arange(len(_self))
 
-        dfs = pyrange_apply(_nearest, self, other, **kwargs)
+        dfs = pyrange_apply(_nearest, _self, other, **kwargs)
         nearest = PyRanges(dfs)
 
         if not overlap:
             result = nearest
         else:
-            from collections import defaultdict
-
-            overlap_how = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")]
-            overlaps = self.join(
+            overlap_how = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")]  # type: ignore
+            overlaps = _self.join(
                 other,
                 strandedness=strandedness,
                 how=overlap_how,
@@ -2805,7 +2773,7 @@ def k_nearest(
 
         result = result.drop(like="__IX__|__k__")
 
-        self = self.drop(like="__k__|__IX__")
+        _self = _self.drop(like="__k__|__IX__")
 
         def prev_to_neg(df, **kwargs):
             strand = df.Strand.iloc[0] if "Strand" in df else "+"
@@ -2821,7 +2789,7 @@ def prev_to_neg(df, **kwargs):
 
         result = result.apply(prev_to_neg, suffix=kwargs["suffix"])
 
-        if not self.stranded and other.stranded:
+        if not _self.stranded and other.stranded:
             if apply_strand_suffix is None:
                 import sys
 
@@ -2835,7 +2803,7 @@ def prev_to_neg(df, **kwargs):
         return result
 
     @property
-    def length(self):
+    def length(self) -> int:
         """Return the total length of the intervals.
 
         See Also
@@ -2868,9 +2836,15 @@ def length(self):
         5
         """
 
-        return int(self.lengths(as_dict=False).sum())
+        lengths = self.lengths(as_dict=False)
+        assert isinstance(lengths, pd.Series)
+        length = lengths.sum()
+        assert isinstance(length, (np.int64, int))
+        return int(length)
 
-    def lengths(self, as_dict=False):
+    def lengths(
+        self, as_dict: bool = False
+    ) -> Union[pd.Series, Dict[Tuple[str, str], pd.Series], Dict[str, pd.Series]]:
         """Return the length of each interval.
 
         Parameters
@@ -2878,11 +2852,11 @@ def lengths(self, as_dict=False):
 
         as_dict : bool, default False
 
-            Whether to return lengths as Series or dict of Series per key.
+            Whether to return lengths as pd.Series or dict of pd.Series per key.
 
         Returns
         -------
-        Series or dict of Series with the lengths of each interval.
+        pd.Series or dict of pd.Series with the lengths of each interval.
 
         See Also
         --------
@@ -2928,24 +2902,19 @@ def lengths(self, as_dict=False):
         """
 
         if as_dict:
-            if not len(self):
-                return {}
-            lengths = {}
-            for k, df in self.items():
-                lengths[k] = df.End - df.Start
-
-            return lengths
+            return {k: df.End - df.Start for k, df in self.items()}  # type: ignore
         else:
-            _lengths = []
+            _lengths: List[pd.Series] = []
             if not len(self):
-                return np.array(_lengths, dtype=int)
+                return pd.Series([], dtype=np.int64)
             for _, df in self:
-                lengths = df.End - df.Start
-                _lengths.append(lengths)
+                _lengths.append(df.End - df.Start)
 
-            return pd.concat(_lengths).reset_index(drop=True)
+            ls = pd.concat(_lengths).reset_index(drop=True)
+            assert isinstance(ls, pd.Series)
+            return ls
 
-    def max_disjoint(self, strand=None, slack=0, **kwargs):
+    def max_disjoint(self, strand: Optional[bool] = None, slack: int = 0, **kwargs) -> "PyRanges":
         """Find the maximal disjoint set of intervals.
 
         Parameters
@@ -3003,7 +2972,14 @@ def max_disjoint(self, strand=None, slack=0, **kwargs):
 
         return pr.PyRanges(df)
 
-    def merge(self, strand=None, count=False, count_col="Count", by=None, slack=0):
+    def merge(
+        self,
+        strand: Optional[bool] = None,
+        count: bool = False,
+        count_col: str = "Count",
+        by: Optional[Union[List[str], str]] = None,
+        slack: int = 0,
+    ) -> "PyRanges":
         """Merge overlapping intervals into one.
 
         Parameters
@@ -3125,7 +3101,7 @@ def merge(self, strand=None, count=False, count_col="Count", by=None, slack=0):
         if strand is None:
             strand = self.stranded
 
-        kwargs = {
+        kwargs: Dict[str, Any] = {
             "strand": strand,
             "count": count,
             "by": by,
@@ -3146,7 +3122,7 @@ def merge(self, strand=None, count=False, count_col="Count", by=None, slack=0):
 
         return PyRanges(df)
 
-    def mp(self, n=8, formatting=None):
+    def mp(self, n: int = 8, formatting: None = None) -> None:
         """Merge location and print.
 
         See Also
@@ -3192,14 +3168,14 @@ def mspc(self, n=30, formatting=None):
 
     def nearest(
         self,
-        other,
-        strandedness=None,
-        overlap=True,
-        how=None,
-        suffix="_b",
-        nb_cpu=1,
-        apply_strand_suffix=None,
-    ):
+        other: "PyRanges",
+        strandedness: None = None,
+        overlap: bool = True,
+        how: Optional[str] = None,
+        suffix: str = "_b",
+        nb_cpu: int = 1,
+        apply_strand_suffix: None = None,
+    ) -> "PyRanges":
         """Find closest interval.
 
         Parameters
@@ -3339,7 +3315,7 @@ def nearest(
 
         return gr
 
-    def new_position(self, new_pos, columns=None):
+    def new_position(self, new_pos: str, columns: Optional[Tuple[str, str, str, str]] = None) -> "PyRanges":
         """Give new position.
 
         The operation join produces a PyRanges with two pairs of start coordinates and two pairs of
@@ -3351,9 +3327,9 @@ def new_position(self, new_pos, columns=None):
 
            Change of coordinates.
 
-        columns : tuple of str, default None, i.e. auto
+        columns : Optional[tuple of str], default None, i.e. auto
 
-           The name of the coordinate columns. By default uses the two first columns containing
+           The name of the coordinate columns. By default, uses the two first columns containing
            "Start" and the two first columns containing "End".
 
         See Also
@@ -3471,9 +3447,7 @@ def new_position(self, new_pos, columns=None):
         if self.empty:
             return self
 
-        kwargs = {"strand": None}
-        kwargs["sparse"] = {"self": False}
-        kwargs["new_pos"] = new_pos
+        kwargs: Dict[str, Any] = {"strand": None, "sparse": {"self": False}, "new_pos": new_pos}
 
         if columns is None:
             start1, start2 = self.columns[self.columns.str.contains("Start")][:2]
@@ -3488,7 +3462,14 @@ def new_position(self, new_pos, columns=None):
 
         return pr.PyRanges(dfs)
 
-    def overlap(self, other, strandedness=None, how="first", invert=False, nb_cpu=1):
+    def overlap(
+        self,
+        other: "PyRanges",
+        strandedness: Optional[Union[bool, str]] = None,
+        how: Optional[str] = "first",
+        invert: bool = False,
+        nb_cpu: int = 1,
+    ) -> "PyRanges":
         """Return overlapping intervals.
 
         Returns the intervals in self which overlap with those in other.
@@ -3605,10 +3586,13 @@ def overlap(self, other, strandedness=None, how="first", invert=False, nb_cpu=1)
         For printing, the PyRanges was sorted on Chromosome.
         """
 
-        kwargs = {"strandedness": strandedness, "nb_cpu": nb_cpu}
-        kwargs["sparse"] = {"self": False, "other": True}
-        kwargs["how"] = how
-        kwargs["invert"] = invert
+        kwargs = {
+            "strandedness": strandedness,
+            "nb_cpu": nb_cpu,
+            "sparse": {"self": False, "other": True},
+            "how": how,
+            "invert": invert,
+        }
         kwargs = fill_kwargs(kwargs)
 
         if len(self) == 0:
@@ -3623,7 +3607,7 @@ def overlap(self, other, strandedness=None, how="first", invert=False, nb_cpu=1)
 
         if invert:
             found_idxs = getattr(result, "__ix__", [])
-            result = self[~self.__ix__.isin(found_idxs)]
+            result = self[~self.__ix__.isin(found_idxs)]  # type: ignore
             result = result.drop("__ix__")
 
         return result
@@ -3640,7 +3624,9 @@ def pc(self, n=8, formatting=None):
 
         return self
 
-    def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=False):
+    def print(
+        self, n: int = 8, merge_position: bool = False, sort: bool = False, formatting: Optional[Dict[str, str]] = None
+    ) -> None:
         """Print the PyRanges.
 
         Parameters
@@ -3650,7 +3636,7 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa
 
             The number of rows to print.
 
-        merge_postion : bool, default False
+        merge_position : bool, default False
 
             Print location in same column to save screen space.
 
@@ -3663,10 +3649,6 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa
 
             Formatting options per column.
 
-        chain : False
-
-            Return the PyRanges. Useful to print intermediate results in call chains.
-
         See Also
         --------
 
@@ -3677,7 +3659,7 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa
         PyRanges.mpc : merge print chain
         PyRanges.msp : merge sort print
         PyRanges.mspc : merge sort print chain
-        PyRanges.rp : raw print dictionary of DataFrames
+        PyRanges.rp : raw print dictionary of pd.DataFrames
 
         Examples
         --------
@@ -3796,11 +3778,8 @@ def print(self, n=8, merge_position=False, sort=False, formatting=None, chain=Fa
 
         print(s)
 
-        if chain:
-            return self
-
     def rp(self):
-        """Print dict of DataFrames.
+        """Print dict of pd.DataFrames.
 
         See Also
         --------
@@ -3810,7 +3789,7 @@ def rp(self):
         print(self.dfs)
 
     def rpc(self):
-        """Print dict of DataFrames and return self.
+        """Print dict of pd.DataFrames and return self.
 
         See Also
         --------
@@ -3821,7 +3800,7 @@ def rpc(self):
 
         return self
 
-    def sample(self, n=8, replace=False):
+    def sample(self, n: int = 8, replace: bool = False) -> "PyRanges":
         """Subsample arbitrary rows of PyRanges.
 
         If n is larger than length of PyRanges, replace must be True.
@@ -3863,7 +3842,14 @@ def sample(self, n=8, replace=False):
         subsetter[sample] = True
         return self[subsetter]
 
-    def set_intersect(self, other, strandedness=None, how=None, new_pos=False, nb_cpu=1):
+    def set_intersect(
+        self,
+        other: "PyRanges",
+        strandedness: None = None,
+        how: Optional[str] = None,
+        new_pos: bool = False,
+        nb_cpu: int = 1,
+    ) -> "PyRanges":
         """Return set-theoretical intersection.
 
         Like intersect, but both PyRanges are merged first.
@@ -3981,7 +3967,7 @@ def set_intersect(self, other, strandedness=None, how=None, new_pos=False, nb_cp
 
         return PyRanges(dfs)
 
-    def set_union(self, other, strandedness=None, nb_cpu=1):
+    def set_union(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges":
         """Return set-theoretical union.
 
         Parameters
@@ -4073,7 +4059,7 @@ def set_union(self, other, strandedness=None, nb_cpu=1):
 
         return gr
 
-    def sort(self, by=None, nb_cpu=1):
+    def sort(self, by: Optional[str] = None, nb_cpu: int = 1) -> "PyRanges":
         """Sort by position or columns.
 
         Parameters
@@ -4092,7 +4078,7 @@ def sort(self, by=None, nb_cpu=1):
         Note
         ----
 
-        Since a PyRanges contains multiple DataFrames, the sorting only happens within dataframes.
+        Since a PyRanges contains multiple pd.DataFrames, the sorting only happens within dataframes.
 
         Returns
         -------
@@ -4172,8 +4158,7 @@ def sort(self, by=None, nb_cpu=1):
 
         from pyranges.methods.sort import _sort
 
-        kwargs = {"strand": self.stranded}
-        kwargs["sparse"] = {"self": False}
+        kwargs = {"strand": self.stranded, "sparse": {"self": False}}
         if by:
             assert "5" not in by or (
                 ((type(by) is str and by == "5") or (type(by) is not str and "5" in by)) and self.stranded
@@ -4209,7 +4194,14 @@ def slack(self, slack):
         """Deprecated: this function has been moved to Pyranges.extend"""
         return self.extend(slack)
 
-    def spliced_subsequence(self, start=0, end=None, by=None, strand=None, **kwargs):
+    def spliced_subsequence(
+        self,
+        start: int = 0,
+        end: Optional[int] = None,
+        by: Optional[str] = None,
+        strand: Optional[bool] = None,
+        **kwargs
+    ) -> "PyRanges":
         """Get subsequences of the intervals, using coordinates mapping to spliced transcripts (without introns)
 
         The returned intervals are subregions of self, cut according to specifications.
@@ -4352,12 +4344,12 @@ def spliced_subsequence(self, start=0, end=None, by=None, strand=None, **kwargs)
 
         return pr.PyRanges(result)
 
-    def split(self, strand=None, between=False, nb_cpu=1):
+    def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRanges":
         """Split into non-overlapping intervals.
 
         Parameters
         ----------
-        strand : bool, default None, i.e. auto
+        strand : Optional[bool], default None, i.e. auto
 
             Whether to ignore strand information if PyRanges is stranded.
 
@@ -4365,11 +4357,6 @@ def split(self, strand=None, between=False, nb_cpu=1):
 
             Include lengths between intervals.
 
-        nb_cpu: int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
         Returns
         -------
         PyRanges
@@ -4470,13 +4457,13 @@ def split(self, strand=None, between=False, nb_cpu=1):
 
         split = pr.PyRanges(df)
         if not between:
-            strandedness = "same" if strand else False
+            strandedness: Union[str, bool] = "same" if strand else False
             split = split.overlap(self, strandedness=strandedness)
 
         return split
 
     @property
-    def stranded(self):
+    def stranded(self) -> bool:
         """Whether PyRanges has (valid) strand info.
 
         Note
@@ -4524,7 +4511,7 @@ def stranded(self):
         return isinstance(key, tuple)
 
     @property
-    def strands(self):
+    def strands(self) -> List[Union[Any, str]]:
         """Return strands.
 
         Notes
@@ -4570,13 +4557,13 @@ def strands(self):
 
         return natsorted(set([k[1] for k in self.keys()]))
 
-    def subset(self, f, strand=None, **kwargs):
+    def subset(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRanges":
         """Return a subset of the rows.
 
         Parameters
         ----------
         f : function
-            Function which returns boolean Series equal to length of df.
+            Function which returns boolean pd.Series equal to length of df.
 
         strand : bool, default None, i.e. auto
 
@@ -4594,7 +4581,7 @@ def subset(self, f, strand=None, **kwargs):
         Notes
         -----
 
-        PyRanges can also be subsetted directly with a boolean Series. This function is slightly
+        PyRanges can also be subsetted directly with a boolean pd.Series. This function is slightly
         faster, but more cumbersome.
 
         Returns
@@ -4667,7 +4654,14 @@ def subset(self, f, strand=None, **kwargs):
 
         return self[result]
 
-    def subsequence(self, start=0, end=None, by=None, strand=None, **kwargs):
+    def subsequence(
+        self,
+        start: int = 0,
+        end: Optional[int] = None,
+        by: Optional[str] = None,
+        strand: Optional[bool] = None,
+        **kwargs
+    ) -> "PyRanges":
         """Get subsequences of the intervals.
 
         The returned intervals are subregions of self, cut according to specifications.
@@ -4800,7 +4794,7 @@ def subsequence(self, start=0, end=None, by=None, strand=None, **kwargs):
 
         return pr.PyRanges(result)
 
-    def subtract(self, other, strandedness=None, nb_cpu=1):
+    def subtract(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges":
         """Subtract intervals.
 
         Parameters
@@ -4864,8 +4858,7 @@ def subtract(self, other, strandedness=None, nb_cpu=1):
 
         from pyranges.methods.subtraction import _subtraction
 
-        kwargs = {"strandedness": strandedness}
-        kwargs["sparse"] = {"self": False, "other": True}
+        kwargs = {"strandedness": strandedness, "sparse": {"self": False, "other": True}}
         kwargs = fill_kwargs(kwargs)
 
         strand = True if strandedness else False
@@ -4879,7 +4872,7 @@ def subtract(self, other, strandedness=None, nb_cpu=1):
 
         return PyRanges(result).drop("__num__")
 
-    def summary(self, to_stdout=True, return_df=False):
+    def summary(self, to_stdout: bool = True, return_df: bool = False) -> Optional[pd.DataFrame]:
         """Return info.
 
         Count refers to the number of intervals, the rest to the lengths.
@@ -4903,7 +4896,7 @@ def summary(self, to_stdout=True, return_df=False):
 
         Returns
         -------
-            None or DataFrame with summary.
+            None or pd.DataFrame with summary.
 
 
         Examples
@@ -4960,7 +4953,7 @@ def summary(self, to_stdout=True, return_df=False):
 
         return _summary(self, to_stdout, return_df)
 
-    def tail(self, n=8):
+    def tail(self, n: int = 8) -> "PyRanges":
         """Return the n last rows.
 
         Parameters
@@ -5021,7 +5014,7 @@ def tail(self, n=8):
         subsetter[(len(self) - n) :] = True
         return self[subsetter]
 
-    def tile(self, tile_size, overlap=False, strand=None, nb_cpu=1):
+    def tile(self, tile_size: int, overlap: bool = False, strand: Optional[bool] = None, nb_cpu: int = 1) -> "PyRanges":
         """Return overlapping genomic tiles.
 
         The genome is divided into bookended tiles of length `tile_size` and one is returned per
@@ -5124,15 +5117,13 @@ def tile(self, tile_size, overlap=False, strand=None, nb_cpu=1):
         if strand is None:
             strand = self.stranded
 
-        kwargs = {"strand": strand, "overlap": overlap}
-        kwargs["sparse"] = {"self": False}
-        kwargs["tile_size"] = tile_size
+        kwargs = {"strand": strand, "overlap": overlap, "sparse": {"self": False}, "tile_size": tile_size}
 
         df = pyrange_apply_single(_tiles, self, **kwargs)
 
         return PyRanges(df)
 
-    def to_example(self, n=10):
+    def to_example(self, n: int = 10) -> Dict[str, List[Union[int, str]]]:
         """Return as dict.
 
         Used for easily creating examples for copy and pasting.
@@ -5199,7 +5190,7 @@ def to_example(self, n=10):
 
         return d
 
-    def three_end(self):
+    def three_end(self) -> "PyRanges":
         """Return the 3'-end.
 
         The 3'-end is the start of intervals on the reverse strand and the end of intervals on the
@@ -5302,7 +5293,9 @@ def three_end(self):
     #         >>>
     #         """
 
-    def to_bed(self, path=None, keep=True, compression="infer", chain=False):
+    def to_bed(
+        self, path: Optional[str] = None, keep: bool = True, compression: str = "infer", chain: bool = False
+    ) -> Union[str, "PyRanges"]:
         r"""Write to bed.
 
         Parameters
@@ -5379,14 +5372,14 @@ def to_bed(self, path=None, keep=True, compression="infer", chain=False):
 
     def to_bigwig(
         self,
-        path=None,
-        chromosome_sizes=None,
-        rpm=True,
-        divide=None,
-        value_col=None,
-        dryrun=False,
-        chain=False,
-    ):
+        path: None = None,
+        chromosome_sizes: None = None,
+        rpm: bool = True,
+        divide: Optional[bool] = None,
+        value_col: Optional[str] = None,
+        dryrun: bool = False,
+        chain: bool = False,
+    ) -> Optional["PyRanges"]:
         """Write regular or value coverage to bigwig.
 
         Note
@@ -5433,7 +5426,7 @@ def to_bigwig(
 
         See Also
         --------
-        pyranges.to_bigwig : write pandas DataFrame to bigwig.
+        pyranges.to_bigwig : write pandas pd.DataFrame to bigwig.
 
         Examples
         --------
@@ -5511,9 +5504,11 @@ def to_bigwig(
         if chain:
             return self
         else:
-            pass
+            return None
 
-    def to_csv(self, path=None, sep=",", header=True, compression="infer", chain=False):
+    def to_csv(
+        self, path: Optional["Path"] = None, sep: str = ",", header: bool = True, compression: str = "infer"
+    ) -> Union[str, "PyRanges"]:
         r"""Write to comma- or other value-separated file.
 
         Parameters
@@ -5534,10 +5529,6 @@ def to_csv(self, path=None, sep=",", header=True, compression="infer", chain=Fal
 
             Which compression to use. Uses file extension to infer by default.
 
-        chain: bool, default False
-
-            Whether to return the PyRanges after writing.
-
 
         Note
         ----
@@ -5561,13 +5552,15 @@ def to_csv(self, path=None, sep=",", header=True, compression="infer", chain=Fal
 
         from pyranges.out import _to_csv
 
-        result = _to_csv(self, path, sep=sep, header=header, compression=compression)
-        if path and chain:
-            return self
-        else:
-            return result
+        return _to_csv(self, path, sep=sep, header=header, compression=compression)
 
-    def to_gff3(self, path=None, compression="infer", chain=False, map_cols=None):
+    def to_gff3(
+        self,
+        path: None = None,
+        compression: str = "infer",
+        chain: bool = False,
+        map_cols: Optional[Dict[str, str]] = None,
+    ) -> str:
         """Write to General Feature Format 3.
 
         The GFF format consists of a tab-separated file without header.
@@ -5679,7 +5672,13 @@ def to_gff3(self, path=None, compression="infer", chain=False, map_cols=None):
         else:
             return result
 
-    def to_gtf(self, path=None, compression="infer", chain=False, map_cols=None):
+    def to_gtf(
+        self,
+        path: None = None,
+        compression: str = "infer",
+        chain: bool = False,
+        map_cols: Optional[Dict[str, str]] = None,
+    ) -> str:
         """Write to Gene Transfer Format.
 
         The GTF format consists of a tab-separated file without header.
@@ -5770,7 +5769,9 @@ def to_gtf(self, path=None, compression="infer", chain=False, map_cols=None):
         else:
             return result
 
-    def to_rle(self, value_col=None, strand=None, rpm=False, nb_cpu=1):
+    def to_rle(
+        self, value_col: Optional[str] = None, strand: Optional[bool] = None, rpm: bool = False, nb_cpu: int = 1
+    ) -> "RleDict":
         """Return as RleDict.
 
         Create collection of Rles representing the coverage or other numerical value.
@@ -5880,7 +5881,7 @@ def to_rle(self, value_col=None, strand=None, rpm=False, nb_cpu=1):
 
         return _to_rle(self, value_col, strand=strand, rpm=rpm, nb_cpu=nb_cpu)
 
-    def unstrand(self):
+    def unstrand(self) -> "PyRanges":
         """Remove strand.
 
         Note
@@ -5933,12 +5934,12 @@ def unstrand(self):
 
         return pr.PyRanges(gr.dfs)
 
-    def values(self):
-        """Return the underlying DataFrames."""
+    def values(self) -> List[pd.DataFrame]:
+        """Return the underlying pd.DataFrames."""
 
         return [df for k, df in self.items() if not df.empty]
 
-    def window(self, window_size, strand=None):
+    def window(self, window_size: int, strand: Optional[bool] = None) -> "PyRanges":
         """Return overlapping genomic windows.
 
         Windows of length `window_size` are returned.
@@ -6057,3 +6058,30 @@ def __getstate__(self):
 
     def __setstate__(self, d):
         self.__dict__["dfs"] = d
+
+    @staticmethod
+    def _zip_locationkey_and_data(keys: Iterable, dfs: Iterable[pd.DataFrame], strand: bool) -> "PyRanges":
+        """Zip keys and data into a PyRanges object.
+
+        Helper method because MyPy has difficulty seeing that PyRanges keys are
+        either list[str] or list[tuple[str, str]]. It considers them to be list[Union[str, tuple[str, str]]]
+        which results in typecheck errors.
+        """
+        if strand:
+            for k in keys:
+                assert isinstance(k, tuple)
+            return pr.PyRanges(dict(zip(keys, dfs)))
+        else:
+            for k in keys:
+                assert isinstance(k, str)
+            return pr.PyRanges(dict(zip(keys, dfs)))
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()

From c5045856ba6a1026803b662a7c0974f8946ee0ae Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Sun, 21 May 2023 16:55:41 +0200
Subject: [PATCH 03/10] Add types to init

---
 pyranges/__init__.py | 57 +++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/pyranges/__init__.py b/pyranges/__init__.py
index 5bf17528..a026829d 100644
--- a/pyranges/__init__.py
+++ b/pyranges/__init__.py
@@ -1,7 +1,10 @@
 from __future__ import print_function
 
+import itertools
 import sys
 from collections import defaultdict
+from pathlib import Path
+from typing import Dict, Iterable, Optional, Set, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -27,8 +30,10 @@
 
 read_gff = read_gtf
 
+Chromsizes = Union[Dict[str, int], Dict[Tuple[str, str], int]]
 
-def from_dict(d):
+
+def from_dict(d: Dict[str, Iterable]) -> PyRanges:
     """Create a PyRanges from dict.
 
     Parameters
@@ -68,7 +73,7 @@ def from_dict(d):
     return PyRanges(pd.DataFrame(d))
 
 
-def from_string(s):
+def from_string(s: str) -> PyRanges:
     """Create a PyRanges from multiline string.
 
     Parameters
@@ -115,7 +120,7 @@ def from_string(s):
     return PyRanges(df)
 
 
-def itergrs(prs, strand=None, keys=False):
+def itergrs(prs: Iterable[PyRanges], strand=None, keys=False):
     r"""Iterate over multiple PyRanges at once.
 
     Parameters
@@ -209,14 +214,12 @@ def itergrs(prs, strand=None, keys=False):
         prs = [gr.unstrand() for gr in prs]
 
     grs_per_chromosome = defaultdict(list)
-    set_keys = set()
-    for gr in prs:
-        set_keys.update(gr.dfs.keys())
+    set_keys: Union[Set[str], Set[Tuple[str, str]]] = set(itertools.chain.from_iterable(*[gr.dfs.keys() for gr in prs]))
 
     empty_dfs = [pd.DataFrame(columns=gr.columns) for gr in prs]
     for gr, empty in zip(prs, empty_dfs):
         for k in set_keys:
-            df = gr.dfs.get(k, empty)
+            df = gr.dfs.get(k, empty)  # type: ignore
             grs_per_chromosome[k].append(df)
 
     if not keys:
@@ -225,7 +228,13 @@ def itergrs(prs, strand=None, keys=False):
         return iter(natsorted(grs_per_chromosome.items()))
 
 
-def random(n=1000, length=100, chromsizes=None, strand=True, seed=None):
+def random(
+    n: int = 1000,
+    length: int = 100,
+    chromsizes: Optional[Chromsizes] = None,
+    strand: bool = True,
+    seed: Optional[int] = None,
+):
     """Return PyRanges with random intervals.
 
     Parameters
@@ -296,8 +305,7 @@ def random(n=1000, length=100, chromsizes=None, strand=True, seed=None):
     """
 
     if chromsizes is None:
-        chromsizes = data.chromsizes()
-        df = chromsizes.df
+        df = data.chromsizes().df
     elif isinstance(chromsizes, dict):
         df = pd.DataFrame({"Chromosome": list(chromsizes.keys()), "End": list(chromsizes.values())})
     else:
@@ -307,7 +315,7 @@ def random(n=1000, length=100, chromsizes=None, strand=True, seed=None):
 
     n_per_chrom = pd.Series(np.random.choice(df.index, size=n, p=p)).value_counts(sort=False).to_frame()
     n_per_chrom.insert(1, "Chromosome", df.loc[n_per_chrom.index].Chromosome)
-    n_per_chrom.columns = "Count Chromosome".split()
+    n_per_chrom.columns = pd.Index("Count Chromosome".split())
 
     random_dfs = []
     for _, (count, chrom) in n_per_chrom.iterrows():
@@ -330,14 +338,17 @@ def random(n=1000, length=100, chromsizes=None, strand=True, seed=None):
 pyranges.statistics : statistcal methods for genomics."""
 
 
-def to_bigwig(gr, path, chromosome_sizes):
+def to_bigwig(gr: PyRanges, path: Path, chromosome_sizes=Optional[Chromsizes]):
     """Write df to bigwig.
 
     Must contain the columns Chromosome, Start, End and Score. All others are ignored.
 
     Parameters
     ----------
-    path : str
+    gr: PyRanges
+        Intervals to write.
+
+    path : Path
 
         Where to write bigwig.
 
@@ -492,7 +503,9 @@ def to_bigwig(gr, path, chromosome_sizes):
     assert (
         len(gr.strands) <= 1
     ), "Can only write one strand at a time. Use an unstranded PyRanges or subset on strand first."
-    assert np.sum(gr.lengths()) == gr.merge().length, "Intervals must not overlap."
+    lengths = gr.lengths()
+    assert isinstance(lengths, pd.Series)
+    assert np.sum(lengths) == gr.merge().length, "Intervals must not overlap."
 
     df = gr.df
 
@@ -515,16 +528,16 @@ def to_bigwig(gr, path, chromosome_sizes):
     bw.addEntries(chromosomes, starts, ends=ends, values=values)
 
 
-def version_info():
+def version_info() -> None:
     import importlib
 
-    def update_version_info(version_info, library):
+    def update_version_info(_version_info, library) -> None:
         if importlib.util.find_spec(library):
             version = importlib.import_module(library).__version__
         else:
             version = "not installed"
 
-        version_info[library] = version
+        _version_info[library] = version
 
     version_info = {
         "pyranges version": pr.__version__,
@@ -561,3 +574,13 @@ def update_version_info(version_info, library):
     "PyRanges",
     "version_info",
 ]
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()

From 01313cfe0c2e1fbfd389be22c3ccb30364708c2e Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Sun, 21 May 2023 17:08:46 +0200
Subject: [PATCH 04/10] add types to tostring

---
 pyranges/__init__.py  |  2 +-
 pyranges/tostring2.py | 50 +++++++++++++++++++++++++++----------------
 2 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/pyranges/__init__.py b/pyranges/__init__.py
index a026829d..a0ebe0cc 100644
--- a/pyranges/__init__.py
+++ b/pyranges/__init__.py
@@ -532,7 +532,7 @@ def version_info() -> None:
     import importlib
 
     def update_version_info(_version_info, library) -> None:
-        if importlib.util.find_spec(library):
+        if importlib.util.find_spec(library):  # type: ignore
             version = importlib.import_module(library).__version__
         else:
             version = "not installed"
diff --git a/pyranges/tostring2.py b/pyranges/tostring2.py
index de9d939a..bfdebc5a 100644
--- a/pyranges/tostring2.py
+++ b/pyranges/tostring2.py
@@ -1,29 +1,32 @@
 import functools
 import os
 import shutil
-from typing import Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import natsort  # type: ignore
 import pandas as pd
+from pandas.core.frame import DataFrame
+
+from pyranges.pyranges_main import PyRanges
 
 sort_cols = "Start End".split()
 
 GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", False)
 
 
-def _get_stranded_f(self, half_entries, f, sort=False):
+def _get_stranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame:
     counter = 0
     dfs = []
 
     chromosomes = self.chromosomes
 
     if f == "tail":
-        chromosomes = reversed(chromosomes)
+        chromosomes = list(reversed(chromosomes))
 
     default = pd.DataFrame(columns=self.columns)
     for chromosome in chromosomes:
-        plus = self.dfs.get((chromosome, "+"), default)
-        minus = self.dfs.get((chromosome, "-"), default)
+        plus = self.dfs.get((chromosome, "+"), default)  # type: ignore
+        minus = self.dfs.get((chromosome, "-"), default)  # type: ignore
 
         if sort:
             plus = plus.sort_values(sort_cols)
@@ -54,18 +57,18 @@ def _get_stranded_f(self, half_entries, f, sort=False):
     return df
 
 
-def _get_unstranded_f(self, half_entries, f, sort=False):
+def _get_unstranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame:
     chromosomes = self.chromosomes
 
     if f == "tail":
-        chromosomes = reversed(chromosomes)
+        chromosomes = list(reversed(chromosomes))
 
     default = pd.DataFrame(columns=self.columns)
 
     counter = 0
     dfs = []
     for chromosome in chromosomes:
-        cdf = self.dfs.get((chromosome), default)
+        cdf = self.dfs.get(chromosome, default)  # type: ignore
         cdf = getattr(cdf, f)(half_entries)
 
         if sort:
@@ -85,7 +88,7 @@ def _get_unstranded_f(self, half_entries, f, sort=False):
     return df
 
 
-def _get_df(self, n, sort):
+def _get_df(self: PyRanges, n: int, sort: bool) -> DataFrame:
     half_entries = int(n / 2)
 
     if len(self) <= n:
@@ -110,7 +113,7 @@ def _get_df(self, n, sort):
     return df
 
 
-def show_pos_merge_position(df):
+def show_pos_merge_position(df: DataFrame) -> DataFrame:
     # all_dots = df.Start == "..."
 
     cols_to_drop = "Chromosome Start End".split()
@@ -136,7 +139,7 @@ def show_pos_merge_position(df):
     return df
 
 
-def get_columns_dtypes(self):
+def get_columns_dtypes(self: PyRanges) -> Dict[str, str]:
     _df = next(iter(self.dfs.values()))
     dtypes = [
         str(d)
@@ -149,7 +152,7 @@ def get_columns_dtypes(self):
     return {c: d for c, d in zip(columns, dtypes)}
 
 
-def build_header(columns_dtypes):
+def build_header(columns_dtypes: Dict[str, str]) -> List[str]:
     header = []
     for c, d in columns_dtypes.items():
         cd = "".join([str(c), "\n(", d, ")"])
@@ -166,7 +169,9 @@ def add_hidden_col_dotdot(df, n_hidden_cols):
     return df
 
 
-def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int] = None):
+def _grow_string_representation(
+    df: DataFrame, columns_dtypes: Dict[str, str], terminal_width: Optional[int] = None
+) -> Tuple[str, List[str]]:
     from tabulate import tabulate
 
     _terminal_width = shutil.get_terminal_size().columns if terminal_width is None else terminal_width
@@ -174,7 +179,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int
 
     if len(columns_dtypes) < 15:
         header = build_header(columns_dtypes)
-        str_repr = tabulate(df, headers=header, tablefmt="psql", showindex=False)
+        str_repr = tabulate(df, headers=header, tablefmt="psql", showindex=False)  # type: ignore
 
         table_width = len(str_repr.split("\n", 1)[0])
 
@@ -183,10 +188,11 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int
 
     header = build_header({k: columns_dtypes[k] for k in columns_dtypes})
     original_header = list(columns_dtypes)
-    df.columns = header
+    df.columns = pd.Index(header)
 
     # know that any pyrange will have at least three columns
     build_df = df.get(list(df.columns[:3]))
+    assert isinstance(build_df, DataFrame)
 
     total_columns = len(df.columns)
 
@@ -222,7 +228,7 @@ def _grow_string_representation(df, columns_dtypes, terminal_width: Optional[int
 )
 
 
-def untraditional_strand_info(self, str_repr_width):
+def untraditional_strand_info(self: PyRanges, str_repr_width: int) -> str:
     _ustr = ""
     if "Strand" in self.columns and not self.stranded:
         strands = []
@@ -249,7 +255,7 @@ def untraditional_strand_info(self, str_repr_width):
     return _ustr
 
 
-def hidden_columns_info(hidden_columns, str_repr_width):
+def hidden_columns_info(hidden_columns: List[Any], str_repr_width: int) -> str:
     n_hidden_cols = len(hidden_columns)
     _hstr = ""
     if n_hidden_cols:
@@ -268,7 +274,7 @@ def hidden_columns_info(hidden_columns, str_repr_width):
     return _hstr
 
 
-def add_text_to_str_repr(self, str_repr, hidden_columns, sort):
+def add_text_to_str_repr(self: PyRanges, str_repr: str, hidden_columns: List[Any], sort: bool) -> str:
     n_intervals = len(self)
     n_chromosomes = len(self.chromosomes)
 
@@ -297,7 +303,13 @@ def add_text_to_str_repr(self, str_repr, hidden_columns, sort):
     return str_repr
 
 
-def tostring(self, n=8, merge_position=False, formatting=None, sort=False):
+def tostring(
+    self: PyRanges,
+    n: int = 8,
+    merge_position: bool = False,
+    formatting: Optional[Dict[str, str]] = None,
+    sort: bool = False,
+) -> str:
     if len(self) == 0:
         return "Empty PyRanges"
 

From e652433421c5e1ed49cc5cd577da569672623d8f Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Sun, 21 May 2023 17:39:52 +0200
Subject: [PATCH 05/10] Types to multithreaded

---
 pyranges/multithreaded.py | 154 +++++++++++---------------------------
 pyranges/tostring2.py     |  19 ++---
 2 files changed, 53 insertions(+), 120 deletions(-)

diff --git a/pyranges/multithreaded.py b/pyranges/multithreaded.py
index 7505ea60..4a7fb3e0 100644
--- a/pyranges/multithreaded.py
+++ b/pyranges/multithreaded.py
@@ -1,35 +1,24 @@
 import os
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Union
 
 import numpy as np
 import pandas as pd
 from natsort import natsorted  # type: ignore
+from pandas.core.frame import DataFrame
 
-import pyranges as pr
+if TYPE_CHECKING:
+    from pyranges.pyranges_main import PyRanges
 
 ray = None
 
 
-def get_n_args(f):
+def get_n_args(f: Callable) -> int:
     import inspect
 
     nparams = len(inspect.signature(f).parameters)
     return nparams
 
 
-def call_f(f, nparams, df, odf, kwargs):
-    if nparams == 3:
-        return f.remote(df, odf, **kwargs)
-    else:
-        return f.remote(df, odf)
-
-
-def call_f_single(f, nparams, df, **kwargs):
-    if nparams == 2:
-        return f.remote(df, **kwargs)
-    else:
-        return f.remote(df)
-
-
 class suppress_stdout_stderr(object):
     """
     A context manager for doing a "deep suppression" of stdout and stderr in
@@ -61,20 +50,20 @@ def __exit__(self, *_):
         os.close(self.null_fds[1])
 
 
-def merge_dfs(df1, df2):
+def merge_dfs(df1: DataFrame, df2: DataFrame) -> DataFrame:
     if not df1.empty and not df2.empty:
         return pd.concat([df1, df2], sort=False).reset_index(drop=True)
 
     elif df1.empty and df2.empty:
         # can this happen?
-        return None
+        return pd.DataFrame()
     elif df1.empty:
         return df2
     else:
         return df1
 
 
-def process_results(results, keys):
+def process_results(results: List[Any], keys: Union[List[str], List[Tuple[str, str]]]) -> dict:
     results_dict = {k: r for k, r in zip(keys, results) if r is not None}
 
     try:
@@ -103,7 +92,7 @@ def process_results(results, keys):
     return results_dict
 
 
-def make_sparse(df):
+def make_sparse(df: DataFrame) -> DataFrame:
     if "Strand" in df:
         cols = "Chromosome Start End Strand".split()
     else:
@@ -112,7 +101,7 @@ def make_sparse(df):
     return df[cols]
 
 
-def make_binary_sparse(kwargs, df, odf):
+def make_binary_sparse(kwargs: Dict[str, Any], df: DataFrame, odf: DataFrame) -> Tuple[DataFrame, DataFrame]:
     sparse = kwargs.get("sparse")
 
     if not sparse:
@@ -127,13 +116,10 @@ def make_binary_sparse(kwargs, df, odf):
     return df, odf
 
 
-def make_unary_sparse(kwargs, df):
-    sparse = kwargs.get("sparse").get("self")
-
-    if sparse:
-        df = make_sparse(df)
+def make_unary_sparse(kwargs: Dict[str, Any], df: DataFrame) -> DataFrame:
+    sparse = kwargs.get("sparse", {}).get("self")
 
-    return df
+    return make_sparse(df) if sparse else df
 
 
 def ray_initialized():
@@ -157,40 +143,9 @@ def test_function():
             raise e
 
 
-def get_multithreaded_funcs(function, nb_cpu):
-    if nb_cpu > 1:
-        import ray  # type: ignore
-
-        _merge_dfs = ray.remote(merge_dfs)
-        get = ray.get
-        function = ray.remote(function)
-    else:
-
-        def _merge_dfs():
-            return "dummy value"
-
-        _merge_dfs.remote = merge_dfs
-
-        def get(x):
-            return x
-
-        function.remote = function
-
-    return function, get, _merge_dfs
-
-
-def pyrange_apply(function, self, other, **kwargs):
-    nparams = get_n_args(function)
-    nb_cpu = kwargs.get("nb_cpu", 1)
-
-    if nb_cpu > 1:
-        import ray  # type: ignore
-
-        with suppress_stdout_stderr():
-            ray.init(num_cpus=nb_cpu, ignore_reinit_error=True)
-
-    function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu)
-
+def pyrange_apply(
+    function: Callable, self: "PyRanges", other: "PyRanges", **kwargs
+) -> Union[Dict[Tuple[str, str], Any], Dict[str, Any]]:
     strandedness = kwargs["strandedness"]
 
     other_strand = {"+": "-", "-": "+"}
@@ -228,7 +183,7 @@ def pyrange_apply(function, self, other, **kwargs):
                 odf = other[c, os].values()[0]
 
             df, odf = make_binary_sparse(kwargs, df, odf)
-            result = call_f(function, nparams, df, odf, kwargs)
+            result = function(df, odf, **kwargs)
 
             results.append(result)
 
@@ -241,7 +196,7 @@ def pyrange_apply(function, self, other, **kwargs):
                     odf = other_dfs[c]
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
-                result = call_f(function, nparams, df, odf, kwargs)
+                result = function(df, odf, **kwargs)
                 results.append(result)
 
         elif not self.stranded and other.stranded:
@@ -249,28 +204,28 @@ def pyrange_apply(function, self, other, **kwargs):
                 if c not in other_chromosomes:
                     odf = dummy
                 else:
-                    odf1 = other_dfs.get((c, "+"), dummy)
-                    odf2 = other_dfs.get((c, "-"), dummy)
+                    odf1 = other_dfs.get((c, "+"), dummy)  # type: ignore
+                    odf2 = other_dfs.get((c, "-"), dummy)  # type: ignore
 
-                    odf = _merge_dfs.remote(odf1, odf2)
+                    odf = merge_dfs(odf1, odf2)
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
 
-                result = call_f(function, nparams, df, odf, kwargs)
+                result = function(df, odf, **kwargs)
                 results.append(result)
 
         elif self.stranded and other.stranded:
-            for (c, s), df in self.items():
+            for (c, s), df in self.items():  # type: ignore
                 if c not in other_chromosomes:
-                    odfs = pr.PyRanges(dummy)
+                    odfs = [dummy]
                 else:
-                    odfp = other_dfs.get((c, "+"), dummy)
-                    odfm = other_dfs.get((c, "-"), dummy)
+                    odfp = other_dfs.get((c, "+"), dummy)  # type: ignore
+                    odfm = other_dfs.get((c, "-"), dummy)  # type: ignore
 
                     odfs = [odfp, odfm]
 
                 if len(odfs) == 2:
-                    odf = _merge_dfs.remote(*odfs)
+                    odf = merge_dfs(*odfs)
                 elif len(odfs) == 1:
                     odf = odfs[0]
                 else:
@@ -278,7 +233,7 @@ def pyrange_apply(function, self, other, **kwargs):
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
 
-                result = call_f(function, nparams, df, odf, kwargs)
+                result = function(df, odf, **kwargs)
                 results.append(result)
 
         else:
@@ -290,62 +245,46 @@ def pyrange_apply(function, self, other, **kwargs):
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
 
-                result = call_f(function, nparams, df, odf, kwargs)
+                result = function(df, odf, **kwargs)
                 results.append(result)
 
-    results = get(results)
-
-    results = process_results(results, keys)
-
-    if nb_cpu > 1:
-        ray.shutdown()
+    return process_results(results, keys)
 
-    return results
 
-
-def pyrange_apply_single(function, self, **kwargs):
-    nparams = get_n_args(function)
-    nb_cpu = kwargs.get("nb_cpu", 1)
+def pyrange_apply_single(function: Callable, self: "PyRanges", **kwargs) -> Any:
     strand = kwargs["strand"]
 
-    if nb_cpu > 1:
-        import ray  # type: ignore
-
-        with suppress_stdout_stderr():
-            ray.init(num_cpus=nb_cpu, ignore_reinit_error=True)
-
-    function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu)
-
     if strand:
         assert self.stranded, "Can only do stranded operation when PyRange contains strand info"
 
     results = []
 
+    keys: Union[List[str], List[Tuple[str, str]]] = []  # type: ignore
     if strand:
-        for (c, s), df in self.items():
+        for (c, s), df in self.items():  # type: ignore
             kwargs["chromosome"] = c
             _strand = s
             kwargs["strand"] = _strand
 
             df = make_unary_sparse(kwargs, df)
-            result = call_f_single(function, nparams, df, **kwargs)
+            result = function(df, **kwargs)
             results.append(result)
 
         keys = self.keys()
 
     elif not self.stranded:
-        keys = []
         for c, df in self.items():
             kwargs["chromosome"] = c
+            assert isinstance(c, str)
 
             df = make_unary_sparse(kwargs, df)
-            result = call_f_single(function, nparams, df, **kwargs)
+            result = function(df, **kwargs)
             results.append(result)
             keys.append(c)
 
     else:
-        keys = []
         for c in self.chromosomes:
+            assert isinstance(c, str)
             kwargs["chromosome"] = c
 
             dfs = self[c]
@@ -353,23 +292,16 @@ def pyrange_apply_single(function, self, **kwargs):
             if len(dfs.keys()) == 2:
                 df, df2 = dfs.values()
                 # merge strands
-                df = _merge_dfs.remote(df, df2)
+                df = merge_dfs(df, df2)
             else:
                 df = dfs.values()[0]
 
             df = make_unary_sparse(kwargs, df)
-            result = call_f_single(function, nparams, df, **kwargs)
+            result = function(df, **kwargs)
             results.append(result)
             keys.append(c)
 
-    results = get(results)
-
-    if nb_cpu > 1:
-        ray.shutdown()
-
-    results = process_results(results, keys)
-
-    return results
+    return process_results(results, keys)
 
 
 def _lengths(df):
@@ -378,7 +310,7 @@ def _lengths(df):
     return lengths
 
 
-def _tss(df, **kwargs):
+def _tss(df: DataFrame, **kwargs) -> DataFrame:
     df = df.copy(deep=True)
     dtype = df.dtypes["Start"]
     slack = kwargs.get("slack", 0)
@@ -394,7 +326,7 @@ def _tss(df, **kwargs):
     return df
 
 
-def _tes(df, **kwargs):
+def _tes(df: DataFrame, **kwargs) -> DataFrame:
     df = df.copy(deep=True)
     dtype = df.dtypes["Start"]
     slack = kwargs.get("slack", 0)
@@ -410,7 +342,7 @@ def _tes(df, **kwargs):
     return df
 
 
-def _extend(df, **kwargs):
+def _extend(df: DataFrame, **kwargs) -> DataFrame:
     df = df.copy()
     dtype = df.Start.dtype
     slack = kwargs["ext"]
diff --git a/pyranges/tostring2.py b/pyranges/tostring2.py
index bfdebc5a..72bed06f 100644
--- a/pyranges/tostring2.py
+++ b/pyranges/tostring2.py
@@ -1,20 +1,21 @@
 import functools
 import os
 import shutil
-from typing import Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
 import natsort  # type: ignore
 import pandas as pd
 from pandas.core.frame import DataFrame
 
-from pyranges.pyranges_main import PyRanges
+if TYPE_CHECKING:
+    from pyranges.pyranges_main import PyRanges
 
 sort_cols = "Start End".split()
 
 GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", False)
 
 
-def _get_stranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame:
+def _get_stranded_f(self: "PyRanges", half_entries: int, f: str, sort: bool = False) -> DataFrame:
     counter = 0
     dfs = []
 
@@ -57,7 +58,7 @@ def _get_stranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = Fals
     return df
 
 
-def _get_unstranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = False) -> DataFrame:
+def _get_unstranded_f(self: "PyRanges", half_entries: int, f: str, sort: bool = False) -> DataFrame:
     chromosomes = self.chromosomes
 
     if f == "tail":
@@ -88,7 +89,7 @@ def _get_unstranded_f(self: PyRanges, half_entries: int, f: str, sort: bool = Fa
     return df
 
 
-def _get_df(self: PyRanges, n: int, sort: bool) -> DataFrame:
+def _get_df(self: "PyRanges", n: int, sort: bool) -> DataFrame:
     half_entries = int(n / 2)
 
     if len(self) <= n:
@@ -139,7 +140,7 @@ def show_pos_merge_position(df: DataFrame) -> DataFrame:
     return df
 
 
-def get_columns_dtypes(self: PyRanges) -> Dict[str, str]:
+def get_columns_dtypes(self: "PyRanges") -> Dict[str, str]:
     _df = next(iter(self.dfs.values()))
     dtypes = [
         str(d)
@@ -228,7 +229,7 @@ def _grow_string_representation(
 )
 
 
-def untraditional_strand_info(self: PyRanges, str_repr_width: int) -> str:
+def untraditional_strand_info(self: "PyRanges", str_repr_width: int) -> str:
     _ustr = ""
     if "Strand" in self.columns and not self.stranded:
         strands = []
@@ -274,7 +275,7 @@ def hidden_columns_info(hidden_columns: List[Any], str_repr_width: int) -> str:
     return _hstr
 
 
-def add_text_to_str_repr(self: PyRanges, str_repr: str, hidden_columns: List[Any], sort: bool) -> str:
+def add_text_to_str_repr(self: "PyRanges", str_repr: str, hidden_columns: List[Any], sort: bool) -> str:
     n_intervals = len(self)
     n_chromosomes = len(self.chromosomes)
 
@@ -304,7 +305,7 @@ def add_text_to_str_repr(self: PyRanges, str_repr: str, hidden_columns: List[Any
 
 
 def tostring(
-    self: PyRanges,
+    self: "PyRanges",
     n: int = 8,
     merge_position: bool = False,
     formatting: Optional[Dict[str, str]] = None,

From 009abe88aaa54edc4444b79582ebeaa608643d95 Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Fri, 26 May 2023 09:44:38 +0200
Subject: [PATCH 06/10] Fix doctests

---
 hi                                           | Bin 1890 -> 1880 bytes
 pyproject.toml                               |   2 +-
 pyranges/__init__.py                         |  79 ++-
 pyranges/_typing.py                          |   0
 pyranges/genomicfeatures.py                  |   6 +-
 pyranges/methods/attr.py                     |   4 +-
 pyranges/methods/concat.py                   |  34 +-
 pyranges/methods/getitem.py                  |   5 +-
 pyranges/methods/init.py                     | 100 +--
 pyranges/multioverlap.py                     |   4 +-
 pyranges/multithreaded.py                    | 156 ++---
 pyranges/pyranges_main.py                    | 629 ++++++-------------
 tests/property_based/hypothesis_helper.py    |   8 +-
 tests/unit/df_dict_mismatch/test_mismatch.py |  17 +-
 tests/unit/join/test_join.py                 |   4 +-
 tests/unit/slack/test_slack.py               |   2 +-
 tests/unit/test_count_overlaps.py            |   2 +-
 tests/unit/test_genomicfeatures.py           |  16 -
 18 files changed, 356 insertions(+), 712 deletions(-)
 create mode 100644 pyranges/_typing.py

diff --git a/hi b/hi
index 669e467e2b12ef2392bbb5b01d1b6f356b1eb8c2..9c39f1a4d8ad11b16e9fd62e66f0a291f29c505f 100644
GIT binary patch
delta 146
zcmaFFcY}|mfn}=iMizY*LFG<o0cHjW-~dsR<5-NrqHIu6P7u9$7K;|s<Oi%Qe5_F6
zDJ4^qGA2weWRaPCfQ<ztCC!d3HOrf+eM%5eRqPavjJ1<_*=;3uX6(w?ov|llZ|#(f
P{Xl%MqqsDwG)WHt*%v3g

delta 160
zcmcb?_lS?Bfn{p=MizY*ZG|4DOp_^{9qm(srf7IGXQ=r3`FZ^Z0x;ptP%<T{)0u;r
zfq{WzvI~nbXFrg~2IO&W?qJbk;sbJ7Sb^LrKy?{YCO=?RkOm2{!-VE}Gl5JlnG!oi
sBV*%aPFCs34s0xwmDuGZ_GawM*q?DA<6!NSjKe^Dw4=B*sWeFs0H4M=mjD0&

diff --git a/pyproject.toml b/pyproject.toml
index cbdb2d59..2a87ae11 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ keywords = ["bioinformatics", "genomicranges", "genomics"]
 dependencies = ["pandas", "ncls>=0.0.63", "tabulate", "sorted_nearest>=0.0.33", "natsort"]
 
 [project.optional-dependencies]
-dev = ["black", "bumpver", "isort", "pip-tools", "pytest"]
+dev = ["pyrle", "bamread", "bwread", "fisher"]
 
 [tool.setuptools.packages.find]
 where = ["."]
diff --git a/pyranges/__init__.py b/pyranges/__init__.py
index a0ebe0cc..4b89d8ac 100644
--- a/pyranges/__init__.py
+++ b/pyranges/__init__.py
@@ -4,7 +4,7 @@
 import sys
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, Iterable, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -15,6 +15,7 @@
 import pyranges.genomicfeatures as gf  # NOQA: F401
 from pyranges import data, statistics
 from pyranges.get_fasta import get_fasta, get_sequence, get_transcript_sequence
+from pyranges.helpers import get_key_from_df, single_value_key
 from pyranges.methods.concat import concat
 from pyranges.multioverlap import count_overlaps
 from pyranges.pyranges_main import PyRanges
@@ -33,6 +34,49 @@
 Chromsizes = Union[Dict[str, int], Dict[Tuple[str, str], int]]
 
 
+def from_args(
+    chromosomes: Union[Sequence[str], Sequence[int]],
+    starts: Sequence[int],
+    ends: Sequence[int],
+    strands: Optional[Union[str, Sequence[str]]] = None,
+) -> "PyRanges":
+    """Create a PyRanges from sequences of chromosomes, starts, ends and, optionally, strands.
+
+    A single str/int chromosome or a single str strand is repeated once per entry in `starts`.
+    All resulting columns must have the same length, otherwise an AssertionError is raised.
+    The Chromosome and Strand columns are built with category dtype.
+    """
+    n_rows = len(starts)
+    if isinstance(chromosomes, (str, int)):
+        _chromosomes = pd.Series([chromosomes] * n_rows, dtype="category")
+    else:
+        _chromosomes = pd.Series(chromosomes, dtype="category")
+
+    columns: List[pd.Series] = [_chromosomes, pd.Series(starts), pd.Series(ends)]
+    colnames = ["Chromosome", "Start", "End"]
+    if strands is not None:
+        if isinstance(strands, str):
+            _strands = pd.Series([strands] * n_rows, dtype="category")
+        else:
+            _strands = pd.Series(strands, dtype="category")
+
+        columns.append(_strands)
+        colnames.append("Strand")
+
+    lengths = [str(len(s)) for s in columns]
+    assert len(set(lengths)) == 1, "{colnames} must be of equal length. But are {columns}".format(
+        colnames=", ".join(colnames), columns=", ".join(lengths)
+    )
+
+    # Every column is a pd.Series here; rebuild each on a shared 0..n-1 index so that
+    # caller-supplied Series with unrelated indexes are aligned by position, not by label.
+    idx = range(n_rows)
+    df = pd.concat([pd.Series(s.values, index=idx) for s in columns], axis=1)
+    df.columns = pd.Index(colnames)
+
+    return PyRanges(df)
+
+
 def from_dict(d: Dict[str, Iterable]) -> PyRanges:
     """Create a PyRanges from dict.
 
@@ -73,6 +117,36 @@ def from_dict(d: Dict[str, Iterable]) -> PyRanges:
     return PyRanges(pd.DataFrame(d))
 
 
+def from_dfs(dfs: Union[Dict[str, pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]) -> "PyRanges":
+    """Create a PyRanges from a dict of DataFrames keyed by chromosome or (chromosome, strand).
+
+    Empty DataFrames are dropped. Raises ValueError when single_value_key rejects a DataFrame, or
+    when its key differs from its dict key while every strand seen so far is "+" or "-". If any
+    strand is something else, the data are concatenated and regrouped on Chromosome only.
+    """
+    empty_removed = {k: v.copy() for k, v in dfs.items() if not v.empty}
+
+    _strand_valid = True
+    for key, df in empty_removed.items():
+        _key = get_key_from_df(df)
+        if not single_value_key(df):
+            raise ValueError("All Chromosome/Strand vals in a df must be the same.")
+        if isinstance(_key, tuple):
+            _strand_valid = _strand_valid and (_key[1] in ["+", "-"])
+
+        if _strand_valid and _key != key:
+            raise ValueError(f"All keys must be the same, but df has {_key} and dict had {key}.")
+
+    if not _strand_valid:
+        merged = pd.concat(empty_removed.values()).reset_index(drop=True)
+        # Group by the bare column name: a one-element list yields 1-tuples or scalars by pandas version.
+        empty_removed = {k: v for k, v in merged.groupby("Chromosome")}  # type: ignore
+
+    gr = PyRanges()
+    gr.__dict__["dfs"] = empty_removed
+    return gr  # type: ignore
+
+
 def from_string(s: str) -> PyRanges:
     """Create a PyRanges from multiline string.
 
@@ -214,7 +288,8 @@ def itergrs(prs: Iterable[PyRanges], strand=None, keys=False):
         prs = [gr.unstrand() for gr in prs]
 
     grs_per_chromosome = defaultdict(list)
-    set_keys: Union[Set[str], Set[Tuple[str, str]]] = set(itertools.chain.from_iterable(*[gr.dfs.keys() for gr in prs]))
+    keys = [gr.dfs.keys() for gr in prs]
+    set_keys: Union[Set[str], Set[Tuple[str, str]]] = set(itertools.chain.from_iterable(keys))
 
     empty_dfs = [pd.DataFrame(columns=gr.columns) for gr in prs]
     for gr, empty in zip(prs, empty_dfs):
diff --git a/pyranges/_typing.py b/pyranges/_typing.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pyranges/genomicfeatures.py b/pyranges/genomicfeatures.py
index 5c4fa328..26145a1b 100644
--- a/pyranges/genomicfeatures.py
+++ b/pyranges/genomicfeatures.py
@@ -189,7 +189,7 @@ def introns(self, by="gene", nb_cpu=1):
         >>> gr.features.introns(by="gene")
         +--------------+------------+-----------+-----------+--------------+-----------------+-----------------+
         | Chromosome   | Feature    | Start     | End       | Strand       | gene_id         | transcript_id   |
-        | (object)     | (object)   | (int64)   | (int64)   | (category)   | (object)        | (object)        |
+        | (category)   | (object)   | (int64)   | (int64)   | (category)   | (object)        | (object)        |
         |--------------+------------+-----------+-----------+--------------+-----------------+-----------------|
         | 1            | intron     | 1173926   | 1174265   | +            | ENSG00000162571 | nan             |
         | 1            | intron     | 1174321   | 1174423   | +            | ENSG00000162571 | nan             |
@@ -207,7 +207,7 @@ def introns(self, by="gene", nb_cpu=1):
         >>> gr.features.introns(by="transcript")
         +--------------+------------+-----------+-----------+--------------+-----------------+-----------------+
         | Chromosome   | Feature    | Start     | End       | Strand       | gene_id         | transcript_id   |
-        | (object)     | (object)   | (int64)   | (int64)   | (category)   | (object)        | (object)        |
+        | (category)   | (object)   | (int64)   | (int64)   | (category)   | (object)        | (object)        |
         |--------------+------------+-----------+-----------+--------------+-----------------+-----------------|
         | 1            | intron     | 818202    | 818722    | +            | ENSG00000177757 | ENST00000326734 |
         | 1            | intron     | 960800    | 961292    | +            | ENSG00000187961 | ENST00000338591 |
@@ -241,7 +241,7 @@ def introns(self, by="gene", nb_cpu=1):
 
         result = pyrange_apply(_introns2, by_gr, exons, **kwargs)
 
-        return pr.PyRanges(result)
+        return pr.from_dfs(result)
 
 
 def _outside_bounds(df, **kwargs):
diff --git a/pyranges/methods/attr.py b/pyranges/methods/attr.py
index ca0f7bac..8ad77b7f 100644
--- a/pyranges/methods/attr.py
+++ b/pyranges/methods/attr.py
@@ -53,8 +53,8 @@ def _setattr(self, column_name, column, pos=False):
     if column_name not in ["Chromosome", "Strand"]:
         self.__dict__["dfs"] = dfs
     else:
-        # will merge the dfs, then split on keys again to ensure they are correct
-        self.__dict__["dfs"] = pr.PyRanges(pr.PyRanges(dfs).df).dfs
+        df = pd.concat(dfs.values())
+        self.__dict__["dfs"] = pr.PyRanges(df).dfs
 
 
 def _getattr(self, name):
diff --git a/pyranges/methods/concat.py b/pyranges/methods/concat.py
index a427194c..12419976 100644
--- a/pyranges/methods/concat.py
+++ b/pyranges/methods/concat.py
@@ -1,5 +1,3 @@
-from collections import defaultdict
-
 import pandas as pd
 
 import pyranges as pr
@@ -9,8 +7,7 @@ def concat(pyranges, strand=None):
     if not pyranges:
         return None
 
-    pyranges = [pr for pr in pyranges if not pr.empty]
-    grs_per_chromosome = defaultdict(list)
+    pyranges = [gr for gr in pyranges if not gr.empty]
 
     strand_info = [gr.stranded for gr in pyranges]
 
@@ -20,31 +17,4 @@ def concat(pyranges, strand=None):
     if strand:
         assert all([gr.stranded for gr in pyranges]), "Cannot do stranded concat, not all pyranges contain strand info."
 
-        for gr in pyranges:
-            for k, df in gr.dfs.items():
-                # dbg(df)
-                grs_per_chromosome[k].append(df)
-    else:
-        for gr in pyranges:
-            for chromosome in gr.chromosomes:
-                df = gr[chromosome].df
-                grs_per_chromosome[chromosome].append(df)
-
-    new_pyrange = {}
-
-    for k, v in grs_per_chromosome.items():
-        new_pyrange[k] = pd.concat(v, sort=False)
-
-    res = pr.multithreaded.process_results(new_pyrange.values(), new_pyrange.keys())
-
-    if any(strand_info) and not all(strand_info):
-        new_res = {}
-        for k, v in res.items():
-            v.loc[:, "Strand"] = v.Strand.cat.add_categories(["."])
-            new_res[k] = v.assign(Strand=v.Strand.fillna("."))
-        res = pr.PyRanges(new_res)
-        res.Strand = res.Strand
-    else:
-        res = pr.PyRanges(res)
-
-    return res
+    return pr.PyRanges(pd.concat([gr.df for gr in pyranges]))
diff --git a/pyranges/methods/getitem.py b/pyranges/methods/getitem.py
index cf0041ce..bdcd01b4 100644
--- a/pyranges/methods/getitem.py
+++ b/pyranges/methods/getitem.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 
-from pyranges import PyRanges
+import pyranges as pr
 from pyranges.methods.drop import _keep
 from pyranges.subset import get_booldict, get_slice, get_string, get_tuple
 
@@ -32,5 +32,4 @@ def _getitem(self, val):
     else:
         raise Exception("Not a valid subsetter: {}".format(str(val)))
 
-    gr = PyRanges(dfs)
-    return gr
+    return pr.from_dfs(dfs)
diff --git a/pyranges/methods/init.py b/pyranges/methods/init.py
index c9a8b58f..6c00f31d 100644
--- a/pyranges/methods/init.py
+++ b/pyranges/methods/init.py
@@ -3,7 +3,6 @@
 
 from pyranges import PyRanges
 from pyranges.genomicfeatures import GenomicFeaturesMethods
-from pyranges.helpers import get_key_from_df, single_value_key
 from pyranges.statistics import StatisticsMethods
 
 
@@ -45,44 +44,6 @@ def create_df_dict(df, stranded):
     return {k: v for k, v in df.groupby(grpby_key)}
 
 
-def create_pyranges_df(chromosomes, starts, ends, strands=None):
-    if isinstance(chromosomes, str) or isinstance(chromosomes, int):
-        chromosomes = pd.Series([chromosomes] * len(starts), dtype="category")
-
-    if strands is not None:
-        if isinstance(strands, str):
-            strands = pd.Series([strands] * len(starts), dtype="category")
-
-        columns = [chromosomes, starts, ends, strands]
-        lengths = list(str(len(s)) for s in columns)
-        assert (
-            len(set(lengths)) == 1
-        ), "chromosomes, starts, ends and strands must be of equal length. But are {}".format(", ".join(lengths))
-        colnames = "Chromosome Start End Strand".split()
-    else:
-        columns = [chromosomes, starts, ends]
-        lengths = list(str(len(s)) for s in columns)
-        assert len(set(lengths)) == 1, "chromosomes, starts and ends must be of equal length. But are {}".format(
-            ", ".join(lengths)
-        )
-        colnames = "Chromosome Start End".split()
-
-    idx = range(len(starts))
-    series_to_concat = []
-    for s in columns:
-        if isinstance(s, pd.Series):
-            s = pd.Series(s.values, index=idx)
-        else:
-            s = pd.Series(s, index=idx)
-
-        series_to_concat.append(s)
-
-    df = pd.concat(series_to_concat, axis=1)
-    df.columns = colnames
-
-    return df
-
-
 def check_strandedness(df):
     """Check whether strand contains '.'"""
 
@@ -102,68 +63,19 @@ def check_strandedness(df):
     return not contains_more_than_plus_minus_in_strand_col
 
 
-def _init(
-    self,
-    df=None,
-    chromosomes=None,
-    starts=None,
-    ends=None,
-    strands=None,
-    copy_df=True,
-):
-    # TODO: add categorize argument with dict of args to categorize?
-
+def _init(self, df: pd.DataFrame) -> None:
     if isinstance(df, PyRanges):
         raise Exception("Object is already a PyRange.")
 
-    if isinstance(df, pd.DataFrame):
-        assert all(
-            c in df for c in "Chromosome Start End".split()
-        ), "The dataframe does not have all the columns Chromosome, Start and End."
-        if copy_df:
-            df = df.copy()
-
-    if df is False or df is None:
-        df = create_pyranges_df(chromosomes, starts, ends, strands)
-
-    if isinstance(df, pd.DataFrame):
-        df = df.reset_index(drop=True)
-
-        stranded = check_strandedness(df)
-
-        df = set_dtypes(df)
-
-        self.__dict__["dfs"] = create_df_dict(df, stranded)
-
-    # df is actually dict of dfs
-    else:
-        empty_removed = {k: v.copy() for k, v in df.items() if not v.empty}
-
-        _single_value_key = True
-        _key_same = True
-        _strand_valid = True
-        _has_strand = True
-        for key, df in empty_removed.items():
-            _key = get_key_from_df(df)
-            _single_value_key = single_value_key(df) and _single_value_key
-            _key_same = (_key == key) and _key_same
-
-            if isinstance(_key, tuple):
-                _strand_valid = _strand_valid and (_key[1] in ["+", "-"])
-            else:
-                _has_strand = False
+    df = df.copy()
 
-        if not all([_single_value_key, _key_same, _strand_valid]):
-            df = pd.concat(empty_removed.values()).reset_index(drop=True)
+    df = df.reset_index(drop=True)
 
-            if _has_strand and _strand_valid:
-                empty_removed = df.groupby(["Chromosome", "Strand"])
-            else:
-                empty_removed = df.groupby("Chromosome")
+    stranded = check_strandedness(df)
 
-            empty_removed = {k: v for (k, v) in empty_removed}
+    df = set_dtypes(df)
 
-        self.__dict__["dfs"] = empty_removed
+    self.__dict__["dfs"] = create_df_dict(df, stranded)
 
     self.__dict__["features"] = GenomicFeaturesMethods(self)
     self.__dict__["stats"] = StatisticsMethods(self)
diff --git a/pyranges/multioverlap.py b/pyranges/multioverlap.py
index 3c58d662..0dc5fa8d 100644
--- a/pyranges/multioverlap.py
+++ b/pyranges/multioverlap.py
@@ -91,7 +91,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
     >>> pr.count_overlaps(grs)
     +--------------+-----------+-----------+-----------+-----------+-----------+
     | Chromosome   | Start     | End       | a         | b         | c         |
-    | (object)     | (int64)   | (int64)   | (int64)   | (int64)   | (int64)   |
+    | (category)   | (int64)   | (int64)   | (int64)   | (int64)   | (int64)   |
     |--------------+-----------+-----------+-----------+-----------+-----------|
     | chr1         | 6         | 8         | 1         | 0         | 0         |
     | chr1         | 8         | 10        | 1         | 0         | 1         |
@@ -106,7 +106,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
     Unstranded PyRanges object has 12 rows and 6 columns from 1 chromosomes.
     For printing, the PyRanges was sorted on Chromosome.
 
-    >>> gr = pr.PyRanges(chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40])
+    >>> gr = pr.from_args(chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40])
     >>> gr
     +--------------+-----------+-----------+
     | Chromosome   |     Start |       End |
diff --git a/pyranges/multithreaded.py b/pyranges/multithreaded.py
index 4a7fb3e0..40139ede 100644
--- a/pyranges/multithreaded.py
+++ b/pyranges/multithreaded.py
@@ -1,4 +1,3 @@
-import os
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Union
 
 import numpy as np
@@ -9,46 +8,6 @@
 if TYPE_CHECKING:
     from pyranges.pyranges_main import PyRanges
 
-ray = None
-
-
-def get_n_args(f: Callable) -> int:
-    import inspect
-
-    nparams = len(inspect.signature(f).parameters)
-    return nparams
-
-
-class suppress_stdout_stderr(object):
-    """
-    A context manager for doing a "deep suppression" of stdout and stderr in
-    Python, i.e. will suppress all print, even if the print originates in a
-    compiled C/Fortran sub-function.
-       This will not suppress raised exceptions, since exceptions are printed
-    to stderr just before a script exits, and after the context manager has
-    exited (at least, I think that is why it lets exceptions through).
-
-    """
-
-    def __init__(self):
-        # Open a pair of null files
-        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
-        # Save the actual stdout (1) and stderr (2) file descriptors.
-        self.save_fds = (os.dup(1), os.dup(2))
-
-    def __enter__(self):
-        # Assign the null pointers to stdout and stderr.
-        os.dup2(self.null_fds[0], 1)
-        os.dup2(self.null_fds[1], 2)
-
-    def __exit__(self, *_):
-        # Re-assign the real stdout/stderr back to (1) and (2)
-        os.dup2(self.save_fds[0], 1)
-        os.dup2(self.save_fds[1], 2)
-        # Close the null files
-        os.close(self.null_fds[0])
-        os.close(self.null_fds[1])
-
 
 def merge_dfs(df1: DataFrame, df2: DataFrame) -> DataFrame:
     if not df1.empty and not df2.empty:
@@ -67,11 +26,21 @@ def process_results(results: List[Any], keys: Union[List[str], List[Tuple[str, s
     results_dict = {k: r for k, r in zip(keys, results) if r is not None}
 
     try:
-        first_item = next(iter(results_dict.values()))
+        next(iter(results_dict.values()))
     except StopIteration:  # empty collection
         return results_dict
 
-    if not isinstance(first_item, pd.DataFrame):
+    # An arbitrary operation might make the keys in the dict and df out of sync.
+    # This fixes that by having the PyRanges initializer find the correct keys again.
+    try:
+        if all(isinstance(v, pd.DataFrame) for v in results_dict.values()):
+            df = pd.concat(results_dict.values())
+            import pyranges as pr
+
+            _results_dict = pr.PyRanges(df).dfs
+        else:
+            return results_dict
+    except (ValueError, TypeError):
         return results_dict
 
     to_delete = []
@@ -89,7 +58,7 @@ def process_results(results: List[Any], keys: Union[List[str], List[Tuple[str, s
     for k in to_delete:
         del results_dict[k]
 
-    return results_dict
+    return _results_dict
 
 
 def make_sparse(df: DataFrame) -> DataFrame:
@@ -122,27 +91,6 @@ def make_unary_sparse(kwargs: Dict[str, Any], df: DataFrame) -> DataFrame:
     return make_sparse(df) if sparse else df
 
 
-def ray_initialized():
-    def test_function():
-        pass
-
-    try:
-        test_function = ray.remote(test_function)
-    except Exception as e:
-        if isinstance(e, NameError):
-            return False
-
-        raise e
-
-    try:
-        test_function.remote()
-    except Exception as e:
-        if "RayConnectionError" in str(type(e)):
-            return True
-        else:
-            raise e
-
-
 def pyrange_apply(
     function: Callable, self: "PyRanges", other: "PyRanges", **kwargs
 ) -> Union[Dict[Tuple[str, str], Any], Dict[str, Any]]:
@@ -165,7 +113,6 @@ def pyrange_apply(
 
     results = []
 
-    items = natsorted(self.dfs.items())
     keys = natsorted(self.dfs.keys())
 
     dummy = pd.DataFrame(columns="Chromosome Start End".split())
@@ -174,7 +121,8 @@ def pyrange_apply(
     other_dfs = other.dfs
 
     if strandedness:
-        for (c, s), df in items:
+        for c, s in self.chromosomes_and_strands:
+            df = self._dfs_with_strand[c, s]
             os = strand_dict[s]
 
             if not (c, os) in other.keys() or len(other[c, os].values()) == 0:
@@ -183,39 +131,50 @@ def pyrange_apply(
                 odf = other[c, os].values()[0]
 
             df, odf = make_binary_sparse(kwargs, df, odf)
-            result = function(df, odf, **kwargs)
+
+            try:
+                result = function(df, odf, **kwargs)
+            except TypeError:
+                result = function(df, odf)
 
             results.append(result)
 
     else:
         if self.stranded and not other.stranded:
-            for (c, s), df in items:
+            for (c, s), df in self._dfs_with_strand.items():
                 if c not in other_chromosomes:
                     odf = dummy
                 else:
-                    odf = other_dfs[c]
+                    odf = other._dfs_without_strands[c]
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
-                result = function(df, odf, **kwargs)
+
+                try:
+                    result = function(df, odf, **kwargs)
+                except TypeError:
+                    result = function(df, odf)
                 results.append(result)
 
         elif not self.stranded and other.stranded:
-            for c, df in items:
+            for c, df in self._dfs_without_strand.items():
                 if c not in other_chromosomes:
                     odf = dummy
                 else:
-                    odf1 = other_dfs.get((c, "+"), dummy)  # type: ignore
-                    odf2 = other_dfs.get((c, "-"), dummy)  # type: ignore
+                    odf1 = other._dfs_with_strand.get((c, "+"), dummy)
+                    odf2 = other._dfs_with_strand.get((c, "-"), dummy)
 
                     odf = merge_dfs(odf1, odf2)
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
 
-                result = function(df, odf, **kwargs)
+                try:
+                    result = function(df, odf, **kwargs)
+                except TypeError:
+                    result = function(df, odf)
                 results.append(result)
 
         elif self.stranded and other.stranded:
-            for (c, s), df in self.items():  # type: ignore
+            for (c, s), df in self._dfs_with_strand.items():
                 if c not in other_chromosomes:
                     odfs = [dummy]
                 else:
@@ -233,19 +192,25 @@ def pyrange_apply(
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
 
-                result = function(df, odf, **kwargs)
+                try:
+                    result = function(df, odf, **kwargs)
+                except TypeError:
+                    result = function(df, odf)
                 results.append(result)
 
         else:
-            for c, df in items:
+            for c, df in self._dfs_without_strand.items():
                 if c not in other_chromosomes:
                     odf = dummy
                 else:
-                    odf = other_dfs[c]
+                    odf = other._dfs_without_strand[c]
 
                 df, odf = make_binary_sparse(kwargs, df, odf)
 
-                result = function(df, odf, **kwargs)
+                try:
+                    result = function(df, odf, **kwargs)
+                except TypeError:
+                    result = function(df, odf)
                 results.append(result)
 
     return process_results(results, keys)
@@ -259,26 +224,31 @@ def pyrange_apply_single(function: Callable, self: "PyRanges", **kwargs) -> Any:
 
     results = []
 
-    keys: Union[List[str], List[Tuple[str, str]]] = []  # type: ignore
+    keys: List = []
     if strand:
-        for (c, s), df in self.items():  # type: ignore
+        for (c, s), df in self._dfs_with_strand.items():  # type: ignore
             kwargs["chromosome"] = c
             _strand = s
             kwargs["strand"] = _strand
 
-            df = make_unary_sparse(kwargs, df)
-            result = function(df, **kwargs)
+            try:
+                result = function(df, **kwargs)
+            except TypeError:
+                result = function(df)
             results.append(result)
 
         keys = self.keys()
 
     elif not self.stranded:
-        for c, df in self.items():
+        for c, df in self._dfs_without_strand.items():
             kwargs["chromosome"] = c
             assert isinstance(c, str)
 
-            df = make_unary_sparse(kwargs, df)
-            result = function(df, **kwargs)
+            try:
+                result = function(df, **kwargs)
+            except TypeError:
+                result = function(df)
+
             results.append(result)
             keys.append(c)
 
@@ -296,11 +266,15 @@ def pyrange_apply_single(function: Callable, self: "PyRanges", **kwargs) -> Any:
             else:
                 df = dfs.values()[0]
 
-            df = make_unary_sparse(kwargs, df)
-            result = function(df, **kwargs)
-            results.append(result)
             keys.append(c)
 
+            try:
+                result = function(df, **kwargs)
+            except TypeError:
+                result = function(df)
+
+            results.append(result)
+
     return process_results(results, keys)
 
 
@@ -375,7 +349,7 @@ def _extend(df: DataFrame, **kwargs) -> DataFrame:
     return df
 
 
-def _extend_grp(df, **kwargs):
+def _extend_grp(df: pd.DataFrame, **kwargs):
     df = df.copy()
     dtype = df.Start.dtype
     slack = kwargs["ext"]
diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py
index 21278540..2b3b4590 100644
--- a/pyranges/pyranges_main.py
+++ b/pyranges/pyranges_main.py
@@ -1,5 +1,4 @@
 """Data structure for genomic intervals and their annotation."""
-from collections import defaultdict
 from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
@@ -98,7 +97,7 @@ class PyRanges:
     >>> pr.PyRanges()
     Empty PyRanges
 
-    >>> pr.PyRanges(chromosomes="chr1", starts=(1, 5), ends=[3, 149],
+    >>> pr.from_args(chromosomes="chr1", starts=(1, 5), ends=[3, 149],
     ...             strands=("+", "-"))
     +--------------+-----------+-----------+--------------+
     | Chromosome   |     Start |       End | Strand       |
@@ -163,21 +162,19 @@ class PyRanges:
     pyranges.stats.StatisticsMethods : namespace for statistics
     """
 
-    def __init__(
-        self,
-        df: Optional[Union[pd.DataFrame, Dict[Union[str], pd.DataFrame], Dict[Tuple[str, str], pd.DataFrame]]] = None,
-        chromosomes: Optional[str] = None,
-        starts: Optional[Tuple[int, int]] = None,
-        ends: Optional[List[int]] = None,
-        strands: Optional[Tuple[str, str]] = None,
-        copy_df: bool = True,
-    ) -> None:
+    def __init__(self, df: Optional[pd.DataFrame] = None) -> None:
         from pyranges.methods.init import _init
 
-        if df is None and chromosomes is None:
-            df = pd.DataFrame(columns="Chromosome Start End".split())
+        if df is None:
+            _df = pd.DataFrame(columns="Chromosome Start End".split())
+        else:
+            _df = df
 
-        _init(self, df, chromosomes, starts, ends, strands, copy_df)
+        assert all(
+            c in _df.columns for c in "Chromosome Start End".split()
+        ), f"The dataframe does not have all the columns Chromosome, Start and End: {_df}"
+
+        _init(self, _df)
 
     def __array_ufunc__(self, *args, **kwargs) -> "PyRanges":
         """Apply unary numpy-function.
@@ -230,14 +227,10 @@ def __array_ufunc__(self, *args, **kwargs) -> "PyRanges":
         for chromosome, df in gr:
             subset = df.head(1)[non_index].select_dtypes(include=np.number).columns
             _v = getattr(func, call)(df[subset], **kwargs)
-            # print(_v)
-            # print(df[_c])
             df[subset] = _v
 
         return gr
 
-        # self.apply()
-
     def __getattr__(self, name: str) -> pd.Series:
         """Return column.
 
@@ -557,9 +550,7 @@ def apply(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRang
         kwargs.update(kwargs.get("kwargs", {}))
         kwargs = fill_kwargs(kwargs)
 
-        result = pyrange_apply_single(f, self, **kwargs)
-
-        return PyRanges(result)
+        return pr.from_dfs(pyrange_apply_single(f, self, **kwargs))
 
     def apply_general(
         self, f: Callable, strand: Optional[bool] = None, **kwargs
@@ -588,7 +579,8 @@ def apply_general(
         --------
 
         pyranges.PyRanges.apply: apply a function to a PyRanges and return a PyRanges
-        pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges
+        pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges and return a PyRanges
+        pyranges.PyRanges.apply_pair_general: apply a function to a pair of PyRanges and return a dict
 
         Note
         ----
@@ -617,9 +609,7 @@ def apply_general(
 
         return pyrange_apply_single(f, self, **kwargs)
 
-    def apply_pair(
-        self, other: "PyRanges", f: Callable, strandedness: None = None, as_pyranges: bool = True, **kwargs
-    ) -> Union[Dict[Tuple[str, str], Tuple[int, int]], "PyRanges"]:
+    def apply_pair(self, other: "PyRanges", f: Callable, strandedness: None = None, **kwargs) -> "PyRanges":
         """Apply a function to a pair of PyRanges.
 
         The function is applied to each chromosome or chromosome/strand pair found in at least one
@@ -630,6 +620,8 @@ def apply_pair(
         f : function
             Row-based or associative function to apply on the pd.DataFrames.
 
+        other : PyRanges
+
         strandedness : {None, "same", "opposite", False}, default None, i.e. auto
 
             Whether to compare PyRanges on the same strand, the opposite or ignore strand
@@ -709,10 +701,89 @@ def apply_pair(
         +--------------+-----------+-----------+------------+-----------+--------------+
         Stranded PyRanges object has 2 rows and 6 columns from 1 chromosomes.
         For printing, the PyRanges was sorted on Chromosome and Strand.
+        """
 
-        >>> f1.apply_pair(f2, lambda df, df2: (len(df), len(df2)), as_pyranges=False)
-        {('chr1', '+'): (2, 2), ('chr1', '-'): (1, 2)}
+        kwargs.update({"strandedness": strandedness})
+        kwargs.update(kwargs.get("kwargs", {}))
+        kwargs = fill_kwargs(kwargs)
+
+        result = pyrange_apply(f, self, other, **kwargs)
+
+        return pr.from_dfs(result)
+
+    def apply_pair_general(
+        self, other: "PyRanges", f: Callable, strandedness: None = None, **kwargs
+    ) -> Union[Dict[str, Any], Dict[Tuple[str, str], Any]]:
+        """Apply a function to a pair of PyRanges.
+
+        The function is applied to each chromosome or chromosome/strand pair found in at least one
+        of the PyRanges.
+
+        Parameters
+        ----------
+        f : function
+            Row-based or associative function to apply on the pd.DataFrames.
+
+        other : PyRanges
+
+        strandedness : {None, "same", "opposite", False}, default None, i.e. auto
+
+            Whether to compare PyRanges on the same strand, the opposite or ignore strand
+            information. The default, None, means use "same" if both PyRanges are stranded,
+            otherwise ignore the strand information.
+
+        **kwargs
+            Additional keyword arguments to pass as keyword arguments to `f`
+
+        Returns
+        -------
+        dict of lists
+            Result of applying f to each partition of the pd.DataFrames in the PyRanges.
+
+        See also
+        --------
+
+        pyranges.PyRanges.apply: apply a function to a PyRanges and return a PyRanges
+        pyranges.PyRanges.apply_general: apply a function to a PyRanges and return a dict
+        pyranges.PyRanges.apply_pair: apply a function to a pair of PyRanges and return a PyRanges
+        pyranges.iter: iterate over two or more PyRanges
+
+        Note
+        ----
+
+        This is the function used internally to carry out almost all comparison functions in
+        PyRanges.
+
+        Examples
+        --------
+
+        >>> f1 = pr.data.f1()
+        >>> f1
+        +--------------+-----------+-----------+------------+-----------+--------------+
+        | Chromosome   |     Start |       End | Name       |     Score | Strand       |
+        | (category)   |   (int64) |   (int64) | (object)   |   (int64) | (category)   |
+        |--------------+-----------+-----------+------------+-----------+--------------|
+        | chr1         |         3 |         6 | interval1  |         0 | +            |
+        | chr1         |         8 |         9 | interval3  |         0 | +            |
+        | chr1         |         5 |         7 | interval2  |         0 | -            |
+        +--------------+-----------+-----------+------------+-----------+--------------+
+        Stranded PyRanges object has 3 rows and 6 columns from 1 chromosomes.
+        For printing, the PyRanges was sorted on Chromosome and Strand.
+
+        >>> f2 = pr.data.f2()
+        >>> f2
+        +--------------+-----------+-----------+------------+-----------+--------------+
+        | Chromosome   |     Start |       End | Name       |     Score | Strand       |
+        | (category)   |   (int64) |   (int64) | (object)   |   (int64) | (category)   |
+        |--------------+-----------+-----------+------------+-----------+--------------|
+        | chr1         |         1 |         2 | a          |         0 | +            |
+        | chr1         |         6 |         7 | b          |         0 | -            |
+        +--------------+-----------+-----------+------------+-----------+--------------+
+        Stranded PyRanges object has 2 rows and 6 columns from 1 chromosomes.
+        For printing, the PyRanges was sorted on Chromosome and Strand.
 
+        >>> f1.apply_pair_general(f2, lambda df, df2: (len(df), len(df2)))
+        {('chr1', '+'): (2, 2), ('chr1', '-'): (1, 2)}
         """
 
         kwargs.update({"strandedness": strandedness})
@@ -720,11 +791,7 @@ def apply_pair(
         kwargs = fill_kwargs(kwargs)
 
         result = pyrange_apply(f, self, other, **kwargs)
-
-        if not as_pyranges:
-            return result
-        else:
-            return PyRanges(result)
+        return result
 
     def as_df(self) -> pd.DataFrame:
         """Return PyRanges as pd.DataFrame.
@@ -943,7 +1010,7 @@ def boundaries(self, group_by: str, agg: Optional[Dict[str, Union[str, Callable]
         kwargs = fill_kwargs(kwargs)
 
         result = pyrange_apply_single(_bounds, self, **kwargs)
-        return pr.PyRanges(result)
+        return pr.from_dfs(result)
 
     def calculate_frame(self, by: Union[str, List[str]]) -> "PyRanges":
         """Calculate the frame of each genomic interval, assuming all are coding sequences (CDS), and add it as column inplace.
@@ -1040,6 +1107,15 @@ def chromosomes(self) -> List[str]:
         else:
             return natsorted(set([k for k in self.keys()]))
 
+    @property
+    def chromosomes_and_strands(self) -> List[Tuple[str, str]]:
+        """Return the (chromosome, strand) keys in natsorted order; raise ValueError if unstranded."""
+        if self.stranded:
+            # Deduplicate the keys, then natural-sort them.
+            return natsorted(set(self.keys()))
+
+        raise ValueError("PyRanges is not stranded.")
+
     def cluster(
         self,
         strand: Optional[bool] = None,
@@ -1187,7 +1263,7 @@ def cluster(
             kwargs["by"] = by
             df = pyrange_apply_single(_cluster_by, _self, **kwargs)
 
-        gr = PyRanges(df)
+        gr = pr.from_dfs(df)
 
         # each chromosome got overlapping ids (0 to len). Need to make unique!
         new_dfs = {}
@@ -1362,7 +1438,7 @@ def count_overlaps(
 
         counts = pyrange_apply(_number_overlapping, self, other, **kwargs)
 
-        return pr.PyRanges(counts)
+        return pr.from_dfs(counts)
 
     def coverage(
         self,
@@ -1474,7 +1550,7 @@ def coverage(
 
         from pyranges.methods.coverage import _coverage
 
-        counts = pr.PyRanges(pyrange_apply(_coverage, counts, other, **kwargs))
+        counts = pr.from_dfs(pyrange_apply(_coverage, counts, other, **kwargs))
 
         return counts
 
@@ -1647,7 +1723,7 @@ def drop_duplicate_positions(self, strand: Optional[bool] = None, keep: Union[bo
 
         kwargs = {"sparse": {"self": False}, "keep": keep, "strand": strand and self.stranded}
         kwargs = fill_kwargs(kwargs)
-        return PyRanges(pyrange_apply_single(_drop_duplicate_positions, self, **kwargs))
+        return pr.from_dfs(pyrange_apply_single(_drop_duplicate_positions, self, **kwargs))
 
     @property
     def dtypes(self) -> pd.Series:
@@ -1782,15 +1858,11 @@ def extend(self, ext: Union[Dict[str, int], int], group_by: None = None) -> "PyR
         if isinstance(ext, dict):
             assert self.stranded, "PyRanges must be stranded to add 5/3-end specific extend."
 
-        kwargs = fill_kwargs({"ext": ext, "strand": self.stranded})
-
-        if group_by is None:
-            prg = PyRanges(pyrange_apply_single(_extend, self, **kwargs))
-        else:
-            kwargs["group_by"] = group_by
-            prg = PyRanges(pyrange_apply_single(_extend_grp, self, **kwargs))
+        kwargs = fill_kwargs({"ext": ext, "strand": self.stranded, "group_by": group_by})
+        func = _extend if group_by is None else _extend_grp
+        dfs = pyrange_apply_single(func, self, **kwargs)
 
-        return prg
+        return pr.from_dfs(dfs)
 
     # # TODO: use subtract code here instead, easier
     # def no_overlap(self, other, **kwargs):
@@ -1861,7 +1933,7 @@ def five_end(self) -> "PyRanges":
 
         assert self.stranded, "Need stranded pyrange to find 5'."
         kwargs = fill_kwargs({"strand": self.stranded})
-        return PyRanges(pyrange_apply_single(_tss, self, **kwargs))
+        return pr.from_dfs(pyrange_apply_single(_tss, self, **kwargs))
 
     def head(self, n: int = 8) -> "PyRanges":
         """Return the n first rows.
@@ -2181,7 +2253,7 @@ def intersect(
             self.__ix__ = np.arange(len(self))
 
         dfs = pyrange_apply(_intersection, self, other, **kwargs)
-        result = pr.PyRanges(dfs)
+        result = pr.from_dfs(dfs)
 
         if invert:
             found_idxs = getattr(result, "__ix__", [])
@@ -2404,7 +2476,7 @@ def join(
             kwargs["example_header_self"] = self.head(1).df
 
         dfs = pyrange_apply(_write_both, self, other, **kwargs)
-        gr = PyRanges(dfs)
+        gr = pr.from_dfs(dfs)
 
         if slack and len(gr) > 0:
             gr.Start = gr.Start__slack
@@ -2451,357 +2523,6 @@ def keys(self) -> Union[List[str], List[Tuple[str, str]]]:
 
         return natsorted(self.dfs.keys())
 
-    def k_nearest(
-        self,
-        other: "PyRanges",
-        k: Union[List[int], int] = 1,
-        ties: Optional[str] = None,
-        strandedness: None = None,
-        overlap: bool = True,
-        how: Optional[str] = None,
-        suffix: str = "_b",
-        nb_cpu: int = 1,
-        apply_strand_suffix: None = None,
-    ) -> "PyRanges":
-        """Find k nearest intervals.
-
-        Parameters
-        ----------
-        other : PyRanges
-
-            PyRanges to find nearest interval in.
-
-        k : int or list/array/pd.Series of int
-
-            Number of closest to return. If iterable, must be same length as PyRanges.
-
-        ties : {None, "first", "last", "different"}, default None
-
-            How to resolve ties, i.e. closest intervals with equal distance. None means that the k nearest intervals are kept.
-            "first" means that the first tie is kept, "last" meanst that the last is kept.
-            "different" means that all nearest intervals with the k unique nearest distances are kept.
-
-        strandedness : {None, "same", "opposite", False}, default None, i.e. auto
-
-            Whether to compare PyRanges on the same strand, the opposite or ignore strand
-            information. The default, None, means use "same" if both PyRanges are stranded,
-            otherwise ignore the strand information.
-
-        overlap : bool, default True
-
-            Whether to include overlaps.
-
-        how : {None, "upstream", "downstream"}, default None, i.e. both directions
-
-            Whether to only look for nearest in one direction. Always with respect to the PyRanges
-            it is called on.
-
-        suffix : str, default "_b"
-
-            Suffix to give columns with shared name in other.
-
-        apply_strand_suffix : bool, default None
-
-            If first pyranges is unstranded, but the second is not, the first will be given a strand column.
-            apply_strand_suffix makes the added strand column a regular data column instead by adding a suffix.
-
-
-        nb_cpu: int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
-        Returns
-        -------
-        PyRanges
-
-            A PyRanges with columns of nearest interval horizontally appended.
-
-        Notes
-        -----
-
-        nearest also exists, and is more performant.
-
-        See also
-        --------
-
-        PyRanges.new_position : give joined PyRanges new coordinates
-        PyRanges.nearest : find nearest intervals
-
-        Examples
-        --------
-
-        >>> f1 = pr.from_dict({'Chromosome': ['chr1', 'chr1', 'chr1'], 'Start': [3, 8, 5],
-        ...                    'End': [6, 9, 7], 'Strand': ['+', '+', '-']})
-        >>> f1
-        +--------------+-----------+-----------+--------------+
-        | Chromosome   |     Start |       End | Strand       |
-        | (category)   |   (int64) |   (int64) | (category)   |
-        |--------------+-----------+-----------+--------------|
-        | chr1         |         3 |         6 | +            |
-        | chr1         |         8 |         9 | +            |
-        | chr1         |         5 |         7 | -            |
-        +--------------+-----------+-----------+--------------+
-        Stranded PyRanges object has 3 rows and 4 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome and Strand.
-
-        >>> f2 = pr.from_dict({'Chromosome': ['chr1', 'chr1'], 'Start': [1, 6],
-        ...                    'End': [2, 7], 'Strand': ['+', '-']})
-        >>> f2
-        +--------------+-----------+-----------+--------------+
-        | Chromosome   |     Start |       End | Strand       |
-        | (category)   |   (int64) |   (int64) | (category)   |
-        |--------------+-----------+-----------+--------------|
-        | chr1         |         1 |         2 | +            |
-        | chr1         |         6 |         7 | -            |
-        +--------------+-----------+-----------+--------------+
-        Stranded PyRanges object has 2 rows and 4 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome and Strand.
-
-        >>> f1.k_nearest(f2, k=2)
-        +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+
-        | Chromosome   |     Start |       End | Strand       |   Start_b |     End_b | Strand_b     |   Distance |
-        | (category)   |   (int64) |   (int64) | (category)   |   (int64) |   (int64) | (category)   |    (int64) |
-        |--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------|
-        | chr1         |         3 |         6 | +            |         6 |         7 | -            |          1 |
-        | chr1         |         3 |         6 | +            |         1 |         2 | +            |         -2 |
-        | chr1         |         8 |         9 | +            |         6 |         7 | -            |         -2 |
-        | chr1         |         8 |         9 | +            |         1 |         2 | +            |         -7 |
-        | chr1         |         5 |         7 | -            |         6 |         7 | -            |          0 |
-        | chr1         |         5 |         7 | -            |         1 |         2 | +            |          4 |
-        +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+
-        Stranded PyRanges object has 6 rows and 8 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome and Strand.
-
-        >>> f1.k_nearest(f2, how="upstream", k=2)
-        +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+
-        | Chromosome   |     Start |       End | Strand       |   Start_b |     End_b | Strand_b     |   Distance |
-        | (category)   |   (int64) |   (int64) | (category)   |   (int64) |   (int64) | (category)   |    (int64) |
-        |--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------|
-        | chr1         |         3 |         6 | +            |         1 |         2 | +            |         -2 |
-        | chr1         |         8 |         9 | +            |         6 |         7 | -            |         -2 |
-        | chr1         |         8 |         9 | +            |         1 |         2 | +            |         -7 |
-        | chr1         |         5 |         7 | -            |         6 |         7 | -            |          0 |
-        +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+
-        Stranded PyRanges object has 4 rows and 8 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome and Strand.
-
-        >>> f1.k_nearest(f2, k=[1, 2, 1])
-        +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+
-        | Chromosome   |     Start |       End | Strand       |   Start_b |     End_b | Strand_b     |   Distance |
-        | (category)   |   (int64) |   (int64) | (category)   |   (int64) |   (int64) | (category)   |    (int64) |
-        |--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------|
-        | chr1         |         3 |         6 | +            |         6 |         7 | -            |          1 |
-        | chr1         |         8 |         9 | +            |         6 |         7 | -            |         -2 |
-        | chr1         |         8 |         9 | +            |         1 |         2 | +            |         -7 |
-        | chr1         |         5 |         7 | -            |         6 |         7 | -            |          0 |
-        +--------------+-----------+-----------+--------------+-----------+-----------+--------------+------------+
-        Stranded PyRanges object has 4 rows and 8 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome and Strand.
-
-        >>> d1 = {"Chromosome": [1], "Start": [5], "End": [6]}
-        >>> d2 = {"Chromosome": 1, "Start": [1] * 2 + [5] * 2 + [9] * 2,
-        ...       "End": [3] * 2 + [7] * 2 + [11] * 2, "ID": range(6)}
-        >>> gr, gr2 = pr.from_dict(d1), pr.from_dict(d2)
-
-        >>> gr
-        +--------------+-----------+-----------+
-        |   Chromosome |     Start |       End |
-        |   (category) |   (int64) |   (int64) |
-        |--------------+-----------+-----------|
-        |            1 |         5 |         6 |
-        +--------------+-----------+-----------+
-        Unstranded PyRanges object has 1 rows and 3 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
-
-        >>> gr2
-        +--------------+-----------+-----------+-----------+
-        |   Chromosome |     Start |       End |        ID |
-        |   (category) |   (int64) |   (int64) |   (int64) |
-        |--------------+-----------+-----------+-----------|
-        |            1 |         1 |         3 |         0 |
-        |            1 |         1 |         3 |         1 |
-        |            1 |         5 |         7 |         2 |
-        |            1 |         5 |         7 |         3 |
-        |            1 |         9 |        11 |         4 |
-        |            1 |         9 |        11 |         5 |
-        +--------------+-----------+-----------+-----------+
-        Unstranded PyRanges object has 6 rows and 4 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
-
-        >>> gr.k_nearest(gr2, k=2)
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        |   Chromosome |     Start |       End |   Start_b |     End_b |        ID |   Distance |
-        |   (category) |   (int64) |   (int64) |   (int64) |   (int64) |   (int64) |    (int64) |
-        |--------------+-----------+-----------+-----------+-----------+-----------+------------|
-        |            1 |         5 |         6 |         5 |         7 |         2 |          0 |
-        |            1 |         5 |         6 |         5 |         7 |         3 |          0 |
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        Unstranded PyRanges object has 2 rows and 7 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
-
-        >>> gr.k_nearest(gr2, k=2, ties="different")
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        |   Chromosome |     Start |       End |   Start_b |     End_b |        ID |   Distance |
-        |   (category) |   (int64) |   (int64) |   (int64) |   (int64) |   (int64) |    (int64) |
-        |--------------+-----------+-----------+-----------+-----------+-----------+------------|
-        |            1 |         5 |         6 |         5 |         7 |         2 |          0 |
-        |            1 |         5 |         6 |         5 |         7 |         3 |          0 |
-        |            1 |         5 |         6 |         1 |         3 |         1 |         -3 |
-        |            1 |         5 |         6 |         1 |         3 |         0 |         -3 |
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        Unstranded PyRanges object has 4 rows and 7 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
-
-        >>> gr.k_nearest(gr2, k=3, ties="first")
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        |   Chromosome |     Start |       End |   Start_b |     End_b |        ID |   Distance |
-        |   (category) |   (int64) |   (int64) |   (int64) |   (int64) |   (int64) |    (int64) |
-        |--------------+-----------+-----------+-----------+-----------+-----------+------------|
-        |            1 |         5 |         6 |         5 |         7 |         2 |          0 |
-        |            1 |         5 |         6 |         1 |         3 |         1 |         -3 |
-        |            1 |         5 |         6 |         9 |        11 |         4 |          4 |
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        Unstranded PyRanges object has 3 rows and 7 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
-
-        >>> gr.k_nearest(gr2, k=1, overlap=False)
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        |   Chromosome |     Start |       End |   Start_b |     End_b |        ID |   Distance |
-        |   (category) |   (int64) |   (int64) |   (int64) |   (int64) |   (int64) |    (int64) |
-        |--------------+-----------+-----------+-----------+-----------+-----------+------------|
-        |            1 |         5 |         6 |         1 |         3 |         1 |         -3 |
-        +--------------+-----------+-----------+-----------+-----------+-----------+------------+
-        Unstranded PyRanges object has 1 rows and 7 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome.
-        """
-
-        from sorted_nearest import get_all_ties, get_different_ties  # type: ignore
-
-        from pyranges.methods.k_nearest import _nearest  # type: ignore
-
-        kwargs = {
-            "strandedness": strandedness,
-            "how": how,
-            "overlap": overlap,
-            "nb_cpu": nb_cpu,
-            "k": k,
-            "ties": ties,
-        }
-        kwargs = fill_kwargs(kwargs)
-        kwargs["stranded"] = self.stranded and other.stranded
-
-        _self = self.copy()
-
-        if isinstance(k, pd.Series):
-            k = k.values
-
-        # how many to nearest to find; might be different for each
-        _self.__k__ = k
-        # give each their own unique ID
-        _self.__IX__ = np.arange(len(_self))
-
-        dfs = pyrange_apply(_nearest, _self, other, **kwargs)
-        nearest = PyRanges(dfs)
-
-        if not overlap:
-            result = nearest
-        else:
-            overlap_how = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")]  # type: ignore
-            overlaps = _self.join(
-                other,
-                strandedness=strandedness,
-                how=overlap_how,
-                nb_cpu=nb_cpu,
-                apply_strand_suffix=apply_strand_suffix,
-            )
-            overlaps.Distance = 0
-            result = pr.concat([overlaps, nearest])
-
-        if not len(result):
-            return pr.PyRanges()
-        new_result = {}
-        if ties in ["first", "last"]:
-            for c, df in result:
-                df = df.sort_values(["__IX__", "Distance"])
-                grpby = df.groupby("__k__", sort=False)
-                dfs = []
-                for k, kdf in grpby:
-                    grpby2 = kdf.groupby("__IX__", sort=False)
-                    _df = grpby2.head(k)
-                    dfs.append(_df)
-
-                if dfs:
-                    new_result[c] = pd.concat(dfs)
-
-        elif ties == "different" or not ties:
-            for c, df in result:
-                if df.empty:
-                    continue
-                dfs = []
-
-                df = df.sort_values(["__IX__", "Distance"])
-                grpby = df.groupby("__k__", sort=False)
-
-                for k, kdf in grpby:
-                    if ties:
-                        lx = get_different_ties(
-                            kdf.index.values,
-                            kdf.__IX__.values,
-                            kdf.Distance.astype(np.int64).values,
-                            k,
-                        )
-                        _df = kdf.reindex(lx)
-                    else:
-                        lx = get_all_ties(
-                            kdf.index.values,
-                            kdf.__IX__.values,
-                            kdf.Distance.astype(np.int64).values,
-                            k,
-                        )
-                        _df = kdf.reindex(lx)
-                        _df = _df.groupby("__IX__").head(k)
-                    dfs.append(_df)
-
-                if dfs:
-                    new_result[c] = pd.concat(dfs)
-
-        result = pr.PyRanges(new_result)
-
-        if not result.__IX__.is_monotonic_increasing:
-            result = result.sort("__IX__")
-
-        result = result.drop(like="__IX__|__k__")
-
-        _self = _self.drop(like="__k__|__IX__")
-
-        def prev_to_neg(df, **kwargs):
-            strand = df.Strand.iloc[0] if "Strand" in df else "+"
-
-            suffix = kwargs["suffix"]
-
-            bools = df["End" + suffix] < df.Start
-            if not strand == "+":
-                bools = ~bools
-
-            df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
-            return df
-
-        result = result.apply(prev_to_neg, suffix=kwargs["suffix"])
-
-        if not _self.stranded and other.stranded:
-            if apply_strand_suffix is None:
-                import sys
-
-                print(
-                    "join: Strand data from other will be added as strand data to self.\nIf this is undesired use the flag apply_strand_suffix=False.\nTo turn off the warning set apply_strand_suffix to True or False.",
-                    file=sys.stderr,
-                )
-            elif apply_strand_suffix:
-                result.columns = result.columns.str.replace("Strand", "Strand" + kwargs["suffix"])
-
-        return result
-
     @property
     def length(self) -> int:
         """Return the total length of the intervals.
@@ -2968,9 +2689,9 @@ def max_disjoint(self, strand: Optional[bool] = None, slack: int = 0, **kwargs)
 
         from pyranges.methods.max_disjoint import _max_disjoint
 
-        df = pyrange_apply_single(_max_disjoint, self, **kwargs)
+        dfs = pyrange_apply_single(_max_disjoint, self, **kwargs)
 
-        return pr.PyRanges(df)
+        return pr.from_dfs(dfs)
 
     def merge(
         self,
@@ -3120,7 +2841,7 @@ def merge(
 
             df = pyrange_apply_single(_merge_by, self, **kwargs)
 
-        return PyRanges(df)
+        return pr.from_dfs(df)
 
     def mp(self, n: int = 8, formatting: None = None) -> None:
         """Merge location and print.
@@ -3300,7 +3021,7 @@ def nearest(
             assert other.stranded, "If doing upstream or downstream nearest, other pyranges must be stranded"
 
         dfs = pyrange_apply(_nearest, self, other, **kwargs)
-        gr = PyRanges(dfs)
+        gr = pr.from_dfs(dfs)
 
         if not self.stranded and other.stranded:
             if apply_strand_suffix is None:
@@ -3460,7 +3181,7 @@ def new_position(self, new_pos: str, columns: Optional[Tuple[str, str, str, str]
 
         dfs = pyrange_apply_single(_new_position, self, **kwargs)
 
-        return pr.PyRanges(dfs)
+        return pr.from_dfs(dfs)
 
     def overlap(
         self,
@@ -3603,7 +3324,7 @@ def overlap(
             self.__ix__ = np.arange(len(self))
 
         dfs = pyrange_apply(_overlap, self, other, **kwargs)
-        result = pr.PyRanges(dfs)
+        result = pr.from_dfs(dfs)
 
         if invert:
             found_idxs = getattr(result, "__ix__", [])
@@ -3965,7 +3686,7 @@ def set_intersect(
         other_clusters = other.merge(strand=strand)
         dfs = pyrange_apply(_intersection, self_clusters, other_clusters, **kwargs)
 
-        return PyRanges(dfs)
+        return pr.from_dfs(dfs)
 
     def set_union(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges":
         """Return set-theoretical union.
@@ -4166,7 +3887,7 @@ def sort(self, by: Optional[str] = None, nb_cpu: int = 1) -> "PyRanges":
             kwargs["by"] = by
 
         kwargs = fill_kwargs(kwargs)
-        return PyRanges(pyrange_apply_single(_sort, self, **kwargs))
+        return pr.from_dfs(pyrange_apply_single(_sort, self, **kwargs))
 
     def sp(self, n=30, formatting=None):
         """Sort on location and print.
@@ -4200,7 +3921,7 @@ def spliced_subsequence(
         end: Optional[int] = None,
         by: Optional[str] = None,
         strand: Optional[bool] = None,
-        **kwargs
+        **kwargs,
     ) -> "PyRanges":
         """Get subsequences of the intervals, using coordinates mapping to spliced transcripts (without introns)
 
@@ -4342,7 +4063,7 @@ def spliced_subsequence(
 
         result = pyrange_apply_single(_spliced_subseq, sorted_p, **kwargs)
 
-        return pr.PyRanges(result)
+        return pr.from_dfs(result)
 
     def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRanges":
         """Split into non-overlapping intervals.
@@ -4388,38 +4109,38 @@ def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRang
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
         >>> gr.split()
-        +--------------+-----------+-----------+------------+
-        | Chromosome   |     Start |       End | Strand     |
-        | (object)     |   (int64) |   (int64) | (object)   |
-        |--------------+-----------+-----------+------------|
-        | chr1         |         3 |         5 | +          |
-        | chr1         |         5 |         6 | +          |
-        | chr1         |         6 |         9 | +          |
-        | chr1         |         5 |         7 | -          |
-        | chr1         |        11 |        12 | -          |
-        +--------------+-----------+-----------+------------+
+        +--------------+-----------+-----------+--------------+
+        | Chromosome   |     Start |       End | Strand       |
+        | (category)   |   (int64) |   (int64) | (category)   |
+        |--------------+-----------+-----------+--------------|
+        | chr1         |         3 |         5 | +            |
+        | chr1         |         5 |         6 | +            |
+        | chr1         |         6 |         9 | +            |
+        | chr1         |         5 |         7 | -            |
+        | chr1         |        11 |        12 | -            |
+        +--------------+-----------+-----------+--------------+
         Stranded PyRanges object has 5 rows and 4 columns from 1 chromosomes.
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
         >>> gr.split(between=True)
-        +--------------+-----------+-----------+------------+
-        | Chromosome   |     Start |       End | Strand     |
-        | (object)     |   (int64) |   (int64) | (object)   |
-        |--------------+-----------+-----------+------------|
-        | chr1         |         3 |         5 | +          |
-        | chr1         |         5 |         6 | +          |
-        | chr1         |         6 |         9 | +          |
-        | chr1         |         5 |         7 | -          |
-        | chr1         |         7 |        11 | -          |
-        | chr1         |        11 |        12 | -          |
-        +--------------+-----------+-----------+------------+
+        +--------------+-----------+-----------+--------------+
+        | Chromosome   |     Start |       End | Strand       |
+        | (category)   |   (int64) |   (int64) | (category)   |
+        |--------------+-----------+-----------+--------------|
+        | chr1         |         3 |         5 | +            |
+        | chr1         |         5 |         6 | +            |
+        | chr1         |         6 |         9 | +            |
+        | chr1         |         5 |         7 | -            |
+        | chr1         |         7 |        11 | -            |
+        | chr1         |        11 |        12 | -            |
+        +--------------+-----------+-----------+--------------+
         Stranded PyRanges object has 6 rows and 4 columns from 1 chromosomes.
         For printing, the PyRanges was sorted on Chromosome and Strand.
 
         >>> gr.split(strand=False)
         +--------------+-----------+-----------+
         | Chromosome   |     Start |       End |
-        | (object)     |   (int64) |   (int64) |
+        | (category)   |   (int64) |   (int64) |
         |--------------+-----------+-----------|
         | chr1         |         3 |         5 |
         | chr1         |         5 |         6 |
@@ -4433,7 +4154,7 @@ def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRang
         >>> gr.split(strand=False, between=True)
         +--------------+-----------+-----------+
         | Chromosome   |     Start |       End |
-        | (object)     |   (int64) |   (int64) |
+        | (category)   |   (int64) |   (int64) |
         |--------------+-----------+-----------|
         | chr1         |         3 |         5 |
         | chr1         |         5 |         6 |
@@ -4455,7 +4176,7 @@ def split(self, strand: Optional[bool] = None, between: bool = False) -> "PyRang
 
         df = pyrange_apply_single(_split, self, **kwargs)
 
-        split = pr.PyRanges(df)
+        split = pr.from_dfs(df)
         if not between:
             strandedness: Union[str, bool] = "same" if strand else False
             split = split.overlap(self, strandedness=strandedness)
@@ -4660,7 +4381,7 @@ def subsequence(
         end: Optional[int] = None,
         by: Optional[str] = None,
         strand: Optional[bool] = None,
-        **kwargs
+        **kwargs,
     ) -> "PyRanges":
         """Get subsequences of the intervals.
 
@@ -4792,7 +4513,7 @@ def subsequence(
 
         result = pyrange_apply_single(_subseq, self, **kwargs)
 
-        return pr.PyRanges(result)
+        return pr.from_dfs(result)
 
     def subtract(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1) -> "PyRanges":
         """Subtract intervals.
@@ -4864,13 +4585,13 @@ def subtract(self, other: "PyRanges", strandedness: None = None, nb_cpu: int = 1
         strand = True if strandedness else False
         other_clusters = other.merge(strand=strand)
 
-        self = self.count_overlaps(other_clusters, strandedness=strandedness, overlap_col="__num__")
+        _self = self.copy()
 
-        result = pyrange_apply(_subtraction, self, other_clusters, **kwargs)
+        _self = _self.count_overlaps(other_clusters, strandedness=strandedness, overlap_col="__num__")
 
-        self = self.drop("__num__")
+        result = pyrange_apply(_subtraction, _self, other_clusters, **kwargs)
 
-        return PyRanges(result).drop("__num__")
+        return pr.from_dfs(result).drop("__num__")
 
     def summary(self, to_stdout: bool = True, return_df: bool = False) -> Optional[pd.DataFrame]:
         """Return info.
@@ -5121,7 +4842,7 @@ def tile(self, tile_size: int, overlap: bool = False, strand: Optional[bool] = N
 
         df = pyrange_apply_single(_tiles, self, **kwargs)
 
-        return PyRanges(df)
+        return pr.from_dfs(df)
 
     def to_example(self, n: int = 10) -> Dict[str, List[Union[int, str]]]:
         """Return as dict.
@@ -5237,7 +4958,7 @@ def three_end(self) -> "PyRanges":
 
         assert self.stranded, "Need stranded pyrange to find 3'."
         kwargs = fill_kwargs({"strand": True})
-        return PyRanges(pyrange_apply_single(_tes, self, **kwargs))
+        return pr.from_dfs(pyrange_apply_single(_tes, self, **kwargs))
 
     #     def to_bam(self, path=None, header=None, chromosome_sizes=None, chain=False):
 
@@ -5930,9 +5651,11 @@ def unstrand(self) -> "PyRanges":
 
         gr = pr.concat([self["+"], self["-"]])
 
-        gr = gr.apply(lambda df: df.drop("Strand", axis=1).reset_index(drop=True))
+        dfs = []
+        for _, df in gr.dfs.items():
+            dfs.append(df.drop("Strand", axis=1).reset_index(drop=True))
 
-        return pr.PyRanges(gr.dfs)
+        return pr.PyRanges(pd.concat(dfs).reset_index(drop=True))
 
     def values(self) -> List[pd.DataFrame]:
         """Return the underlying pd.DataFrames."""
@@ -6049,9 +5772,9 @@ def window(self, window_size: int, strand: Optional[bool] = None) -> "PyRanges":
             "window_size": window_size,
         }
 
-        df = pyrange_apply_single(_windows, self, **kwargs)
+        dfs = pyrange_apply_single(_windows, self, **kwargs)
 
-        return PyRanges(df)
+        return pr.from_dfs(dfs)
 
     def __getstate__(self):
         return self.dfs
@@ -6070,11 +5793,23 @@ def _zip_locationkey_and_data(keys: Iterable, dfs: Iterable[pd.DataFrame], stran
         if strand:
             for k in keys:
                 assert isinstance(k, tuple)
-            return pr.PyRanges(dict(zip(keys, dfs)))
+            return pr.from_dfs(dict(zip(keys, dfs)))
         else:
             for k in keys:
                 assert isinstance(k, str)
-            return pr.PyRanges(dict(zip(keys, dfs)))
+            return pr.from_dfs(dict(zip(keys, dfs)))
+
+    @property
+    def _dfs_without_strand(self) -> Dict[str, pd.DataFrame]:
+        """Return a dictionary of unstranded dataframes."""
+        assert not self.stranded, "PyRanges object is stranded"
+        return {k: v for k, v in self.dfs.items() if isinstance(k, str)}
+
+    @property
+    def _dfs_with_strand(self) -> Dict[Tuple[str, str], pd.DataFrame]:
+        """Return a dictionary of stranded dataframes."""
+        assert self.stranded, "PyRanges object is not stranded"
+        return {k: v for k, v in self.dfs.items() if isinstance(k, tuple)}
 
 
 def _test():
diff --git a/tests/property_based/hypothesis_helper.py b/tests/property_based/hypothesis_helper.py
index 575e3c1d..904e292a 100644
--- a/tests/property_based/hypothesis_helper.py
+++ b/tests/property_based/hypothesis_helper.py
@@ -1,5 +1,3 @@
-from os import environ
-
 import hypothesis.strategies as st
 import numpy as np
 import pandas as pd
@@ -108,7 +106,7 @@ def dfs_min2(draw):  # nosec
     # if not strand:
     #     df = df.drop("Strand", axis=1)
 
-    gr = PyRanges(df, int64=True)
+    gr = PyRanges(df)
     # gr = PyRanges(df)
 
     # do not sort like this, use pyranges sort
@@ -134,7 +132,7 @@ def dfs_min(draw):  # nosec
     # if not strand:
     #     df = df.drop("Strand", axis=1)
 
-    gr = PyRanges(df, int64=True)
+    gr = PyRanges(df)
     # print(gr)
     # raise
     # gr = PyRanges(df)
@@ -159,7 +157,7 @@ def dfs_no_min(draw):  # nosec
     # if not strand:
     #     df = df.drop("Strand", axis=1)
 
-    gr = PyRanges(df, int64=True)
+    gr = PyRanges(df)
     # gr = PyRanges(df)
 
     # do not sort like this, use pyranges sort
diff --git a/tests/unit/df_dict_mismatch/test_mismatch.py b/tests/unit/df_dict_mismatch/test_mismatch.py
index a0ba0506..9e662389 100644
--- a/tests/unit/df_dict_mismatch/test_mismatch.py
+++ b/tests/unit/df_dict_mismatch/test_mismatch.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pytest
 
 import pyranges as pr
 
@@ -7,9 +8,8 @@ def test_unstranded_but_has_chrom_key():
     df = pd.DataFrame({"Chromosome": "chr1", "Start": 5, "End": 10}, index=[0])
     dfs = {("chr1", "+"): df}
 
-    gr = pr.PyRanges(dfs)
-
-    assert not gr.stranded
+    with pytest.raises(ValueError, match=r"All keys must be the same, but df has chr1 and dict had .*"):
+        pr.from_dfs(dfs)
 
 
 def test_has_bad_strand_and_strand_key():
@@ -17,7 +17,7 @@ def test_has_bad_strand_and_strand_key():
 
     dfs = {("chr1", "+"): df}
 
-    gr = pr.PyRanges(dfs)
+    gr = pr.from_dfs(dfs)
 
     assert not gr.stranded
 
@@ -25,10 +25,7 @@ def test_has_bad_strand_and_strand_key():
 def test_has_strand_but_is_not_stranded():
     df = pd.DataFrame({"Chromosome": "chr1", "Start": 5, "End": 10, "Strand": "+"}, index=[0])
 
-    dfs = {("chr1"): df}
-
-    gr = pr.PyRanges(dfs)
-
-    print(gr.dfs)
+    dfs = {"chr1": df}
 
-    assert gr.stranded
+    with pytest.raises(ValueError, match=r"All keys must be the same, but df has .* and dict had .*"):
+        pr.from_dfs(dfs)
diff --git a/tests/unit/join/test_join.py b/tests/unit/join/test_join.py
index 78800b14..f4e7c00f 100644
--- a/tests/unit/join/test_join.py
+++ b/tests/unit/join/test_join.py
@@ -5,8 +5,8 @@
 
 # with slack
 def test_join_with_slack():
-    gr1 = pr.PyRanges(chromosomes="chr1", starts=[0], ends=[10], strands="+")
-    gr2 = pr.PyRanges(chromosomes="chr1", starts=[15], ends=[20], strands="+")
+    gr1 = pr.from_args(chromosomes="chr1", starts=[0], ends=[10], strands="+")
+    gr2 = pr.from_args(chromosomes="chr1", starts=[15], ends=[20], strands="+")
 
     result = gr1.join(gr2, slack=10)
     df = result.df
diff --git a/tests/unit/slack/test_slack.py b/tests/unit/slack/test_slack.py
index c3093d6c..fc1c6862 100644
--- a/tests/unit/slack/test_slack.py
+++ b/tests/unit/slack/test_slack.py
@@ -3,6 +3,6 @@
 
 # 3' and 5'
 def test_slack():
-    gr = pr.PyRanges(chromosomes="chr1", starts=[15, 300], ends=[20, 305], strands="+ -".split())
+    gr = pr.from_args(chromosomes="chr1", starts=[15, 300], ends=[20, 305], strands="+ -".split())
     print(gr)
     gr = gr.slack({"5": 10, "3": 5})
diff --git a/tests/unit/test_count_overlaps.py b/tests/unit/test_count_overlaps.py
index f013032d..b83f5872 100644
--- a/tests/unit/test_count_overlaps.py
+++ b/tests/unit/test_count_overlaps.py
@@ -19,7 +19,7 @@
 grs = {n: pr.from_string(s) for n, s in zip(["a", "b", "c"], [a, b, c])}
 unstranded_grs = {n: gr.unstrand() for n, gr in grs.items()}
 
-features = pr.PyRanges(
+features = pr.from_args(
     chromosomes=["chr1"] * 4,
     starts=[0, 10, 20, 30],
     ends=[10, 20, 30, 40],
diff --git a/tests/unit/test_genomicfeatures.py b/tests/unit/test_genomicfeatures.py
index 9b1cbd99..be6a9818 100644
--- a/tests/unit/test_genomicfeatures.py
+++ b/tests/unit/test_genomicfeatures.py
@@ -15,11 +15,6 @@ def compute_introns_single(df, by):
     x.Strand = "-"
     x = x.df
 
-    print("g " * 100)
-    print(g)
-    print("x " * 100)
-    print(x)
-
     if g.empty or x.empty:
         return pd.DataFrame()
 
@@ -74,10 +69,6 @@ def _introns_correct(full, genes, exons, introns, by):
         expected = expected_results[gene_id]
         exons = pr.PyRanges(based_on[gene_id]).subset(lambda df: df.Feature == "exon").merge(by=id_column)
         genes = pr.PyRanges(based_on[gene_id]).subset(lambda df: df.Feature == by)
-        print("exons", exons)
-        print("based_on", based_on[gene_id])
-        print("actual", idf["Chromosome Start End Strand".split()])
-        print("expected", expected["Chromosome Start End Strand".split()])
         _introns = pr.PyRanges(idf)
         assert len(exons.intersect(_introns)) == 0
         assert len(genes.intersect(_introns)) == len(_introns)
@@ -96,17 +87,10 @@ def test_introns_single():
     exons.Feature = "exon"
     exons = exons.df
     df = pd.concat([gr[gr.Feature == "gene"].df, exons], sort=False)
-    print(df)
 
     for gid, gdf in df.groupby("gene_id"):
-        print("-------" * 20)
-        print(gid)
-        print(gdf)
-        print("gdf", len(gdf))
         expected = compute_introns_single(gdf, by="gene")
-        print("expected", len(expected))
         actual = pr.PyRanges(gdf).features.introns().df
-        print("actual", len(actual))
         if actual.empty:
             assert expected.empty
             continue

From a0934b8cb24abe8fe1957d546207ae7e9e726587 Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Fri, 26 May 2023 12:08:11 +0200
Subject: [PATCH 07/10] Add types to readers

---
 pyranges/data.py            |  33 +++---
 pyranges/genomicfeatures.py |  98 ++++++++----------
 pyranges/get_fasta.py       |  28 ++++-
 pyranges/helpers.py         |   8 +-
 pyranges/multioverlap.py    |  14 +--
 pyranges/pyranges_main.py   |  12 +--
 pyranges/readers.py         | 199 ++++++++++++++++--------------------
 tests/unit/test_io.py       |   8 ++
 8 files changed, 193 insertions(+), 207 deletions(-)

diff --git a/pyranges/data.py b/pyranges/data.py
index a20d946a..90df509f 100644
--- a/pyranges/data.py
+++ b/pyranges/data.py
@@ -20,6 +20,7 @@
 Stranded PyRanges object has 3 rows and 6 columns from 1 chromosomes.
 For printing, the PyRanges was sorted on Chromosome and Strand.
 """
+from pathlib import Path
 
 import pandas as pd
 import pkg_resources
@@ -43,17 +44,17 @@
 ]
 
 
-def get_example_path(basename):
+def get_example_path(basename) -> Path:
     full_path = pkg_resources.resource_filename("pyranges", "example_data/{}".format(basename))
 
     if full_path.endswith(".bam"):
         # hack to load index too
         pkg_resources.resource_filename("pyranges", "example_data/{}.bai".format(basename))
 
-    return full_path
+    return Path(full_path)
 
 
-def aorta():
+def aorta() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+-----------+--------------+
     >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
@@ -78,7 +79,7 @@ def aorta():
     return pr.read_bed(full_path)
 
 
-def aorta2():
+def aorta2() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+-----------+--------------+
     >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
@@ -103,13 +104,13 @@ def aorta2():
     return pr.read_bed(full_path)
 
 
-def bw():
+def bw() -> "pr.PyRanges":
     full_path = get_example_path("bw.bw")
 
     return pr.read_bigwig(full_path)
 
 
-def chipseq():
+def chipseq() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+-----------+--------------+
     >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
@@ -134,7 +135,7 @@ def chipseq():
     return pr.read_bed(full_path)
 
 
-def chipseq_background():
+def chipseq_background() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+-----------+--------------+
     >>> # | Chromosome   | Start     | End       | Name       | Score     | Strand       |
@@ -159,7 +160,7 @@ def chipseq_background():
     return pr.read_bed(full_path)
 
 
-def chromsizes():
+def chromsizes() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+
     >>> # | Chromosome   | Start     | End       |
@@ -184,7 +185,7 @@ def chromsizes():
     return pr.read_bed(full_path)
 
 
-def control_bam():
+def control_bam() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+--------------+------------+
     >>> # | Chromosome   | Start     | End       | Strand       | Flag       |
@@ -209,7 +210,7 @@ def control_bam():
     return pr.read_bam(full_path)
 
 
-def cpg():
+def cpg() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+-----------+
     >>> # | Chromosome   | Start     | End       | CpG       |
@@ -236,7 +237,7 @@ def cpg():
     return pr.PyRanges(df)
 
 
-def ensembl_gtf():
+def ensembl_gtf() -> "pr.PyRanges":
     """
     >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
     >>> # | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_biotype                       | +19   |
@@ -262,7 +263,7 @@ def ensembl_gtf():
     return pr.read_gtf(full_path)
 
 
-def exons():
+def exons() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+----------------------------------------+-----------+--------------+
     >>> # | Chromosome   | Start     | End       | Name                                   | Score     | Strand       |
@@ -287,7 +288,7 @@ def exons():
     return pr.read_bed(full_path)
 
 
-def f1():
+def f1() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+-----------+--------------+
     >>> # | Chromosome   |     Start |       End | Name       |     Score | Strand       |
@@ -306,7 +307,7 @@ def f1():
     return pr.read_bed(full_path)
 
 
-def f2():
+def f2() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+-----------+--------------+
     >>> # | Chromosome   |     Start |       End | Name       |     Score | Strand       |
@@ -324,7 +325,7 @@ def f2():
     return pr.read_bed(full_path)
 
 
-def gencode_gtf():
+def gencode_gtf() -> "pr.PyRanges":
     """
     >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------+
     >>> # | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_id           | +15   |
@@ -350,7 +351,7 @@ def gencode_gtf():
     return pr.read_gtf(full_path)
 
 
-def ucsc_bed():
+def ucsc_bed() -> "pr.PyRanges":
     """
     >>> # +--------------+-----------+-----------+------------+------------+-----------------+--------------+---------------+-------------------+
     >>> # | Chromosome   | Start     | End       | Feature    | gene_id    | transcript_id   | Strand       | exon_number   | transcript_name   |
diff --git a/pyranges/genomicfeatures.py b/pyranges/genomicfeatures.py
index 26145a1b..bb8d909a 100644
--- a/pyranges/genomicfeatures.py
+++ b/pyranges/genomicfeatures.py
@@ -4,6 +4,9 @@
 
 import pyranges as pr
 from pyranges.multithreaded import pyrange_apply
+from pandas.core.frame import DataFrame
+from pyranges.pyranges_main import PyRanges
+from typing import Dict, Optional
 
 __all__ = ["genome_bounds", "tile_genome", "GenomicFeaturesMethods"]
 
@@ -14,12 +17,10 @@ class GenomicFeaturesMethods:
 
     Accessed through `gr.features`."""
 
-    pr = None
-
-    def __init__(self, pr):
+    def __init__(self, pr: PyRanges) -> None:
         self.pr = pr
 
-    def tss(self):
+    def tss(self) -> PyRanges:
         """Return the transcription start sites.
 
         Returns the 5' for every interval with feature "transcript".
@@ -83,7 +84,7 @@ def tss(self):
 
         return pr
 
-    def tes(self, slack=0):
+    def tes(self) -> PyRanges:
         """Return the transcription end sites.
 
         Returns the 3' for every interval with feature "transcript".
@@ -147,7 +148,7 @@ def tes(self, slack=0):
 
         return pr
 
-    def introns(self, by="gene", nb_cpu=1):
+    def introns(self, by: str = "gene") -> PyRanges:
         """Return the introns.
 
         Parameters
@@ -155,11 +156,6 @@ def introns(self, by="gene", nb_cpu=1):
         by : str, {"gene", "transcript"}, default "gene"
             Whether to find introns per gene or transcript.
 
-        nb_cpu: int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
         See Also
         --------
         pyranges.genomicfeatures.GenomicFeaturesMethods.tss : return the transcription start sites
@@ -223,7 +219,7 @@ def introns(self, by="gene", nb_cpu=1):
         For printing, the PyRanges was sorted on Chromosome and Strand.
         """
 
-        kwargs = {"by": by, "nb_cpu": nb_cpu}
+        kwargs = {"by": by}
         kwargs = pr.pyranges_main.fill_kwargs(kwargs)
 
         assert by in ["gene", "transcript"]
@@ -244,22 +240,26 @@ def introns(self, by="gene", nb_cpu=1):
         return pr.from_dfs(result)
 
 
-def _outside_bounds(df, **kwargs):
+def _outside_bounds(df: DataFrame, **kwargs) -> DataFrame:
     df = df.copy()
 
-    chromsizes = kwargs.get("chromsizes")
+    _chromsizes = kwargs.get("chromsizes")
 
-    if not isinstance(chromsizes, dict):
-        size_df = chromsizes.df
+    if isinstance(_chromsizes, PyRanges):
+        size_df = _chromsizes.df
+        if not size_df.Chromosome.is_unique:
+            raise ValueError("Chromosomes must be unique in chromsizes.")
         chromsizes = {k: v for k, v in zip(size_df.Chromosome, size_df.End)}
+    else:
+        assert isinstance(_chromsizes, dict)
+        chromsizes = _chromsizes
 
     size = int(chromsizes[df.Chromosome.iloc[0]])
     clip = kwargs.get("clip", False)
     only_right = kwargs.get("only_right", False)
 
     ends_outright = df.End > size
-    if not only_right:
-        starts_outleft = df.Start < 0
+    starts_outleft = df.Start < 0
 
     if not clip:  # i.e. remove
         if only_right:
@@ -288,7 +288,7 @@ def _outside_bounds(df, **kwargs):
     return df
 
 
-def genome_bounds(gr, chromsizes, clip=False, only_right=False):
+def genome_bounds(gr: PyRanges, chromsizes: Dict[str, int], clip: bool = False, only_right: bool = False) -> PyRanges:
     """Remove or clip intervals outside of genome bounds.
 
     Parameters
@@ -391,18 +391,15 @@ def genome_bounds(gr, chromsizes, clip=False, only_right=False):
     return gr.apply(_outside_bounds, chromsizes=chromsizes, clip=clip, only_right=only_right)
 
 
-def _last_tile(df, **kwargs):
-    # do not need copy, since it is only used internally by
-    # tile_genome
-    # df = df.copy()
-    sizes = kwargs.get("sizes")
+def _last_tile(df: DataFrame, sizes: pd.DataFrame, **kwargs) -> DataFrame:
+    # do not need copy, since it is only used internally by tile_genome
     size = sizes[df.Chromosome.iloc[0]].End.iloc[0]
     df.loc[df.tail(1).index, "End"] = size
 
     return df
 
 
-def tile_genome(genome, tile_size, tile_last=False):
+def tile_genome(chromsizes: PyRanges, tile_size: int, tile_last: bool = False) -> PyRanges:
     """Create a tiled genome.
 
     Parameters
@@ -464,20 +461,20 @@ def tile_genome(genome, tile_size, tile_last=False):
     For printing, the PyRanges was sorted on Chromosome.
     """
 
-    if isinstance(genome, dict):
-        chromosomes, ends = list(genome.keys()), list(genome.values())
+    if isinstance(chromsizes, dict):
+        chromosomes, ends = list(chromsizes.keys()), list(chromsizes.values())
         df = pd.DataFrame({"Chromosome": chromosomes, "Start": 0, "End": ends})
-        genome = pr.PyRanges(df)
+        chromsizes = pr.PyRanges(df)
 
-    gr = genome.tile(tile_size)
+    gr = chromsizes.tile(tile_size)
 
     if not tile_last:
-        gr = gr.apply(_last_tile, sizes=genome)
+        gr = gr.apply(_last_tile, sizes=chromsizes)
 
     return gr
 
 
-def _keep_transcript_with_most_exons(df):
+def _keep_transcript_with_most_exons(df: pd.DataFrame) -> DataFrame:
     transcripts_with_most_exons = []
 
     for _, gdf in df.groupby("gene_id"):
@@ -491,13 +488,11 @@ def _keep_transcript_with_most_exons(df):
     return pd.concat(transcripts_with_most_exons).reset_index(drop=True)
 
 
-def filter_transcripts(df, keep="most_exons"):
+def filter_transcripts(df: pd.DataFrame) -> DataFrame:
     return _keep_transcript_with_most_exons(df)
 
 
-def _tss(df, slack=0):
-    intype = df.Start.dtype
-
+def _tss(df: DataFrame, slack: int = 0) -> DataFrame:
     tss_pos = df.loc[df.Strand == "+"]
 
     tss_neg = df.loc[df.Strand == "-"].copy()
@@ -512,17 +507,12 @@ def _tss(df, slack=0):
     tss.Start = tss.Start - slack
     tss.loc[tss.Start < 0, "Start"] = 0
 
-    tss.index = range(len(tss))
-
-    tss[["Start", "End"]] = tss[["Start", "End"]].astype(intype)
+    tss.index = pd.Index(range(len(tss)))
 
     return tss
 
 
-def _tes(df, slack=0):
-    intype = df.Start.dtype
-    # df = self.df
-
+def _tes(df: DataFrame, slack: int = 0) -> DataFrame:
     tes_pos = df.loc[df.Strand == "+"]
 
     tes_neg = df.loc[df.Strand == "-"].copy()
@@ -537,9 +527,7 @@ def _tes(df, slack=0):
     tes.Start = tes.Start - slack
     tes.loc[tes.Start < 0, "Start"] = 0
 
-    tes.index = range(len(tes))
-
-    tes[["Start", "End"]] = tes[["Start", "End"]].astype(intype)
+    tes.index = pd.Index(range(len(tes)))
 
     return tes
 
@@ -547,11 +535,11 @@ def _tes(df, slack=0):
 by_to_id = {"gene": "gene_id", "transcript": "transcript_id"}
 
 
-def _introns2(df, exons, **kwargs):
+def _introns2(df: DataFrame, exons: DataFrame, **kwargs) -> DataFrame:
     """TODO: refactor"""
 
     if df.empty or exons.empty:
-        return None
+        return pd.DataFrame(columns=df.columns)
 
     original_order = df.columns
     by = kwargs["by"]
@@ -559,12 +547,12 @@ def _introns2(df, exons, **kwargs):
 
     exons = exons[["Start", "End", id_column]]
     genes = df[["Start", "End", id_column]]
-    exons.columns = ["Start", "End", "by_id"]
-    genes.columns = ["Start", "End", "by_id"]
+    exons.columns = pd.Index(["Start", "End", "by_id"])
+    genes.columns = pd.Index(["Start", "End", "by_id"])
 
     intersection = pd.Series(np.intersect1d(exons["by_id"], genes["by_id"]))
     if len(intersection) == 0:
-        return None
+        return pd.DataFrame(columns=df.columns)
 
     exons = exons[exons["by_id"].isin(intersection)].reset_index(drop=True).sort_values(["by_id", "Start"])
     genes = genes[genes["by_id"].isin(intersection)].reset_index(drop=True).sort_values(["by_id", "Start"])
@@ -604,13 +592,17 @@ def _introns2(df, exons, **kwargs):
     )
 
     vc = introns["by_id"].value_counts(sort=False).to_frame().reset_index()
-    vc.columns = ["by_id", "counts"]
+    vc.columns = pd.Index(["by_id", "counts"])
 
-    genes_without_introns = pd.DataFrame(data={"by_id": np.setdiff1d(by_ids.values, vc.by_id.values), "counts": 0})
+    genes_without_introns = pd.DataFrame(data={"by_id": np.setdiff1d(
+        np.array(by_ids.values),
+        np.array(vc.by_id.values)),
+        "counts": 0}
+    )
 
     vc = pd.concat([vc, genes_without_introns]).sort_values("by_id")
 
-    original_ids = np.repeat(vc.by_id, vc.counts).to_frame()
+    original_ids = pd.Series(np.repeat(vc.by_id, vc.counts)).to_frame()
     original_ids = original_ids.merge(
         df[["__temp__", id_column]],
         right_on="__temp__",
diff --git a/pyranges/get_fasta.py b/pyranges/get_fasta.py
index eee4be0b..e1cfa7e9 100644
--- a/pyranges/get_fasta.py
+++ b/pyranges/get_fasta.py
@@ -1,11 +1,19 @@
 import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
 
 import pandas as pd
+from pandas.core.frame import DataFrame
+from pandas.core.series import Series
 
 import pyranges as pr  # noqa: F401
+from pyranges.pyranges_main import PyRanges
 
+if TYPE_CHECKING:
+    import pyfaidx  # type: ignore
 
-def get_sequence(gr, path=None, pyfaidx_fasta=None):
+
+def get_sequence(gr: PyRanges, path: Optional[Path] = None, pyfaidx_fasta: Optional["pyfaidx.Fasta"] = None) -> Series:
     """Get the sequence of the intervals from a fasta file
 
     Parameters
@@ -14,7 +22,7 @@ def get_sequence(gr, path=None, pyfaidx_fasta=None):
 
         Coordinates.
 
-    path : str
+    path : Path
 
         Path to fasta file. It will be indexed using pyfaidx if an index is not found
 
@@ -128,7 +136,9 @@ def get_fasta(*args, **kwargs):
     return get_sequence(*args, **kwargs)
 
 
-def get_transcript_sequence(gr, group_by, path=None, pyfaidx_fasta=None):
+def get_transcript_sequence(
+    gr: PyRanges, group_by: str, path: Optional[Path] = None, pyfaidx_fasta: Optional["pyfaidx.Fasta"] = None
+) -> DataFrame:
     """Get the sequence of mRNAs, e.g. joining intervals corresponding to exons of the same transcript
 
     Parameters
@@ -141,7 +151,7 @@ def get_transcript_sequence(gr, group_by, path=None, pyfaidx_fasta=None):
 
         intervals are grouped by this/these ID column(s): these are exons belonging to same transcript
 
-    path : str
+    path : Optional Path
 
         Path to fasta file. It will be indexed using pyfaidx if an index is not found
 
@@ -223,3 +233,13 @@ def get_transcript_sequence(gr, group_by, path=None, pyfaidx_fasta=None):
     z["Sequence"] = get_sequence(gr, path=path, pyfaidx_fasta=pyfaidx_fasta)
 
     return z.groupby(group_by, as_index=False).agg({"Sequence": "".join})
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/pyranges/helpers.py b/pyranges/helpers.py
index cb4f099c..232e5072 100644
--- a/pyranges/helpers.py
+++ b/pyranges/helpers.py
@@ -1,9 +1,9 @@
-from typing import Tuple, Union
+from typing import Tuple, Union, List
 
 import pandas as pd
 
 
-def get_chromosomes_from_dict(dfs):
+def get_chromosomes_from_dict(dfs) -> List[str]:
     keys = list(dfs.keys())
     if isinstance(keys[0], tuple):
         chromosomes = [k[0] for k in keys]
@@ -13,7 +13,7 @@ def get_chromosomes_from_dict(dfs):
     return chromosomes
 
 
-def get_strands_from_dict(dfs):
+def get_strands_from_dict(dfs) -> Union[List[str], List[Tuple[str, str]]]:
     keys = list(dfs.keys())
     if isinstance(keys[0], tuple):
         strands = [k[1] for k in keys]
@@ -32,7 +32,7 @@ def get_key_from_df(df: pd.DataFrame) -> Union[str, Tuple[str, str]]:
     return chromosome
 
 
-def single_value_key(df):
+def single_value_key(df: pd.DataFrame) -> bool:
     if "Strand" in df:
         return len(df[["Chromosome", "Strand"]].drop_duplicates(["Chromosome", "Strand"])) == 1
     else:
diff --git a/pyranges/multioverlap.py b/pyranges/multioverlap.py
index 0dc5fa8d..d331525b 100644
--- a/pyranges/multioverlap.py
+++ b/pyranges/multioverlap.py
@@ -1,9 +1,11 @@
 import numpy as np
 
 import pyranges as pr
+from pyranges.pyranges_main import PyRanges
+from typing import Dict, Optional
 
 
-def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
+def count_overlaps(grs: Dict[str, PyRanges], features: Optional[PyRanges] = None, strandedness: Optional[str] = None, how: Optional[str] = None) -> PyRanges:
     """Count overlaps in multiple pyranges.
 
     Parameters
@@ -27,11 +29,6 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
         What intervals to report. By default reports all overlapping intervals. "containment"
         reports intervals where the overlapping is contained within it.
 
-    nb_cpu : int, default 1
-
-        How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-        Will only lead to speedups on large datasets.
-
     Examples
     --------
 
@@ -136,10 +133,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
 
     kwargs = {
         "as_pyranges": False,
-        "nb_cpu": nb_cpu,
-        "strandedness": strandedness,
         "how": how,
-        "nb_cpu": nb_cpu,
     }
     names = list(grs.keys())
 
@@ -154,7 +148,7 @@ def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
         gr = gr.drop()
 
         kwargs["name"] = name
-        features.apply_pair(gr, _count_overlaps, **kwargs)  # count overlaps modifies the ranges in-place
+        features.apply_pair(gr, _count_overlaps, strandedness, **kwargs)  # count overlaps modifies the ranges in-place
 
     def to_int(df):
         df[names] = df[names].astype(np.int64)
diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py
index 2b3b4590..5cd8e97f 100644
--- a/pyranges/pyranges_main.py
+++ b/pyranges/pyranges_main.py
@@ -609,7 +609,7 @@ def apply_general(
 
         return pyrange_apply_single(f, self, **kwargs)
 
-    def apply_pair(self, other: "PyRanges", f: Callable, strandedness: None = None, **kwargs) -> "PyRanges":
+    def apply_pair(self, other: "PyRanges", f: Callable, strandedness: Optional[str] = None, **kwargs) -> "PyRanges":
         """Apply a function to a pair of PyRanges.
 
         The function is applied to each chromosome or chromosome/strand pair found in at least one
@@ -633,11 +633,6 @@ def apply_pair(self, other: "PyRanges", f: Callable, strandedness: None = None,
             Whether to return as a PyRanges or dict. If `f` does not return a pd.DataFrame valid for
             PyRanges, `as_pyranges` must be False.
 
-        nb_cpu: int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
         **kwargs
             Additional keyword arguments to pass as keyword arguments to `f`
 
@@ -5214,10 +5209,9 @@ def to_bigwig(
 
         from pyranges.out import _to_bigwig
 
-        if chromosome_sizes is None:
-            chromosome_sizes = pr.data.chromsizes()
+        _chromosome_sizes = pr.data.chromsizes() if chromosome_sizes is None else chromosome_sizes
 
-        result = _to_bigwig(self, path, chromosome_sizes, rpm, divide, value_col, dryrun)
+        result = _to_bigwig(self, path, _chromosome_sizes, rpm, divide, value_col, dryrun)
 
         if dryrun:
             return result
diff --git a/pyranges/readers.py b/pyranges/readers.py
index cbf3cbb7..cc3207bc 100644
--- a/pyranges/readers.py
+++ b/pyranges/readers.py
@@ -1,6 +1,8 @@
 from __future__ import print_function
 
 import sys
+from pathlib import Path
+from typing import Union, Optional, List
 
 import pandas as pd
 from natsort import natsorted  # type: ignore
@@ -9,7 +11,7 @@
 from pyranges.pyranges_main import PyRanges
 
 
-def read_bed(f, as_df=False, nrows=None):
+def read_bed(f: Union[str, Path], /, nrows: Optional[int] = None) -> pr.PyRanges:
     """Return bed file as PyRanges.
 
     This is a reader for files that follow the bed format. They can have from
@@ -24,11 +26,7 @@ def read_bed(f, as_df=False, nrows=None):
 
         Path to bed file
 
-    as_df : bool, default False
-
-        Whether to return as pandas DataFrame instead of PyRanges.
-
-    nrows : int, default None
+    nrows : Optional int, default None
 
         Number of rows to return.
 
@@ -55,27 +53,18 @@ def read_bed(f, as_df=False, nrows=None):
     +--------------+-----------+-----------+------------+-----------+--------------+
     Stranded PyRanges object has 5 rows and 6 columns from 1 chromosomes.
     For printing, the PyRanges was sorted on Chromosome and Strand.
-
-    >>> pr.read_bed(path, as_df=True, nrows=5)
-      Chromosome  Start    End      Name  Score Strand
-    0       chr1   9916  10115  H3K27me3      5      -
-    1       chr1   9939  10138  H3K27me3      7      +
-    2       chr1   9951  10150  H3K27me3      8      -
-    3       chr1   9953  10152  H3K27me3      5      +
-    4       chr1   9978  10177  H3K27me3      7      -
-
     """
 
     columns = (
         "Chromosome Start End Name Score Strand ThickStart ThickEnd ItemRGB BlockCount BlockSizes BlockStarts".split()
     )
-
-    if f.endswith(".gz"):
+    path = Path(f)
+    if path.name.endswith(".gz"):
         import gzip
 
-        first_start = gzip.open(f).readline().split()[1]
+        first_start = gzip.open(path).readline().decode().split()[1]
     else:
-        first_start = open(f).readline().split()[1]
+        first_start = open(path).readline().split()[1]
 
     header = None
 
@@ -85,22 +74,19 @@ def read_bed(f, as_df=False, nrows=None):
         header = 0
 
     df = pd.read_csv(
-        f,
-        dtype={"Chromosome": "category", "Strand": "category"},
+        path,
+        dtype={"Chromosome": "category", "Strand": "category"},  # type: ignore
         nrows=nrows,
         header=header,
         sep="\t",
     )
 
-    df.columns = columns[: df.shape[1]]
+    df.columns = pd.Index(columns[: df.shape[1]])
 
-    if not as_df:
-        return PyRanges(df)
-    else:
-        return df
+    return PyRanges(df)
 
 
-def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1540):
+def read_bam(f: Union[str, Path], /, sparse=True, mapq=0, required_flag=0, filter_flag=1540) -> pr.PyRanges:
     """Return bam file as PyRanges.
 
     Parameters
@@ -113,10 +99,6 @@ def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1
 
         Whether to return only.
 
-    as_df : bool, default False
-
-        Whether to return as pandas DataFrame instead of PyRanges.
-
     mapq : int, default 0
 
         Minimum mapping quality score.
@@ -159,7 +141,7 @@ def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1
     Stranded PyRanges object has 10,000 rows and 5 columns from 25 chromosomes.
     For printing, the PyRanges was sorted on Chromosome and Strand.
     """
-
+    path = Path(f)
     try:
         import bamread  # type: ignore
     except ImportError:
@@ -185,22 +167,17 @@ def read_bam(f, sparse=True, as_df=False, mapq=0, required_flag=0, filter_flag=1
         sys.exit(1)
 
     if sparse:
-        df = bamread.read_bam(f, mapq, required_flag, filter_flag)
+        df = bamread.read_bam(path, mapq, required_flag, filter_flag)
     else:
         try:
-            df = bamread.read_bam_full(f, mapq, required_flag, filter_flag)
+            df = bamread.read_bam_full(path, mapq, required_flag, filter_flag)
         except AttributeError:
             print("bamread version 0.0.6 or higher is required to read bam non-sparsely.")
 
-    if as_df:
-        return df
-    else:
-        return PyRanges(df)
-
-    # return bamread.read_bam(f, mapq, required_flag, filter_flag)
+    return PyRanges(df)
 
 
-def _fetch_gene_transcript_exon_id(attribute, annotation=None):
+def _fetch_gene_transcript_exon_id(attribute: pd.Series, annotation: Optional[str] = None) -> pd.DataFrame:
     no_quotes = attribute.str.replace('"', "").str.replace("'", "")
 
     df = no_quotes.str.extract(
@@ -208,48 +185,48 @@ def _fetch_gene_transcript_exon_id(attribute, annotation=None):
         expand=True,
     )  # .iloc[:, [1, 2, 3]]
 
-    df.columns = "gene_id transcript_id exon_number exon_id".split()
+    df.columns = pd.Index("gene_id transcript_id exon_number exon_id".split())
 
     if annotation == "ensembl":
-        newdf = []
+        newdfs = []
         for c in "gene_id transcript_id exon_id".split():
             r = df[c].astype(str).str.extract(r"(\d+)").astype(float)
-            newdf.append(r)
+            newdfs.append(r)
 
-        newdf = pd.concat(newdf, axis=1)
+        newdf = pd.concat(newdfs, axis=1)
         newdf.insert(2, "exon_number", df["exon_number"])
         df = newdf
 
     return df
 
 
-def skiprows(f):
+def skiprows(f: Path) -> int:
     try:
         import gzip
 
-        fh = gzip.open(f)
-        for i, l in enumerate(fh):
-            if l.decode()[0] != "#":
+        zh = gzip.open(f)
+        for i, zl in enumerate(zh):
+            if zl.decode()[0] != "#":
                 break
+        zh.close()
     except (OSError, TypeError):  # not a gzipped file, or StringIO
         fh = open(f)
         for i, l in enumerate(fh):
             if l[0] != "#":
                 break
-
-    fh.close()
+        fh.close()
 
     return i
 
 
 def read_gtf(
-    f,
+    f: Union[str, Path],
+    /,
     full=True,
-    as_df=False,
     nrows=None,
     duplicate_attr=False,
     ignore_bad: bool = False,
-):
+) -> pr.PyRanges:
     """Read files in the Gene Transfer Format.
 
     Parameters
@@ -262,10 +239,6 @@ def read_gtf(
 
         Whether to read and interpret the annotation column.
 
-    as_df : bool, default False
-
-        Whether to return as pandas DataFrame instead of PyRanges.
-
     nrows : int, default None
 
         Number of rows to read. Default None, i.e. all.
@@ -282,8 +255,7 @@ def read_gtf(
     ----
 
     The GTF format encodes both Start and End as 1-based included.
-    PyRanges (and also the DF returned by this function, if as_df=True), instead
-    encodes intervals as 0-based, Start included and End excluded.
+    PyRanges encodes intervals as 0-based, Start included and End excluded.
 
     See Also
     --------
@@ -315,31 +287,32 @@ def read_gtf(
     >>> # 18 hidden columns: gene_name, gene_source, gene_biotype, transcript_id, transcript_version, transcript_name, transcript_source, transcript_biotype, tag, transcript_support_level, ... (+ 8 more.)
     """
 
-    _skiprows = skiprows(f)
+    path = Path(f)
+    _skiprows = skiprows(path)
 
     if full:
-        gr = read_gtf_full(f, as_df, nrows, _skiprows, duplicate_attr, ignore_bad=ignore_bad)
+        gr = read_gtf_full(path, nrows, _skiprows, duplicate_attr, ignore_bad=ignore_bad)
     else:
-        gr = read_gtf_restricted(f, _skiprows, as_df=False, nrows=None)
+        gr = read_gtf_restricted(path, _skiprows, nrows=nrows)
 
     return gr
 
 
 def read_gtf_full(
-    f,
-    as_df=False,
-    nrows=None,
-    skiprows=0,
-    duplicate_attr=False,
+    f: Union[str, Path],
+    nrows = None,
+    skiprows = 0,
+    duplicate_attr = False,
     ignore_bad: bool = False,
     chunksize: int = int(1e5),  # for unit-testing purposes
-):
+) -> pr.PyRanges:
     dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"}
 
     names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split()
+    path = Path(f)
 
     df_iter = pd.read_csv(
-        f,
+        path,
         sep="\t",
         header=None,
         names=names,
@@ -353,7 +326,7 @@ def read_gtf_full(
 
     dfs = []
     for df in df_iter:
-        extra = _to_rows(df.Attribute, ignore_bad=ignore_bad)
+        extra = _to_rows(df.Attribute.astype(str), ignore_bad=ignore_bad)
         df = df.drop("Attribute", axis=1)
         extra.set_index(df.index, inplace=True)
         ndf = pd.concat([df, extra], axis=1, sort=False)
@@ -362,30 +335,25 @@ def read_gtf_full(
     df = pd.concat(dfs, sort=False)
     df.loc[:, "Start"] = df.Start - 1
 
-    if not as_df:
-        return PyRanges(df)
-    else:
-        return df
+    return PyRanges(df)
 
 
-def parse_kv_fields(line):
+def parse_kv_fields(line: str) -> List[List[str]]:
     # rstrip: allows for GFF not having a last ";", or having final spaces
     return [kv.replace('""', '"NA"').replace('"', "").split(None, 1) for kv in line.rstrip("; ").split("; ")]
 
 
-def to_rows(anno, ignore_bad: bool = False):
-    rowdicts = []
+def to_rows(anno: pd.Series, ignore_bad: bool = False) -> pd.DataFrame:
     try:
-        line = anno.head(1)
-        for line in line:
-            line.replace('"', "").replace(";", "").split()
+        row = anno.head(1)
+        for entry in row:
+            str(entry).replace('"', "").replace(";", "").split()
     except AttributeError:
         raise Exception(
-            "Invalid attribute string: {line}. If the file is in GFF3 format, use pr.read_gff3 instead.".format(
-                line=line
-            )
+            f"Invalid attribute string: {entry}. If the file is in GFF3 format, use pr.read_gff3 instead."
         )
 
+    rowdicts = []
     try:
         for line in anno:
             rowdicts.append({k: v for k, v in parse_kv_fields(line)})
@@ -397,7 +365,7 @@ def to_rows(anno, ignore_bad: bool = False):
     return pd.DataFrame.from_records(rowdicts)
 
 
-def to_rows_keep_duplicates(anno, ignore_bad: bool = False):
+def to_rows_keep_duplicates(anno: pd.Series, ignore_bad: bool = False) -> pd.DataFrame:
     rowdicts = []
     try:
         for line in anno:
@@ -406,11 +374,9 @@ def to_rows_keep_duplicates(anno, ignore_bad: bool = False):
             # rstrip: allows for GFF not having a last ";", or having final spaces
             for k, v in tuple(parse_kv_fields(line)):
                 if k not in rowdict:
-                    rowdict[k] = v
-                elif k in rowdict and isinstance(rowdict[k], list):
-                    rowdict[k].append(v)
+                    rowdict[k] = [v]
                 else:
-                    rowdict[k] = [rowdict[k], v]
+                    rowdict[k].append(v)
 
             rowdicts.append({k: ",".join(v) if isinstance(v, list) else v for k, v in rowdict.items()})
     except ValueError:
@@ -421,7 +387,7 @@ def to_rows_keep_duplicates(anno, ignore_bad: bool = False):
     return pd.DataFrame.from_records(rowdicts)
 
 
-def read_gtf_restricted(f, skiprows, as_df=False, nrows=None):
+def read_gtf_restricted(f: Union[str, Path], skiprows: Optional[int], nrows: Optional[int] = None) -> pr.PyRanges:
     """seqname - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix. Important note: the seqname must be one used within Ensembl, i.e. a standard chromosome name or an Ensembl identifier such as a scaffold ID, without any additional content such as species or assembly. See the example GFF output below.
     # source - name of the program that generated this feature, or the data source (database or project name)
     feature - feature type name, e.g. Gene, Variation, Similarity
@@ -433,17 +399,18 @@ def read_gtf_restricted(f, skiprows, as_df=False, nrows=None):
     attribute - A semicolon-separated list of tag-value pairs, providing additional information about each feature.
     """
     dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"}
+    path = Path(f)
 
     df_iter = pd.read_csv(
-        f,
+        path,
         sep="\t",
         comment="#",
         usecols=[0, 2, 3, 4, 5, 6, 8],
         header=None,
         names="Chromosome Feature Start End Score Strand Attribute".split(),
-        dtype=dtypes,
+        dtype=dtypes,  # type: ignore
         chunksize=int(1e5),
-        skiprows=skiprows,
+        skiprows=skiprows if skiprows is not None else 0,
         nrows=nrows,
     )
 
@@ -455,7 +422,7 @@ def read_gtf_restricted(f, skiprows, as_df=False, nrows=None):
             cols_to_concat = "Chromosome Start End Strand Feature Score".split()
 
         extract = _fetch_gene_transcript_exon_id(df.Attribute)
-        extract.columns = "gene_id transcript_id exon_number exon_id".split()
+        extract.columns = pd.Index("gene_id transcript_id exon_number exon_id".split())
 
         extract.exon_number = extract.exon_number.astype(float)
 
@@ -468,13 +435,10 @@ def read_gtf_restricted(f, skiprows, as_df=False, nrows=None):
 
     df.loc[:, "Start"] = df.Start - 1
 
-    if not as_df:
-        return PyRanges(df)
-    else:
-        return df
+    return PyRanges(df)
 
 
-def to_rows_gff3(anno):
+def to_rows_gff3(anno) -> pd.DataFrame:
     rowdicts = []
 
     for line in list(anno):
@@ -485,7 +449,7 @@ def to_rows_gff3(anno):
     return pd.DataFrame.from_records(rowdicts).set_index(anno.index)
 
 
-def read_gff3(f, full=True, annotation=None, as_df=False, nrows=None):
+def read_gff3(f: Union[str, Path], full: bool = True, as_df: bool = False, nrows: Optional[int] = None) -> pr.PyRanges:
     """Read files in the General Feature Format.
 
     Parameters
@@ -519,22 +483,23 @@ def read_gff3(f, full=True, annotation=None, as_df=False, nrows=None):
     pyranges.read_gtf : read files in the Gene Transfer Format
     """
 
-    _skiprows = skiprows(f)
+    path = Path(f)
+    _skiprows = skiprows(path)
 
     if not full:
-        return read_gtf_restricted(f, _skiprows, as_df=as_df, nrows=nrows)
+        return read_gtf_restricted(path, _skiprows, nrows=nrows)
 
     dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"}
 
     names = "Chromosome Source Feature Start End Score Strand Frame Attribute".split()
 
     df_iter = pd.read_csv(
-        f,
+        path,
         comment="#",
         sep="\t",
         header=None,
         names=names,
-        dtype=dtypes,
+        dtype=dtypes,  # type: ignore
         chunksize=int(1e5),
         skiprows=_skiprows,
         nrows=nrows,
@@ -552,13 +517,10 @@ def read_gff3(f, full=True, annotation=None, as_df=False, nrows=None):
 
     df.loc[:, "Start"] = df.Start - 1
 
-    if not as_df:
-        return PyRanges(df)
-    else:
-        return df
+    return PyRanges(df)
 
 
-def read_bigwig(f, as_df=False):
+def read_bigwig(f: Union[str, Path]) -> pr.PyRanges:
     try:
         import pyBigWig  # type: ignore
     except ModuleNotFoundError:
@@ -589,7 +551,8 @@ def read_bigwig(f, as_df=False):
     >>> gr
     """
 
-    bw = pyBigWig.open(f)
+    path = Path(f)
+    bw = pyBigWig.open(str(path))
 
     size = int(1e5)
     chromosomes = bw.chroms()
@@ -626,4 +589,18 @@ def read_bigwig(f, as_df=False):
             }
         )
 
-    return pr.PyRanges(dfs)
+    return pr.from_dfs(dfs)
+
+
+
+
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/tests/unit/test_io.py b/tests/unit/test_io.py
index 4160571f..b85caf23 100644
--- a/tests/unit/test_io.py
+++ b/tests/unit/test_io.py
@@ -1,7 +1,9 @@
 import numpy as np
+import pandas as pd
 from pandas.testing import assert_frame_equal
 
 import pyranges as pr
+from pyranges.readers import to_rows_keep_duplicates
 
 ensembl_gtf = "tests/unit/test_data/ensembl.gtf"
 
@@ -57,3 +59,9 @@ def test_read_gff3():
 
 def test_read_bed():
     pr.read_bed("pyranges/example_data/chipseq.bed")
+
+
+def test_to_rows_keep_duplicates():
+    anno = pd.Series(["gene DDX11L1; gene sonic; unique hi"])
+    result = to_rows_keep_duplicates(anno)
+    assert result.to_dict(orient="index") == {0: {'gene': 'DDX11L1,sonic', "unique": "hi"}}

From 841e1fbe3dab9361c7298d62739e49440b675dcb Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Fri, 26 May 2023 13:35:58 +0200
Subject: [PATCH 08/10] Add types to out.py

---
 pyranges/__init__.py        | 192 ------------------------------------
 pyranges/genomicfeatures.py |  11 +--
 pyranges/helpers.py         |   2 +-
 pyranges/multioverlap.py    |  10 +-
 pyranges/out.py             |  95 +++++++++++-------
 pyranges/pyranges_main.py   |  33 ++-----
 pyranges/readers.py         |  26 +----
 7 files changed, 87 insertions(+), 282 deletions(-)

diff --git a/pyranges/__init__.py b/pyranges/__init__.py
index 4b89d8ac..d8db666f 100644
--- a/pyranges/__init__.py
+++ b/pyranges/__init__.py
@@ -3,7 +3,6 @@
 import itertools
 import sys
 from collections import defaultdict
-from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
 
 import numpy as np
@@ -413,196 +412,6 @@ def random(
 pyranges.statistics : statistcal methods for genomics."""
 
 
-def to_bigwig(gr: PyRanges, path: Path, chromosome_sizes=Optional[Chromsizes]):
-    """Write df to bigwig.
-
-    Must contain the columns Chromosome, Start, End and Score. All others are ignored.
-
-    Parameters
-    ----------
-    gr: PyRanges
-        Intervals to write.
-
-    path : Path
-
-        Where to write bigwig.
-
-    chromosome_sizes : PyRanges or dict
-
-        If dict: map of chromosome names to chromosome length.
-
-    Examples
-    --------
-
-    Extended example with how to prepare your data for writing bigwigs:
-
-    >>> d =  {'Chromosome': ['chr1', 'chr1', 'chr1'], 'Start': [1, 4, 6],
-    ...       'End': [7, 8, 10], 'Strand': ['+', '-', '-'],
-    ...       'Value': [10, 20, 30]}
-    >>> import pyranges as pr
-    >>> gr = pr.from_dict(d)
-    >>> hg19 = pr.data.chromsizes()
-    >>> print(hg19)
-    +--------------+-----------+-----------+
-    | Chromosome   | Start     | End       |
-    | (category)   | (int64)   | (int64)   |
-    |--------------+-----------+-----------|
-    | chr1         | 0         | 249250621 |
-    | chr2         | 0         | 243199373 |
-    | chr3         | 0         | 198022430 |
-    | chr4         | 0         | 191154276 |
-    | ...          | ...       | ...       |
-    | chr22        | 0         | 51304566  |
-    | chrM         | 0         | 16571     |
-    | chrX         | 0         | 155270560 |
-    | chrY         | 0         | 59373566  |
-    +--------------+-----------+-----------+
-    Unstranded PyRanges object has 25 rows and 3 columns from 25 chromosomes.
-    For printing, the PyRanges was sorted on Chromosome.
-
-    Overlapping intervals are invalid in bigwigs:
-
-    >>> to_bigwig(gr, "outpath.bw", hg19)
-    Traceback (most recent call last):
-    ...
-    AssertionError: Can only write one strand at a time. Use an unstranded PyRanges or subset on strand first.
-
-    >>> to_bigwig(gr["-"], "outpath.bw", hg19)
-    Traceback (most recent call last):
-    ...
-    AssertionError: Intervals must not overlap.
-
-    >>> gr
-    +--------------+-----------+-----------+--------------+-----------+
-    | Chromosome   |     Start |       End | Strand       |     Value |
-    | (category)   |   (int64) |   (int64) | (category)   |   (int64) |
-    |--------------+-----------+-----------+--------------+-----------|
-    | chr1         |         1 |         7 | +            |        10 |
-    | chr1         |         4 |         8 | -            |        20 |
-    | chr1         |         6 |        10 | -            |        30 |
-    +--------------+-----------+-----------+--------------+-----------+
-    Stranded PyRanges object has 3 rows and 5 columns from 1 chromosomes.
-    For printing, the PyRanges was sorted on Chromosome and Strand.
-
-    >>> value = gr.to_rle(rpm=False, value_col="Value")
-    >>> value
-    chr1 +
-    --
-    +--------+-----+------+
-    | Runs   | 1   | 6    |
-    |--------+-----+------|
-    | Values | 0.0 | 10.0 |
-    +--------+-----+------+
-    Rle of length 7 containing 2 elements (avg. length 3.5)
-    <BLANKLINE>
-    chr1 -
-    --
-    +--------+-----+------+------+------+
-    | Runs   | 4   | 2    | 2    | 2    |
-    |--------+-----+------+------+------|
-    | Values | 0.0 | 20.0 | 50.0 | 30.0 |
-    +--------+-----+------+------+------+
-    Rle of length 10 containing 4 elements (avg. length 2.5)
-    RleDict object with 2 chromosomes/strand pairs.
-
-    >>> raw = gr.to_rle(rpm=False)
-    >>> raw
-    chr1 +
-    --
-    +--------+-----+-----+
-    | Runs   | 1   | 6   |
-    |--------+-----+-----|
-    | Values | 0.0 | 1.0 |
-    +--------+-----+-----+
-    Rle of length 7 containing 2 elements (avg. length 3.5)
-    <BLANKLINE>
-    chr1 -
-    --
-    +--------+-----+-----+-----+-----+
-    | Runs   | 4   | 2   | 2   | 2   |
-    |--------+-----+-----+-----+-----|
-    | Values | 0.0 | 1.0 | 2.0 | 1.0 |
-    +--------+-----+-----+-----+-----+
-    Rle of length 10 containing 4 elements (avg. length 2.5)
-    RleDict object with 2 chromosomes/strand pairs.
-
-    >>> result = (value / raw).apply_values(np.log10)
-    >>> result
-    chr1 +
-    --
-    +--------+-----+-----+
-    | Runs   | 1   | 6   |
-    |--------+-----+-----|
-    | Values | nan | 1.0 |
-    +--------+-----+-----+
-    Rle of length 7 containing 2 elements (avg. length 3.5)
-    <BLANKLINE>
-    chr1 -
-    --
-    +--------+-----+--------------------+--------------------+--------------------+
-    | Runs   | 4   | 2                  | 2                  | 2                  |
-    |--------+-----+--------------------+--------------------+--------------------|
-    | Values | nan | 1.3010300397872925 | 1.3979400396347046 | 1.4771212339401245 |
-    +--------+-----+--------------------+--------------------+--------------------+
-    Rle of length 10 containing 4 elements (avg. length 2.5)
-    RleDict object with 2 chromosomes/strand pairs.
-
-    >>> out = result.numbers_only().to_ranges()
-    >>> out
-    +--------------+-----------+-----------+-------------+--------------+
-    | Chromosome   |     Start |       End |       Score | Strand       |
-    | (category)   |   (int64) |   (int64) |   (float64) | (category)   |
-    |--------------+-----------+-----------+-------------+--------------|
-    | chr1         |         1 |         7 |     1       | +            |
-    | chr1         |         4 |         6 |     1.30103 | -            |
-    | chr1         |         6 |         8 |     1.39794 | -            |
-    | chr1         |         8 |        10 |     1.47712 | -            |
-    +--------------+-----------+-----------+-------------+--------------+
-    Stranded PyRanges object has 4 rows and 5 columns from 1 chromosomes.
-    For printing, the PyRanges was sorted on Chromosome and Strand.
-
-    >>> to_bigwig(out["-"], "deleteme_reverse.bw", hg19)
-    >>> to_bigwig(out["+"], "deleteme_forward.bw", hg19)
-    """
-
-    try:
-        import pyBigWig  # type: ignore
-    except ModuleNotFoundError:
-        print(
-            "pybigwig must be installed to create bigwigs. Use `conda install -c bioconda pybigwig` or `pip install pybigwig` to install it."
-        )
-        import sys
-
-        sys.exit(1)
-
-    assert (
-        len(gr.strands) <= 1
-    ), "Can only write one strand at a time. Use an unstranded PyRanges or subset on strand first."
-    lengths = gr.lengths()
-    assert isinstance(lengths, pd.Series)
-    assert np.sum(lengths) == gr.merge().length, "Intervals must not overlap."
-
-    df = gr.df
-
-    unique_chromosomes = list(df.Chromosome.drop_duplicates())
-
-    if not isinstance(chromosome_sizes, dict):
-        size_df = chromosome_sizes.df
-        chromosome_sizes = {k: v for k, v in zip(size_df.Chromosome, size_df.End)}
-
-    header = [(c, int(chromosome_sizes[c])) for c in unique_chromosomes]
-
-    bw = pyBigWig.open(path, "w")
-    bw.addHeader(header)
-
-    chromosomes = df.Chromosome.tolist()
-    starts = df.Start.tolist()
-    ends = df.End.tolist()
-    values = df.Score.tolist()
-
-    bw.addEntries(chromosomes, starts, ends=ends, values=values)
-
-
 def version_info() -> None:
     import importlib
 
@@ -637,7 +446,6 @@ def update_version_info(_version_info, library) -> None:
 __all__ = [
     "from_string",
     "from_dict",
-    "to_bigwig",
     "count_overlaps",
     "random",
     "itergrs",
diff --git a/pyranges/genomicfeatures.py b/pyranges/genomicfeatures.py
index bb8d909a..298f3bcc 100644
--- a/pyranges/genomicfeatures.py
+++ b/pyranges/genomicfeatures.py
@@ -1,12 +1,13 @@
+from typing import Dict
+
 import numpy as np
 import pandas as pd
+from pandas.core.frame import DataFrame
 from sorted_nearest.src.introns import find_introns  # type: ignore
 
 import pyranges as pr
 from pyranges.multithreaded import pyrange_apply
-from pandas.core.frame import DataFrame
 from pyranges.pyranges_main import PyRanges
-from typing import Dict, Optional
 
 __all__ = ["genome_bounds", "tile_genome", "GenomicFeaturesMethods"]
 
@@ -594,10 +595,8 @@ def _introns2(df: DataFrame, exons: DataFrame, **kwargs) -> DataFrame:
     vc = introns["by_id"].value_counts(sort=False).to_frame().reset_index()
     vc.columns = pd.Index(["by_id", "counts"])
 
-    genes_without_introns = pd.DataFrame(data={"by_id": np.setdiff1d(
-        np.array(by_ids.values),
-        np.array(vc.by_id.values)),
-        "counts": 0}
+    genes_without_introns = pd.DataFrame(
+        data={"by_id": np.setdiff1d(np.array(by_ids.values), np.array(vc.by_id.values)), "counts": 0}
     )
 
     vc = pd.concat([vc, genes_without_introns]).sort_values("by_id")
diff --git a/pyranges/helpers.py b/pyranges/helpers.py
index 232e5072..4f238e8f 100644
--- a/pyranges/helpers.py
+++ b/pyranges/helpers.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Union, List
+from typing import List, Tuple, Union
 
 import pandas as pd
 
diff --git a/pyranges/multioverlap.py b/pyranges/multioverlap.py
index d331525b..61275b43 100644
--- a/pyranges/multioverlap.py
+++ b/pyranges/multioverlap.py
@@ -1,11 +1,17 @@
+from typing import Dict, Optional
+
 import numpy as np
 
 import pyranges as pr
 from pyranges.pyranges_main import PyRanges
-from typing import Dict, Optional
 
 
-def count_overlaps(grs: Dict[str, PyRanges], features: Optional[PyRanges] = None, strandedness: Optional[str] = None, how: Optional[str] = None) -> PyRanges:
+def count_overlaps(
+    grs: Dict[str, PyRanges],
+    features: Optional[PyRanges] = None,
+    strandedness: Optional[str] = None,
+    how: Optional[str] = None,
+) -> PyRanges:
     """Count overlaps in multiple pyranges.
 
     Parameters
diff --git a/pyranges/out.py b/pyranges/out.py
index cd682197..72eb8245 100644
--- a/pyranges/out.py
+++ b/pyranges/out.py
@@ -1,8 +1,13 @@
 import csv
+from pathlib import Path
+from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
 from natsort import natsorted  # type: ignore
+from pandas.core.frame import DataFrame
+
+from pyranges.pyranges_main import PyRanges
 
 _gtf_columns = {
     "seqname": "Chromosome",
@@ -13,7 +18,6 @@
     "score": "Score",
     "strand": "Strand",
     "frame": "Frame",
-    # "attribute": "Attribute"  # filled with all others columns
 }
 
 _gff3_columns = _gtf_columns.copy()
@@ -43,39 +47,39 @@
 ]
 
 
-def _fill_missing(df, all_columns):
+def _fill_missing(df: DataFrame, all_columns: List[str]) -> DataFrame:
     columns = list(df.columns)
 
-    if not df.get(all_columns) is None:
-        outdf = df.get(all_columns)
+    if set(columns).intersection(set(all_columns)) == set(all_columns):
+        return df[all_columns]
     else:
         missing = set(all_columns) - set(columns)
         missing_idx = {all_columns.index(m): m for m in missing}
         not_missing = set(columns).intersection(set(all_columns))
         not_missing_ordered = sorted(not_missing, key=all_columns.index)
-        outdf = df.get(not_missing_ordered)
+        outdf = df[not_missing_ordered]
 
-        for idx, missing in sorted(missing_idx.items()):
-            outdf.insert(idx, missing, ".")
+        for idx, _missing in sorted(missing_idx.items()):
+            outdf.insert(idx, _missing, ".")
 
-    return outdf
+        return outdf
 
 
-def _bed(df, keep):
+def _bed(df: DataFrame, keep: bool) -> DataFrame:
     all_columns = "Chromosome Start End Name Score Strand".split()
 
     outdf = _fill_missing(df, all_columns)
 
-    noncanonical = set(df.columns) - set(all_columns)
+    noncanonical = list(set(df.columns) - set(all_columns))
     noncanonical = [c for c in df.columns if c in noncanonical]
 
     if keep:
-        return pd.concat([outdf, df.get(noncanonical)], axis=1)
+        return pd.concat([outdf, df[noncanonical]], axis=1)
     else:
         return outdf
 
 
-def _gtf(df, mapping):
+def _gtf(df: DataFrame, mapping: Dict[str, str]) -> DataFrame:
     pr_col2gff_col = {v: k for k, v in mapping.items()}
 
     df = df.rename(columns=pr_col2gff_col)  # copying here
@@ -86,17 +90,17 @@ def _gtf(df, mapping):
     outdf = _fill_missing(df, all_columns)
 
     if "attribute" in df.columns:
-        attribute = mapping["attribute"] + ' "' + df.attribute + '";'
+        attribute = pd.Series(mapping["attribute"] + ' "' + df.attribute + '";')
     else:
         # gotten all needed columns, need to join the rest
-        rest = set(df.columns) - set(all_columns)
-        rest = sorted(rest, key=columns.index)
-        rest_df = df.get(rest).copy()
+        _rest = set(df.columns) - set(all_columns)
+        rest = sorted(_rest, key=columns.index)
+        rest_df = df[rest].copy()
         for c in rest_df:
-            col = rest_df[c]
+            col = pd.Series(rest_df[c])
             isnull = col.isnull()
             col = col.astype(str).str.replace("nan", "")
-            new_val = c + ' "' + col + '";'
+            new_val = str(c) + ' "' + col + '";'
             rest_df.loc[:, c] = rest_df[c].astype(str)
             rest_df.loc[~isnull, c] = new_val
             rest_df.loc[isnull, c] = ""
@@ -107,7 +111,9 @@ def _gtf(df, mapping):
     return outdf
 
 
-def _to_gtf(self, path=None, compression="infer", map_cols=None):
+def _to_gtf(
+    self: PyRanges, path: Optional[str] = None, compression: str = "infer", map_cols: Optional[Dict[str, str]] = None
+) -> Optional[str]:
     mapping = _gtf_columns.copy()
     if map_cols:
         mapping.update(map_cols)
@@ -121,19 +127,26 @@ def _to_gtf(self, path=None, compression="infer", map_cols=None):
         for outdf in outdfs:
             outdf.to_csv(
                 path,
+                sep="\t",
                 index=False,
                 header=False,
                 compression=compression,
                 mode=mode,
-                sep="\t",
                 quoting=csv.QUOTE_NONE,
-            )
+            )  # type: ignore
             mode = "a"
+        return None
     else:
         return "".join([outdf.to_csv(index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE) for outdf in outdfs])
 
 
-def _to_csv(self, path=None, sep=",", header=True, compression="infer"):
+def _to_csv(
+    self: PyRanges,
+    path: Optional[Union[Path, str]] = None,
+    sep: str = ",",
+    header: bool = True,
+    compression: str = "infer",
+) -> Optional[str]:
     gr = self
 
     if path:
@@ -150,6 +163,7 @@ def _to_csv(self, path=None, sep=",", header=True, compression="infer"):
             )
             mode = "a"
             header = False
+        return None
     else:
         return "".join(
             [
@@ -159,7 +173,9 @@ def _to_csv(self, path=None, sep=",", header=True, compression="infer"):
         )
 
 
-def _to_bed(self, path=None, sep="\t", keep=True, compression="infer"):
+def _to_bed(
+    self: PyRanges, path: Optional[str] = None, sep: str = "\t", keep: bool = True, compression: str = "infer"
+) -> Optional[str]:
     gr = self
 
     outdfs = natsorted(gr.dfs.items())
@@ -176,15 +192,23 @@ def _to_bed(self, path=None, sep="\t", keep=True, compression="infer"):
                 mode=mode,
                 sep="\t",
                 quoting=csv.QUOTE_NONE,
-            )
+            )  # type: ignore
             mode = "a"
-
+        return None
     else:
         res = "".join([outdf.to_csv(index=False, header=False, sep="\t", quoting=csv.QUOTE_NONE) for outdf in outdfs])
         return res
 
 
-def _to_bigwig(self, path, chromosome_sizes, rpm=True, divide=False, value_col=None, dryrun=False):
+def _to_bigwig(
+    self: PyRanges,
+    path: Optional[str],
+    chromosome_sizes: Union[PyRanges, dict],
+    rpm: bool = True,
+    divide: Optional[bool] = False,
+    value_col: Optional[str] = None,
+    dryrun: bool = False,
+) -> Optional[PyRanges]:
     try:
         import pyBigWig  # type: ignore
     except ModuleNotFoundError:
@@ -237,8 +261,12 @@ def _to_bigwig(self, path, chromosome_sizes, rpm=True, divide=False, value_col=N
 
         bw.addEntries(chromosomes, starts, ends=ends, values=values)
 
+    return None
+
 
-def _to_gff3(self, path=None, compression="infer", map_cols=None):
+def _to_gff3(
+    self: PyRanges, path: Optional[str] = None, compression: str = "infer", map_cols: Optional[Dict[str, str]] = None
+) -> Optional[str]:
     mapping = _gff3_columns.copy()
     if map_cols:
         mapping.update(map_cols)
@@ -266,7 +294,7 @@ def _to_gff3(self, path=None, compression="infer", map_cols=None):
         )
 
 
-def _gff3(df, mapping):
+def _gff3(df: DataFrame, mapping: Dict[str, str]) -> DataFrame:
     pr_col2gff_col = {v: k for k, v in mapping.items()}
 
     df = df.rename(columns=pr_col2gff_col)  # copying here
@@ -279,14 +307,15 @@ def _gff3(df, mapping):
     if "attribute" in mapping:
         attribute_name = mapping["attribute"]
         attribute_value = df.attribute.iloc[0]
-        attribute = f"{attribute_name}={attribute_value}"
+        attribute = pd.Series([f"{attribute_name}={attribute_value}"])
     else:
         # gotten all needed columns, need to join the rest
-        rest = set(df.columns) - set(all_columns)
-        rest = sorted(rest, key=columns.index)
-        rest_df = df.get(rest).copy()
+        _rest = set(df.columns) - set(all_columns)
+        rest = sorted(_rest, key=columns.index)
+        rest_df = df[rest].copy()
         total_cols = rest_df.shape[1]
-        for i, c in enumerate(rest_df, 1):
+        for i, _c in enumerate(rest_df, 1):
+            c = str(_c)
             col = rest_df[c]
             isnull = col.isnull()
             col = col.astype(str).str.replace("nan", "")
diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py
index 5cd8e97f..02055748 100644
--- a/pyranges/pyranges_main.py
+++ b/pyranges/pyranges_main.py
@@ -5009,9 +5009,7 @@ def three_end(self) -> "PyRanges":
     #         >>>
     #         """
 
-    def to_bed(
-        self, path: Optional[str] = None, keep: bool = True, compression: str = "infer", chain: bool = False
-    ) -> Union[str, "PyRanges"]:
+    def to_bed(self, path: Optional[str] = None, keep: bool = True, compression: str = "infer") -> Optional[str]:
         r"""Write to bed.
 
         Parameters
@@ -5027,9 +5025,6 @@ def to_bed(
         compression : str, compression type to use, by default infer based on extension.
             See pandas.DataFree.to_csv for more info.
 
-        chain : bool, default False
-            Whether to return the PyRanges after writing.
-
         Examples
         --------
 
@@ -5063,16 +5058,7 @@ def to_bed(
         chr1	1	5	.	.	+
         chr1	6	8	.	.	-
 
-        >>> gr.to_bed("test.bed", chain=True)
-        +--------------+-----------+-----------+--------------+-----------+
-        | Chromosome   |     Start |       End | Strand       |      Gene |
-        | (category)   |   (int64) |   (int64) | (category)   |   (int64) |
-        |--------------+-----------+-----------+--------------+-----------|
-        | chr1         |         1 |         5 | +            |         1 |
-        | chr1         |         6 |         8 | -            |         2 |
-        +--------------+-----------+-----------+--------------+-----------+
-        Stranded PyRanges object has 2 rows and 5 columns from 1 chromosomes.
-        For printing, the PyRanges was sorted on Chromosome and Strand.
+        >>> gr.to_bed("test.bed")
 
         >>> open("test.bed").readlines()
         ['chr1\t1\t5\t.\t.\t+\t1\n', 'chr1\t6\t8\t.\t.\t-\t2\n']
@@ -5081,10 +5067,7 @@ def to_bed(
 
         result = _to_bed(self, path, keep=keep, compression=compression)
 
-        if path and chain:
-            return self
-        else:
-            return result
+        return result
 
     def to_bigwig(
         self,
@@ -5223,7 +5206,7 @@ def to_bigwig(
 
     def to_csv(
         self, path: Optional["Path"] = None, sep: str = ",", header: bool = True, compression: str = "infer"
-    ) -> Union[str, "PyRanges"]:
+    ) -> Optional[str]:
         r"""Write to comma- or other value-separated file.
 
         Parameters
@@ -5391,9 +5374,8 @@ def to_gtf(
         self,
         path: None = None,
         compression: str = "infer",
-        chain: bool = False,
         map_cols: Optional[Dict[str, str]] = None,
-    ) -> str:
+    ) -> Optional[str]:
         """Write to Gene Transfer Format.
 
         The GTF format consists of a tab-separated file without header.
@@ -5479,10 +5461,7 @@ def to_gtf(
 
         result = _to_gtf(self, path, compression=compression, map_cols=map_cols)
 
-        if path and chain:
-            return self
-        else:
-            return result
+        return result
 
     def to_rle(
         self, value_col: Optional[str] = None, strand: Optional[bool] = None, rpm: bool = False, nb_cpu: int = 1
diff --git a/pyranges/readers.py b/pyranges/readers.py
index cc3207bc..d90f8a0c 100644
--- a/pyranges/readers.py
+++ b/pyranges/readers.py
@@ -2,7 +2,7 @@
 
 import sys
 from pathlib import Path
-from typing import Union, Optional, List
+from typing import List, Optional, Union
 
 import pandas as pd
 from natsort import natsorted  # type: ignore
@@ -300,9 +300,9 @@ def read_gtf(
 
 def read_gtf_full(
     f: Union[str, Path],
-    nrows = None,
-    skiprows = 0,
-    duplicate_attr = False,
+    nrows=None,
+    skiprows=0,
+    duplicate_attr=False,
     ignore_bad: bool = False,
     chunksize: int = int(1e5),  # for unit-testing purposes
 ) -> pr.PyRanges:
@@ -349,9 +349,7 @@ def to_rows(anno: pd.Series, ignore_bad: bool = False) -> pd.DataFrame:
         for entry in row:
             str(entry).replace('"', "").replace(";", "").split()
     except AttributeError:
-        raise Exception(
-            f"Invalid attribute string: {entry}. If the file is in GFF3 format, use pr.read_gff3 instead."
-        )
+        raise Exception(f"Invalid attribute string: {entry}. If the file is in GFF3 format, use pr.read_gff3 instead.")
 
     rowdicts = []
     try:
@@ -590,17 +588,3 @@ def read_bigwig(f: Union[str, Path]) -> pr.PyRanges:
         )
 
     return pr.from_dfs(dfs)
-
-
-
-
-
-
-def _test():
-    import doctest
-
-    doctest.testmod()
-
-
-if __name__ == "__main__":
-    _test()

From cc0eb202d346e6be9106cb771e0742d5c98a6a58 Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Fri, 26 May 2023 14:30:41 +0200
Subject: [PATCH 09/10] Add types to statistics

---
 pyranges/__init__.py      |  10 ---
 pyranges/pyranges_main.py |  29 +++----
 pyranges/statistics.py    | 173 ++++++++++++++++++++------------------
 3 files changed, 105 insertions(+), 107 deletions(-)

diff --git a/pyranges/__init__.py b/pyranges/__init__.py
index d8db666f..fb914b2c 100644
--- a/pyranges/__init__.py
+++ b/pyranges/__init__.py
@@ -457,13 +457,3 @@ def update_version_info(_version_info, library) -> None:
     "PyRanges",
     "version_info",
 ]
-
-
-def _test():
-    import doctest
-
-    doctest.testmod()
-
-
-if __name__ == "__main__":
-    _test()
diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py
index 02055748..cbb70bd7 100644
--- a/pyranges/pyranges_main.py
+++ b/pyranges/pyranges_main.py
@@ -2289,7 +2289,7 @@ def items(self) -> Union[List[Tuple[str, pd.DataFrame]], List[Tuple[Tuple[str, s
     def join(
         self,
         other: "PyRanges",
-        strandedness: None = None,
+        strandedness: Optional[str] = None,
         how: Optional[str] = None,
         report_overlap: bool = False,
         slack: int = 0,
@@ -2552,15 +2552,15 @@ def length(self) -> int:
         5
         """
 
-        lengths = self.lengths(as_dict=False)
+        lengths = self.lengths()
         assert isinstance(lengths, pd.Series)
         length = lengths.sum()
         assert isinstance(length, (np.int64, int))
         return int(length)
 
     def lengths(
-        self, as_dict: bool = False
-    ) -> Union[pd.Series, Dict[Tuple[str, str], pd.Series], Dict[str, pd.Series]]:
+        self
+    ) -> pd.Series:
         """Return the length of each interval.
 
         Parameters
@@ -2617,18 +2617,15 @@ def lengths(
         For printing, the PyRanges was sorted on Chromosome and Strand.
         """
 
-        if as_dict:
-            return {k: df.End - df.Start for k, df in self.items()}  # type: ignore
-        else:
-            _lengths: List[pd.Series] = []
-            if not len(self):
-                return pd.Series([], dtype=np.int64)
-            for _, df in self:
-                _lengths.append(df.End - df.Start)
+        _lengths: List[pd.Series] = []
+        if not len(self):
+            return pd.Series([], dtype=np.int64)
+        for _, df in self:
+            _lengths.append(df.End - df.Start)
 
-            ls = pd.concat(_lengths).reset_index(drop=True)
-            assert isinstance(ls, pd.Series)
-            return ls
+        ls = pd.concat(_lengths).reset_index(drop=True)
+        assert isinstance(ls, pd.Series)
+        return ls
 
     def max_disjoint(self, strand: Optional[bool] = None, slack: int = 0, **kwargs) -> "PyRanges":
         """Find the maximal disjoint set of intervals.
@@ -3561,7 +3558,7 @@ def sample(self, n: int = 8, replace: bool = False) -> "PyRanges":
     def set_intersect(
         self,
         other: "PyRanges",
-        strandedness: None = None,
+        strandedness: Optional[str] = None,
         how: Optional[str] = None,
         new_pos: bool = False,
         nb_cpu: int = 1,
diff --git a/pyranges/statistics.py b/pyranges/statistics.py
index b2361acd..e6d5c2aa 100644
--- a/pyranges/statistics.py
+++ b/pyranges/statistics.py
@@ -9,6 +9,11 @@
 import pyranges as pr
 from pyranges.methods.statistics import _relative_distance
 from pyranges.multithreaded import pyrange_apply
+from numpy import float64, int64, ndarray
+from pandas.core.frame import DataFrame
+from pandas.core.series import Series
+from pyranges.pyranges_main import PyRanges
+from typing import Dict, List, Optional, Union, Any
 
 __all__ = [
     "simes",
@@ -22,7 +27,7 @@
 ]
 
 
-def fdr(p_vals):
+def fdr(p_vals: Series) -> Series:
     """Adjust p-values with Benjamini-Hochberg.
 
     Parameters
@@ -76,7 +81,7 @@ def fdr(p_vals):
     return fdr
 
 
-def fisher_exact(tp, fp, fn, tn, pseudocount=0):
+def fisher_exact(tp: Series, fp: Series, fn: Series, tn: Series, pseudocount: int = 0) -> DataFrame:
     """Fisher's exact for contingency tables.
 
     Computes the hypotheses two-sided, less and greater at the same time.
@@ -149,10 +154,10 @@ def fisher_exact(tp, fp, fn, tn, pseudocount=0):
         )
         sys.exit(-1)
 
-    tp = np.array(tp, dtype=np.uint)
-    fp = np.array(fp, dtype=np.uint)
-    fn = np.array(fn, dtype=np.uint)
-    tn = np.array(tn, dtype=np.uint)
+    tp = pd.Series(np.array(tp, dtype=np.uint))
+    fp = pd.Series(np.array(fp, dtype=np.uint))
+    fn = pd.Series(np.array(fn, dtype=np.uint))
+    tn = pd.Series(np.array(tn, dtype=np.uint))
 
     left, right, twosided = pvalue_npy(tp, fp, fn, tn)
 
@@ -163,7 +168,7 @@ def fisher_exact(tp, fp, fn, tn, pseudocount=0):
     return df
 
 
-def mcc(grs, genome=None, labels=None, strand=False, verbose=False):
+def mcc(grs: List[PyRanges], genome: Optional[Union[pr.PyRanges, pd.DataFrame, Dict[str, int]]] = None, labels: Optional[str] = None, strand: bool = False, verbose: bool = False) -> DataFrame:
     """Compute Matthew's correlation coefficient for PyRanges overlaps.
 
     Parameters
@@ -218,9 +223,28 @@ def mcc(grs, genome=None, labels=None, strand=False, verbose=False):
     import sys
     from itertools import chain, combinations_with_replacement
 
+    if genome is None:
+        genome = defaultdict(int)
+        for gr in grs:
+            for k, v in gr:
+                genome[k] = max(genome[k], v.End.max())
+
+
+    if not isinstance(genome, dict):
+        _genome = genome
+        genome_length = int(_genome.End.sum())
+    else:
+        _genome = pd.DataFrame(
+            {
+                "Chromosome": list(genome.keys()),
+                "Start": 0,
+                "End": list(genome.values())
+            }
+        )
+        genome_length = sum(genome.values())
+
     if labels is None:
-        _labels = list(range(len(grs)))
-        _labels = combinations_with_replacement(_labels, r=2)
+        _labels = combinations_with_replacement(np.arange(len(grs)), r=2)
     else:
         assert len(labels) == len(grs)
         _labels = combinations_with_replacement(labels, r=2)
@@ -228,18 +252,15 @@ def mcc(grs, genome=None, labels=None, strand=False, verbose=False):
     # remove all non-loc columns before computation
     grs = [gr.merge(strand=strand) for gr in grs]
 
-    if genome is not None:
-        if isinstance(genome, (pd.DataFrame, pr.PyRanges)):
-            genome_length = int(genome.End.sum())
-        else:
-            genome_length = sum(genome.values())
+    if _genome is not None:
+        genome_length = int(_genome.End.sum())
 
         if verbose:
             # check that genome definition does not have many more
             # chromosomes than datafiles
-            gr_cs = set(chain(*[gr.chromosomes for gr in grs]))
+            gr_cs = set(chain(*[gr.Chromosome for gr in grs]))
 
-            g_cs = set(genome.chromosomes)
+            g_cs = set(_genome.Chromosome)
             surplus = g_cs - gr_cs
             if len(surplus):
                 print(
@@ -257,15 +278,7 @@ def make_stranded(df):
                 df2.insert(df2.shape[1], "Strand", "-")
                 return pd.concat([df, df2])
 
-            genome = genome.apply(make_stranded)
-
-    else:
-        d = defaultdict(int)
-        for gr in grs:
-            for k, v in gr:
-                d[k] = max(d[k], v.End.max())
-
-        genome_length = sum(d.values())
+            _genome = make_stranded(_genome) if isinstance(_genome, pd.DataFrame) else _genome.apply(make_stranded)
 
     strandedness = "same" if strand else None
 
@@ -282,7 +295,7 @@ def make_stranded(df):
                 fp = 0
                 rowdicts.append({"T": lt, "F": lf, "TP": tp, "FP": fp, "TN": tn, "FN": fn, "MCC": 1})
             else:
-                for strand in "+ -".split():
-                    tp = t[strand].length
+                for _strand in "+ -".split():
+                    tp = t[_strand].length
                     fn = 0
                     tn = genome_length - tp
@@ -291,7 +304,7 @@ def make_stranded(df):
                         {
                             "T": lt,
                             "F": lf,
-                            "Strand": strand,
+                            "Strand": _strand,
                             "TP": tp,
                             "FP": fp,
                             "TN": tn,
@@ -305,17 +318,17 @@ def make_stranded(df):
             j = t.join(f, strandedness=strandedness)
             tp_gr = j.new_position("intersection").merge(strand=strand)
             if strand:
-                for strand in "+ -".split():
-                    tp = tp_gr[strand].length
-                    fp = f[strand].length - tp
-                    fn = t[strand].length - tp
+                for _strand in "+ -".split():
+                    tp = tp_gr[_strand].length
+                    fp = f[_strand].length - tp
+                    fn = t[_strand].length - tp
                     tn = genome_length - (tp + fp + fn)
                     mcc = _mcc(tp, fp, tn, fn)
                     rowdicts.append(
                         {
                             "T": lt,
                             "F": lf,
-                            "Strand": strand,
+                            "Strand": _strand,
                             "TP": tp,
                             "FP": fp,
                             "TN": tn,
@@ -327,7 +340,7 @@ def make_stranded(df):
                         {
                             "T": lf,
                             "F": lt,
-                            "Strand": strand,
+                            "Strand": _strand,
                             "TP": tp,
                             "FP": fn,
                             "TN": tn,
@@ -365,12 +378,12 @@ def make_stranded(df):
                     }
                 )
 
-    df = pd.DataFrame.from_dict(rowdicts).sort_values(["T", "F"])
+    df = pd.DataFrame.from_records(rowdicts).sort_values(["T", "F"])
 
     return df
 
 
-def rowbased_spearman(x, y):
+def rowbased_spearman(x: ndarray, y: ndarray) -> ndarray:
     """Fast row-based Spearman's correlation.
 
     Parameters
@@ -416,7 +429,7 @@ def rowbased_spearman(x, y):
     return rowbased_pearson(rx, ry)
 
 
-def rowbased_pearson(x, y):
+def rowbased_pearson(x: Union[ndarray, DataFrame], y: Union[ndarray, DataFrame]) -> ndarray:
     """Fast row-based Pearson's correlation.
 
     Parameters
@@ -475,7 +488,7 @@ def ss(a, axis):
     return r
 
 
-def rowbased_rankdata(data):
+def rowbased_rankdata(data: ndarray) -> DataFrame:
     """Rank order of entries in each row.
 
     Same as SciPy rankdata with method=mean.
@@ -519,17 +532,14 @@ def rowbased_rankdata(data):
 
     obs = np.column_stack([np.ones(len(res), dtype=bool), res])
 
-    dense = np.take_along_axis(np.apply_along_axis(np.cumsum, 1, obs), inv, 1)
+    dense = pd.DataFrame(np.take_along_axis(np.apply_along_axis(np.cumsum, 1, obs), inv, 1))
 
     len_r = obs.shape[1]
 
     nonzero = np.count_nonzero(obs, axis=1)
-    obs = pd.DataFrame(obs)
-    nonzero = pd.Series(nonzero)
-    dense = pd.DataFrame(dense)
 
-    ranks = []
-    for _nonzero, nzdf in obs.groupby(nonzero, sort=False):
+    _ranks = []
+    for _nonzero, nzdf in pd.DataFrame(obs).groupby(pd.Series(nonzero), sort=False):
         nz = np.apply_along_axis(lambda r: np.nonzero(r)[0], 1, nzdf)
 
         _count = np.column_stack([nz, np.ones(len(nz)) * len_r])
@@ -538,14 +548,14 @@ def rowbased_rankdata(data):
         _result = 0.5 * (np.take_along_axis(_count, _dense, 1) + np.take_along_axis(_count, _dense - 1, 1) + 1)
 
         result = pd.DataFrame(_result, index=nzdf.index)
-        ranks.append(result)
+        _ranks.append(result)
 
-    final = pd.concat(ranks).sort_index(kind="mergesort")
+    final = pd.concat(_ranks).sort_index(kind="mergesort")
 
     return final
 
 
-def simes(df, groupby, pcol, keep_position=False):
+def simes(df: DataFrame, groupby: Union[str, List[str]], pcol: str, keep_position: bool = False) -> DataFrame:
     """Apply Simes method for giving dependent events a p-value.
 
     Parameters
@@ -634,9 +644,9 @@ def simes(df, groupby, pcol, keep_position=False):
     sdf = df[positions + sorter].sort_values(sorter)
     g = sdf.groupby(positions + groupby)
 
-    ranks = g.cumcount().values + 1
-    size = g.size().values
-    size = np.repeat(size, size)
+    ranks = pd.Series(g.cumcount().values) + 1
+    _size = np.array(g.size().values)
+    size = np.repeat(a=_size, repeats=_size)
     multiplied = sdf[pcol].values * size
 
     simes = multiplied / ranks
@@ -665,15 +675,15 @@ def simes(df, groupby, pcol, keep_position=False):
     return simes
 
 
-def chromsizes_as_int(chromsizes):
-    if isinstance(chromsizes, int):
-        pass
-    elif isinstance(chromsizes, dict):
-        chromsizes = sum(chromsizes.values())
+def chromsizes_as_int(chromsizes: Union[PyRanges, DataFrame, Dict[Any, int]]) -> int:
+    if isinstance(chromsizes, dict):
+        _chromsizes = sum(chromsizes.values())
     elif isinstance(chromsizes, (pd.DataFrame, pr.PyRanges)):
-        chromsizes = chromsizes.End.sum()
+        _chromsizes = chromsizes.End.sum()
+    else:
+        raise TypeError("chromsizes must be dict, DataFrame or PyRanges, was {}".format(type(chromsizes)))
 
-    return chromsizes
+    return _chromsizes
 
 
 class StatisticsMethods:
@@ -682,12 +692,10 @@ class StatisticsMethods:
 
     Accessed with gr.stats."""
 
-    pr = None
-
-    def __init__(self, pr):
+    def __init__(self, pr: PyRanges) -> None:
         self.pr = pr
 
-    def forbes(self, other, chromsizes, strandedness=None):
+    def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[str] = None) -> float64:
         """Compute Forbes coefficient.
 
         Ratio which represents observed versus expected co-occurence.
@@ -728,27 +736,24 @@ def forbes(self, other, chromsizes, strandedness=None):
         >>> gr.stats.forbes(gr2, chromsizes=chromsizes)
         1.7168314674978278"""
 
-        chromsizes = chromsizes_as_int(chromsizes)
+        _chromsizes = chromsizes_as_int(chromsizes)
 
-        self = self.pr
-
-        kwargs = {}
-        kwargs["sparse"] = {"self": True, "other": True}
+        kwargs = {"sparse": {"self": True, "other": True}}
         kwargs = pr.pyranges_main.fill_kwargs(kwargs)
         strand = True if kwargs.get("strandedness") else False
 
-        reference_length = self.merge(strand=strand).length
+        reference_length = self.pr.merge(strand=strand).length
         query_length = other.merge(strand=strand).length
 
         intersection_sum = sum(
-            v.sum() for v in self.set_intersect(other, strandedness=strandedness).lengths(as_dict=True).values()
+            v.sum() for v in self.pr.set_intersect(other, strandedness=strandedness).lengths()
         )
 
-        forbes = chromsizes * intersection_sum / (reference_length * query_length)
+        forbes = _chromsizes * intersection_sum / (reference_length * query_length)
 
         return forbes
 
-    def jaccard(self, other, **kwargs):
+    def jaccard(self, other: PyRanges, **kwargs) -> float:
         """Compute Jaccards coefficient.
 
         Ratio of the intersection and union of two sets.
@@ -787,27 +792,25 @@ def jaccard(self, other, **kwargs):
         >>> gr.stats.jaccard(gr2, chromsizes=chromsizes)
         6.657941988519211e-05"""
 
-        self = self.pr
-
         kwargs["sparse"] = {"self": True, "other": True}
         kwargs = pr.pyranges_main.fill_kwargs(kwargs)
         strand = True if kwargs.get("strandedness") else False
 
-        intersection_sum = sum(v.sum() for v in self.set_intersect(other).lengths(as_dict=True).values())
+        intersection_sum = sum(v.sum() for v in self.pr.set_intersect(other).lengths())
 
         union_sum = 0
-        for gr in [self, other]:
-            union_sum += sum(v.sum() for v in gr.merge(strand=strand).lengths(as_dict=True).values())
+        for gr in [self.pr, other]:
+            union_sum += sum(v.sum() for v in gr.merge(strand=strand).lengths())
 
         denominator = union_sum - intersection_sum
         if denominator == 0:
-            return 1
+            return 1.0
         else:
             jc = intersection_sum / denominator
 
         return jc
 
-    def relative_distance(self, other, **kwargs):
+    def relative_distance(self, other: PyRanges, **kwargs) -> DataFrame:
         """Compute spatial correllation between two sets.
 
         Metric which describes relative distance between each interval in one
@@ -899,14 +902,12 @@ def relative_distance(self, other, **kwargs):
         49     0.49    194   9956  0.019486
         """
 
-        self = self.pr
-
         kwargs["sparse"] = {"self": True, "other": True}
         kwargs = pr.pyranges_main.fill_kwargs(kwargs)
 
-        result = pyrange_apply(_relative_distance, self, other, **kwargs)  # pylint: disable=E1132
+        dfs = pyrange_apply(_relative_distance, self.pr, other, **kwargs)
 
-        result = pd.Series(np.concatenate(list(result.values())))
+        result = pd.Series(np.concatenate(list(dfs.values())))
 
         not_nan = ~np.isnan(result)
         result.loc[not_nan] = np.floor(result[not_nan] * 100) / 100
@@ -920,7 +921,7 @@ def relative_distance(self, other, **kwargs):
         return vc
 
 
-def _mcc(tp, fp, tn, fn):
+def _mcc(tp: int, fp: int, tn: int, fn: int) -> float:
     # https://stackoverflow.com/a/56875660/992687
     x = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
     return ((tp * tn) - (fp * fn)) / sqrt(x)
@@ -950,3 +951,13 @@ def _mcc(tp, fp, tn, fn):
 #     _tetrachoric = cos(180/(1 + sqrt((b * c) / (a * d))))
 
 #     return _tetrachoric
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()

From 02b458f1d5991fda1cdcc11e685f20fc4830d69c Mon Sep 17 00:00:00 2001
From: endre bakken stovner <endrebakkenstovner@endres-MacBook-Air.local>
Date: Fri, 26 May 2023 17:44:47 +0200
Subject: [PATCH 10/10] Add types to getitem/subset

---
 pyranges/methods/getitem.py |   9 ++-
 pyranges/methods/summary.py |  13 ++--
 pyranges/out.py             |  15 ++--
 pyranges/pyranges_main.py   |  17 +----
 pyranges/statistics.py      |  65 ++++++++---------
 pyranges/subset.py          | 135 +++++++++++++++++++-----------------
 6 files changed, 126 insertions(+), 128 deletions(-)

diff --git a/pyranges/methods/getitem.py b/pyranges/methods/getitem.py
index bdcd01b4..8d9d941a 100644
--- a/pyranges/methods/getitem.py
+++ b/pyranges/methods/getitem.py
@@ -3,7 +3,7 @@
 
 import pyranges as pr
 from pyranges.methods.drop import _keep
-from pyranges.subset import get_booldict, get_slice, get_string, get_tuple
+from pyranges.subset import get_2_tuple, get_booldict, get_chromosome_strand_loc, get_slice, get_string
 
 
 def _getitem(self, val):
@@ -12,7 +12,12 @@ def _getitem(self, val):
     elif isinstance(val, str):
         dfs = get_string(self, val)
     elif isinstance(val, tuple):
-        dfs = get_tuple(self, val)
+        if len(val) == 2:
+            dfs = get_2_tuple(self, val[0], val[1])
+        elif len(val) == 3:
+            dfs = get_chromosome_strand_loc(self, val[0], val[1], val[2])
+        else:
+            raise ValueError("Indexing tuple must be of length 2 or 3. Tuple was: {}".format(str(val)))
     elif isinstance(val, slice):
         dfs = get_slice(self, val)
     elif isinstance(val, dict):
diff --git a/pyranges/methods/summary.py b/pyranges/methods/summary.py
index 166f6f57..4b6c6e57 100644
--- a/pyranges/methods/summary.py
+++ b/pyranges/methods/summary.py
@@ -7,28 +7,27 @@
 def _summary(self, to_stdout=True, return_df=False):
     lengths = {}
     total_lengths = {}
-    lengths["pyrange"] = self.lengths(as_dict=True)
+    lengths["pyrange"] = self.lengths()
     total_lengths["pyrange"] = [self.length]
 
     if self.stranded:
         c = self.merge(strand=True)
-        lengths["coverage_forward"] = c["+"].lengths(as_dict=True)
-        lengths["coverage_reverse"] = c["-"].lengths(as_dict=True)
+        lengths["coverage_forward"] = c["+"].lengths()
+        lengths["coverage_reverse"] = c["-"].lengths()
         total_lengths["coverage_forward"] = [c["+"].length]
         total_lengths["coverage_reverse"] = [c["-"].length]
     else:
         c = self
 
     c = c.merge(strand=False)
-    lengths["coverage_unstranded"] = c.lengths(as_dict=True)
+    lengths["coverage_unstranded"] = c.lengths()
     total_lengths["coverage_unstranded"] = [c.length]
 
     summaries = OrderedDict()
 
     # statistics for lengths
-    for summary, d in lengths.items():
-        if d:
-            summaries[summary] = pd.concat(d.values()).describe()
+    for summary, s in lengths.items():
+        summaries[summary] = s.describe()
 
     summary = pd.concat(summaries.values(), axis=1)
     summary.columns = list(summaries)
diff --git a/pyranges/out.py b/pyranges/out.py
index 72eb8245..7e7872a2 100644
--- a/pyranges/out.py
+++ b/pyranges/out.py
@@ -90,7 +90,7 @@ def _gtf(df: DataFrame, mapping: Dict[str, str]) -> DataFrame:
     outdf = _fill_missing(df, all_columns)
 
     if "attribute" in df.columns:
-        attribute = pd.Series([mapping["attribute"] + ' "' + df.attribute + '";'])
+        attribute = mapping["attribute"] + ' "' + df.attribute + '";'
     else:
         # gotten all needed columns, need to join the rest
         _rest = set(df.columns) - set(all_columns)
@@ -294,7 +294,7 @@ def _to_gff3(
         )
 
 
-def _gff3(df: DataFrame, mapping: Dict[str, str]) -> DataFrame:
+def _gff3(df, mapping) -> pd.DataFrame:
     pr_col2gff_col = {v: k for k, v in mapping.items()}
 
     df = df.rename(columns=pr_col2gff_col)  # copying here
@@ -307,15 +307,14 @@ def _gff3(df: DataFrame, mapping: Dict[str, str]) -> DataFrame:
     if "attribute" in mapping:
         attribute_name = mapping["attribute"]
         attribute_value = df.attribute.iloc[0]
-        attribute = pd.Series([f"{attribute_name}={attribute_value}"])
+        attribute = f"{attribute_name}={attribute_value}"
     else:
         # gotten all needed columns, need to join the rest
-        _rest = set(df.columns) - set(all_columns)
-        rest = sorted(_rest, key=columns.index)
-        rest_df = df[rest].copy()
+        rest = set(df.columns) - set(all_columns)
+        _rest = sorted(rest, key=columns.index)
+        rest_df = df.get(_rest).copy()
         total_cols = rest_df.shape[1]
-        for i, _c in enumerate(rest_df, 1):
-            c = str(_c)
+        for i, c in enumerate(rest_df, 1):
             col = rest_df[c]
             isnull = col.isnull()
             col = col.astype(str).str.replace("nan", "")
diff --git a/pyranges/pyranges_main.py b/pyranges/pyranges_main.py
index cbb70bd7..6708d20a 100644
--- a/pyranges/pyranges_main.py
+++ b/pyranges/pyranges_main.py
@@ -2558,18 +2558,12 @@ def length(self) -> int:
         assert isinstance(length, (np.int64, int))
         return int(length)
 
-    def lengths(
-        self
-    ) -> pd.Series:
+    def lengths(self) -> pd.Series:
         """Return the length of each interval.
 
         Parameters
         ----------
 
-        as_dict : bool, default False
-
-            Whether to return lengths as pd.Series or dict of pd.Series per key.
-
         Returns
         -------
-        pd.Series or dict of pd.Series with the lengths of each interval.
+        pd.Series with the lengths of each interval.
@@ -4283,11 +4277,6 @@ def subset(self, f: Callable, strand: Optional[bool] = None, **kwargs) -> "PyRan
             Whether to do operations on chromosome/strand pairs or chromosomes. If None, will use
             chromosome/strand pairs if the PyRanges is stranded.
 
-        nb_cpu : int, default 1
-
-            How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
-            Will only lead to speedups on large datasets.
-
         **kwargs
             Additional keyword arguments to pass as keyword arguments to `f`
 
@@ -5402,10 +5391,6 @@ def to_gtf(
 
             Which compression to use. Uses file extension to infer by default.
 
-        chain: bool, default False
-
-            Whether to return the PyRanges after writing.
-
         map_cols: dict, default None
 
             Override mapping between GTF and PyRanges fields for any number of columns.
diff --git a/pyranges/statistics.py b/pyranges/statistics.py
index e6d5c2aa..d7336779 100644
--- a/pyranges/statistics.py
+++ b/pyranges/statistics.py
@@ -2,18 +2,18 @@
 
 from collections import defaultdict
 from math import sqrt
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
+from numpy import ndarray
+from pandas.core.frame import DataFrame
+from pandas.core.series import Series
 
 import pyranges as pr
 from pyranges.methods.statistics import _relative_distance
 from pyranges.multithreaded import pyrange_apply
-from numpy import float64, int64, ndarray
-from pandas.core.frame import DataFrame
-from pandas.core.series import Series
 from pyranges.pyranges_main import PyRanges
-from typing import Dict, List, Optional, Union, Any
 
 __all__ = [
     "simes",
@@ -154,21 +154,27 @@ def fisher_exact(tp: Series, fp: Series, fn: Series, tn: Series, pseudocount: in
         )
         sys.exit(-1)
 
-    tp = pd.Series(np.array(tp, dtype=np.uint))
-    fp = pd.Series(np.array(fp, dtype=np.uint))
-    fn = pd.Series(np.array(fn, dtype=np.uint))
-    tn = pd.Series(np.array(tn, dtype=np.uint))
+    _tp = np.array(tp, dtype=np.uint)
+    _fp = np.array(fp, dtype=np.uint)
+    _fn = np.array(fn, dtype=np.uint)
+    _tn = np.array(tn, dtype=np.uint)
 
-    left, right, twosided = pvalue_npy(tp, fp, fn, tn)
+    left, right, twosided = pvalue_npy(_tp, _fp, _fn, _tn)
 
-    OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))
+    OR = ((_tp + pseudocount) / (_fp + pseudocount)) / ((_fn + pseudocount) / (_tn + pseudocount))
 
     df = pd.DataFrame({"OR": OR, "P": twosided, "PLeft": left, "PRight": right})
 
     return df
 
 
-def mcc(grs: List[PyRanges], genome: Optional[Union[pr.PyRanges, pd.DataFrame, Dict[str, int]]] = None, labels: Optional[str] = None, strand: bool = False, verbose: bool = False) -> DataFrame:
+def mcc(
+    grs: List["PyRanges"],
+    genome: Optional[Union["PyRanges", pd.DataFrame, Dict[str, int]]] = None,
+    labels: Optional[List[str]] = None,
+    strand: bool = False,
+    verbose: bool = False,
+) -> DataFrame:
     """Compute Matthew's correlation coefficient for PyRanges overlaps.
 
     Parameters
@@ -229,18 +235,11 @@ def mcc(grs: List[PyRanges], genome: Optional[Union[pr.PyRanges, pd.DataFrame, D
             for k, v in gr:
                 genome[k] = max(genome[k], v.End.max())
 
-
     if not isinstance(genome, dict):
         _genome = genome
         genome_length = int(_genome.End.sum())
     else:
-        _genome = pd.DataFrame(
-            {
-                "Chromosome": list(genome.keys()),
-                "Start": 0,
-                "End": list(genome.values())
-            }
-        )
+        _genome = pd.DataFrame({"Chromosome": list(genome.keys()), "Start": 0, "End": list(genome.values())})
         genome_length = sum(genome.values())
 
     if labels is None:
@@ -555,7 +554,7 @@ def rowbased_rankdata(data: ndarray) -> DataFrame:
     return final
 
 
-def simes(df: DataFrame, groupby: Union[str, List[str]], pcol: str, keep_position: bool = False) -> DataFrame:
+def simes(df, groupby, pcol, keep_position=False):
     """Apply Simes method for giving dependent events a p-value.
 
     Parameters
@@ -644,9 +643,9 @@ def simes(df: DataFrame, groupby: Union[str, List[str]], pcol: str, keep_positio
     sdf = df[positions + sorter].sort_values(sorter)
     g = sdf.groupby(positions + groupby)
 
-    ranks = pd.Series(g.cumcount().values) + 1
-    _size = np.array(g.size().values)
-    size = np.repeat(a=_size, repeats=_size)
+    ranks = g.cumcount().values + 1
+    size = g.size().values
+    size = np.repeat(size, size)
     multiplied = sdf[pcol].values * size
 
     simes = multiplied / ranks
@@ -692,10 +691,15 @@ class StatisticsMethods:
 
     Accessed with gr.stats."""
 
-    def __init__(self, pr: PyRanges) -> None:
+    def __init__(self, pr: "PyRanges") -> None:
         self.pr = pr
 
-    def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[str] = None) -> float64:
+    def forbes(
+        self,
+        other: "PyRanges",
+        chromsizes: Union["PyRanges", DataFrame, Dict[Any, int]],
+        strandedness: Optional[str] = None,
+    ) -> float:
         """Compute Forbes coefficient.
 
         Ratio which represents observed versus expected co-occurence.
@@ -734,7 +738,8 @@ def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[s
         >>> gr, gr2 = pr.data.chipseq(), pr.data.chipseq_background()
         >>> chromsizes = pr.data.chromsizes()
         >>> gr.stats.forbes(gr2, chromsizes=chromsizes)
-        1.7168314674978278"""
+        1.7168314674978278
+        """
 
         _chromsizes = chromsizes_as_int(chromsizes)
 
@@ -745,9 +750,7 @@ def forbes(self, other: PyRanges, chromsizes: PyRanges, strandedness: Optional[s
         reference_length = self.pr.merge(strand=strand).length
         query_length = other.merge(strand=strand).length
 
-        intersection_sum = sum(
-            v.sum() for v in self.pr.set_intersect(other, strandedness=strandedness).lengths()
-        )
+        intersection_sum = self.pr.set_intersect(other, strandedness=strandedness).lengths().sum()
 
         forbes = _chromsizes * intersection_sum / (reference_length * query_length)
 
@@ -796,11 +799,11 @@ def jaccard(self, other: PyRanges, **kwargs) -> float:
         kwargs = pr.pyranges_main.fill_kwargs(kwargs)
         strand = True if kwargs.get("strandedness") else False
 
-        intersection_sum = sum(v.sum() for v in self.pr.set_intersect(other).lengths())
+        intersection_sum = self.pr.set_intersect(other).lengths().sum()
 
         union_sum = 0
         for gr in [self.pr, other]:
-            union_sum += sum(v.sum() for v in gr.merge(strand=strand).lengths())
+            union_sum += gr.merge(strand=strand).lengths().sum()
 
         denominator = union_sum - intersection_sum
         if denominator == 0:
diff --git a/pyranges/subset.py b/pyranges/subset.py
index 11a9ba42..4ab1544c 100644
--- a/pyranges/subset.py
+++ b/pyranges/subset.py
@@ -1,12 +1,18 @@
+from typing import Any, Dict, List, Tuple, Union
+
 import pandas as pd
 from ncls import NCLS  # type: ignore
+from numpy import int64
+from pandas.core.frame import DataFrame
+
+from pyranges.pyranges_main import PyRanges
 
 
-def create_ncls(df):
+def create_ncls(df: DataFrame) -> NCLS:
     return NCLS(df.Start.values, df.End.values, df.index.values)
 
 
-def find_overlaps(df, start, end):
+def find_overlaps(df: DataFrame, start: int, end: Union[int64, int]) -> List[Union[int, Any]]:
     n = create_ncls(df)
 
     idxes = []
@@ -16,106 +22,107 @@ def find_overlaps(df, start, end):
     return idxes
 
 
-def get_slice(self, val):
+def get_slice(self: PyRanges, val: slice) -> Union[Dict[str, DataFrame], Dict[Tuple[str, str], DataFrame]]:
     # 100:999
 
-    d = {}
-
-    for k, df in self.items():
-        start = val.start or 0
-        stop = val.stop or max(df.End.max(), start)
-        idxes = find_overlaps(df, start, stop)
-        d[k] = df.reindex(idxes)
-
-    return d
+    if self.stranded:
+        sd = {}
+        for sk, sdf in self._dfs_with_strand.items():
+            start = val.start or 0
+            stop = val.stop or max(sdf.End.max(), start)
+            idxes = find_overlaps(sdf, start, stop)
+            sd[sk] = sdf.reindex(idxes)
+        return sd
+    else:
+        d = {}
+        for k, df in self._dfs_without_strand.items():
+            start = val.start or 0
+            stop = val.stop or max(df.End.max(), start)
+            idxes = find_overlaps(df, start, stop)
+            d[k] = df.reindex(idxes)
+        return d
 
 
-def get_string(self, val):
+def get_string(self: PyRanges, val: str) -> Union[Dict[Tuple[str, str], DataFrame], Dict[str, DataFrame]]:
     if val in self.chromosomes:
         if self.stranded:
-            return {k: self.dfs[k] for k in self.keys() if k[0] == val}
+            return {k: df for k, df in self._dfs_with_strand.items() if k[0] == val}
         else:
-            return {val: self.dfs[val]}
-
+            return {val: df for k, df in self._dfs_without_strand.items() if k == val}
     elif val in "+ -".split():
-        return {k: v for k, v in self.items() if k[1] == val}
+        return {k: v for k, v in self._dfs_with_strand.items() if k[1] == val}
     else:
-        return {}
-
-
-def get_tuple(self, val):
-    if len(val) == 2:
-        dfs = get_double(self, val)
-    elif len(val) == 3:
-        dfs = get_triple(self, val)
-
-    return dfs
+        d: Dict[str, DataFrame] = {}
+        return d
+
+
+def get_2_tuple(
+    self: PyRanges, first: str, second: Union[str, slice]
+) -> Union[Dict[str, DataFrame], Dict[Tuple[str, str], DataFrame]]:
+    if isinstance(first, str) and first in ("+", "-") and isinstance(second, slice):  # exact strand, not substring
+        return get_strand_and_slice(self, strand=first, loc=second)  # key shape: "+", 5:10
+    if isinstance(first, (int, str)) and isinstance(second, str):
+        return get_chromosome_and_strand(self, chromosome=first, strand=second)  # key shape: "chr1", "+"
+    if isinstance(first, (int, str)) and isinstance(second, slice):
+        return get_chromosome_and_slice(self, chromosome=first, loc=second)  # key shape: "chr1", 5:10
+    else:
+        raise TypeError(f"Incorrect types: {type(first)}, {type(second)}")
 
 
-def get_double(self, val):
-    if len(val) == 2 and val[0] in self.chromosomes and isinstance(val[1], slice):
-        chromosome, loc = val
+def get_chromosome_and_slice(
+    self: PyRanges, chromosome: str, loc: slice
+) -> Union[Dict[str, DataFrame], Dict[Tuple[str, str], DataFrame]]:
+    if chromosome in self.chromosomes:
         start = loc.start or 0
         if self.stranded:
-            dfs = {k: df for k, df in self.items() if k[0] == chromosome}
-            max_end = max([df.End.max() for df in dfs.values()])
+            dfs = [df for (c, _), df in self._dfs_with_strand.items() if c == chromosome]
         else:
-            dfs = {val[0]: self.dfs[val[0]]}
-            max_end = list(dfs.values())[0].End.max()
+            dfs = [df for c, df in self._dfs_without_strand.items() if c == chromosome]
+        max_end = max([df.End.max() for df in dfs])
 
         # in case 1:None
         stop = loc.stop or max(max_end, start)
 
-        dfs2 = {}
-        for k, df in dfs.items():
-            idxes = find_overlaps(df, start, stop)
-            if idxes:
-                dfs2[k] = df.loc[idxes]
+        out_dfs = [df.loc[find_overlaps(df, start, stop)] for df in dfs]  # .loc: row labels, not columns
+        return PyRanges(pd.concat(out_dfs)).dfs
+    return {}  # unknown chromosome: nothing to select (out_dfs would be unbound here)
 
-        return dfs2
 
+def get_strand_and_slice(self: PyRanges, strand: str, loc: slice) -> Dict[Tuple[str, str], DataFrame]:
     # "+", 5:10
-    if len(val) == 2 and val[0] in "+ -".split() and isinstance(val[1], slice):
-        strand, loc = val
-        start = loc.start or 0
+    start = loc.start or 0
 
-        dfs = {k: df for k, df in self.items() if k[1] == strand}
-        max_end = max([df.End.max() for df in dfs.values()])
+    dfs = [df for (c, s), df in self._dfs_with_strand.items() if s == strand]
+    max_end = max([df.End.max() for df in dfs])
 
-        stop = loc.stop or max(max_end, start)
+    stop = loc.stop or max(max_end, start)
 
-        dfs2 = {}
-        for k, df in dfs.items():
-            idxes = find_overlaps(df, start, stop)
-            if idxes:
-                dfs2[k] = df.loc[idxes]
+    out_dfs = [df.loc[find_overlaps(df, start, stop)] for df in dfs]  # .loc: row labels, not columns
 
-        return dfs2
+    return {k: v for k, v in PyRanges(pd.concat(out_dfs))._dfs_with_strand.items()}
 
-    # "chr1", "+"
-    if len(val) == 2 and val[1] in "+ -".split():
-        chromosome, strand = val
 
-        if (chromosome, strand) in self.dfs:
-            return {(chromosome, strand): self.dfs[chromosome, strand]}
-        else:
-            return {}
+# "chr1", "+"
+def get_chromosome_and_strand(
+    self: PyRanges, chromosome: Union[int, str], strand: str
+) -> Dict[Tuple[str, str], DataFrame]:
+    return {k: df for k, df in self._dfs_with_strand.items() if k == (chromosome, strand)}
 
 
-def get_triple(self, val):
+def get_chromosome_strand_loc(
+    self: PyRanges, chromosome: str, strand: str, loc: slice
+) -> Dict[Tuple[str, str], DataFrame]:
     # "chr1", "+", 5:10
-    chromosome, strand, loc = val
     start = loc.start or 0
 
     if strand not in "+ -".split():
-        raise Exception("Strand '{}' invalid.".format(val))
+        raise Exception("Strand '{}' invalid.".format(strand))
 
     r = self[chromosome, strand].values()
     if len(r):
         df = r[0]
     else:
-        df = pd.DataFrame(columns="Chromosome Start End".split())
-        return df
+        return {}
 
     max_end = df.End.max()