Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Change Log

## 0.7.11
* Fix dependency issue: read the package version via `importlib.metadata` instead of `pkg_resources` ([#170](https://github.com/shakedzy/dython/issues/170))

## 0.7.10
* _Dython now officially supports only Python 3.10 or above_
* Fix a bug in `model_utils.metric_graph`
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.7.10
0.7.11
13 changes: 4 additions & 9 deletions dython/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
from . import nominal, model_utils, sampling, data_utils
from importlib.metadata import version
from ._private import set_is_jupyter

__all__ = ["__version__", "__dist_name__"]
__dist_name__ = "dython"
__version__ = version(__dist_name__)

def _get_version_from_setuptools():
from pkg_resources import get_distribution

return get_distribution("dython").version


__all__ = ["__version__"]
__version__ = _get_version_from_setuptools()
set_is_jupyter()
66 changes: 49 additions & 17 deletions dython/_private.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.typing import NDArray
from typing import Optional, Any, Tuple, Union, List, Literal
from .typing import Number, OneDimArray
from typing import Any, Literal, cast, overload, Type
from .typing import OneDimArray, TwoDimArray


IS_JUPYTER: bool = False


def set_is_jupyter(force_to: Optional[bool] = None) -> None:
def set_is_jupyter(force_to: bool | None = None) -> None:
global IS_JUPYTER
if force_to is not None:
IS_JUPYTER = force_to
Expand All @@ -27,33 +26,63 @@ def plot_or_not(plot: bool) -> None:
plt.close(fig)


@overload
def convert(
data: Union[List[Number], NDArray, pd.DataFrame],
to: Literal["array", "list", "dataframe"],
data: OneDimArray | TwoDimArray,
to: Type[np.ndarray],
copy: bool = True,
) -> Union[List[Number], NDArray, pd.DataFrame]:
) -> np.ndarray:
...

@overload
def convert(
data: OneDimArray | TwoDimArray,
to: Type[list],
copy: bool = True,
) -> list:
...

@overload
def convert(
data: OneDimArray | TwoDimArray,
to: Type[pd.DataFrame],
copy: bool = True,
) -> pd.DataFrame:
...

def convert(
data: OneDimArray | TwoDimArray,
to: Type[list | pd.DataFrame | np.ndarray],
copy: bool = True,
) -> list | pd.DataFrame | np.ndarray:

converted = None
if to == "array":

if to == np.ndarray:
if isinstance(data, np.ndarray):
converted = data.copy() if copy else data
elif isinstance(data, pd.Series):
converted = data.values
elif isinstance(data, list):
converted = np.array(data)
elif isinstance(data, pd.DataFrame):
converted = data.values() # type: ignore
elif to == "list":
converted = data.values
converted = cast(np.ndarray, converted)

elif to == list:
if isinstance(data, list):
converted = data.copy() if copy else data
elif isinstance(data, pd.Series):
converted = data.values.tolist()
elif isinstance(data, np.ndarray):
converted = data.tolist()
elif to == "dataframe":

elif to == pd.DataFrame:
if isinstance(data, pd.DataFrame):
converted = data.copy(deep=True) if copy else data
elif isinstance(data, np.ndarray):
converted = pd.DataFrame(data)

else:
raise ValueError("Unknown data conversion: {}".format(to))
if converted is None:
Expand All @@ -63,12 +92,14 @@ def convert(
)
)
else:
return converted # type: ignore
return converted


def remove_incomplete_samples(
x: Union[List[Any], OneDimArray], y: Union[List[Any], OneDimArray]
) -> Tuple[Union[List[Any], OneDimArray], Union[List[Any], OneDimArray]]:
x: OneDimArray,
y: OneDimArray,
) -> tuple[OneDimArray, OneDimArray]:

x = [v if v is not None else np.nan for v in x]
y = [v if v is not None else np.nan for v in y]
arr = np.array([x, y]).transpose()
Expand All @@ -80,10 +111,11 @@ def remove_incomplete_samples(


def replace_nan_with_value(
x: Union[List[Any], OneDimArray],
y: Union[List[Any], OneDimArray],
x: OneDimArray,
y: OneDimArray,
value: Any,
) -> Tuple[NDArray, NDArray]:
) -> tuple[np.ndarray, np.ndarray]:

x = np.array(
[v if v == v and v is not None else value for v in x]
) # NaN != NaN
Expand Down
40 changes: 21 additions & 19 deletions dython/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Optional, Tuple, List, Any, Union
from numpy.typing import NDArray
from .typing import Number, TwoDimArray
from matplotlib.axes._axes import Axes
from typing import Any, Sequence, cast
from .typing import TwoDimArray, OneDimArray
from ._private import convert, plot_or_not


Expand All @@ -16,9 +16,9 @@


def one_hot_encode(
array: Union[List[Union[Number, str]], NDArray],
classes: Optional[int] = None,
) -> NDArray:
array: OneDimArray,
classes: int | None = None,
) -> np.ndarray:
"""
One-hot encode a 1D array.
Based on this StackOverflow answer: https://stackoverflow.com/a/29831596/5863503
Expand All @@ -41,15 +41,16 @@ def one_hot_encode(
[1., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 1.]])
"""
arr: NDArray = convert(array, "array").astype(int) # type: ignore
arr = convert(array, np.ndarray).astype(int)
if not len(arr.shape) == 1:
raise ValueError(
f"array must have only one dimension, but has shape: {arr.shape}"
)
if arr.min() < 0:
raise ValueError("array cannot contain negative values")
classes = classes if classes is not None else arr.max() + 1
h = np.zeros((arr.size, classes)) # type: ignore
classes = cast(int, classes)
h = np.zeros((arr.size, classes))
h[np.arange(arr.size), arr] = 1
return h

Expand All @@ -58,14 +59,14 @@ def split_hist(
dataset: pd.DataFrame,
values: str,
split_by: str,
title: Optional[str] = "",
xlabel: Optional[str] = "",
ylabel: Optional[str] = None,
figsize: Optional[Tuple[int, int]] = None,
legend: Optional[str] = "best",
title: str | None = "",
xlabel: str | None = "",
ylabel: str | None = None,
figsize: tuple[int, int] | None = None,
legend: str | None = "best",
plot: bool = True,
**hist_kwargs,
) -> plt.Axes:
) -> Axes:
"""
Plot a histogram of values from a given dataset, split by the values of a chosen column

Expand Down Expand Up @@ -125,8 +126,9 @@ def split_hist(


def identify_columns_by_type(
dataset: TwoDimArray, include: List[str]
) -> List[Any]:
dataset: TwoDimArray,
include: Sequence[str]
) -> list[Any]:
"""
Given a dataset, identify columns of the types requested.

Expand All @@ -147,8 +149,8 @@ def identify_columns_by_type(
['col2', 'col3']

"""
df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore
columns = list(df.select_dtypes(include=include).columns)
df = convert(dataset, pd.DataFrame)
columns = list(df.select_dtypes(include=include).columns) # pyright: ignore[reportCallIssue, reportArgumentType]
return columns


Expand All @@ -173,7 +175,7 @@ def identify_columns_with_na(dataset: TwoDimArray) -> pd.DataFrame:
1 col2 2
0 col1 1
"""
df: pd.DataFrame = convert(dataset, "dataframe") # type: ignore
df = convert(dataset, pd.DataFrame)
na_count = [sum(df[cc].isnull()) for cc in df.columns]
return (
pd.DataFrame({"column": df.columns, "na_count": na_count})
Expand Down
22 changes: 11 additions & 11 deletions dython/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def roc_graph_example():

# Load data
iris = datasets.load_iris()
X = iris.data
y = label_binarize(iris.target, classes=[0, 1, 2])
X = iris.data # pyright: ignore[reportAttributeAccessIssue]
y = label_binarize(iris.target, classes=[0, 1, 2]) # pyright: ignore[reportAttributeAccessIssue]

# Add noisy features
random_state = np.random.RandomState(4)
Expand All @@ -43,7 +43,7 @@ def roc_graph_example():

# Plot ROC graphs
return metric_graph(
y_test, y_score, "roc", class_names_list=iris.target_names
y_test, y_score, "roc", class_names_list=iris.target_names # pyright: ignore[reportAttributeAccessIssue, reportCallIssue]
)


Expand All @@ -55,8 +55,8 @@ def pr_graph_example():

# Load data
iris = datasets.load_iris()
X = iris.data
y = label_binarize(iris.target, classes=[0, 1, 2])
X = iris.data # pyright: ignore[reportAttributeAccessIssue]
y = label_binarize(iris.target, classes=[0, 1, 2]) # pyright: ignore[reportAttributeAccessIssue]

# Add noisy features
random_state = np.random.RandomState(4)
Expand All @@ -76,7 +76,7 @@ def pr_graph_example():

# Plot PR graphs
return metric_graph(
y_test, y_score, "pr", class_names_list=iris.target_names
y_test, y_score, "pr", class_names_list=iris.target_names # pyright: ignore[reportAttributeAccessIssue, reportCallIssue]
)


Expand All @@ -91,10 +91,10 @@ def associations_iris_example():

# Convert int classes to strings to allow associations method
# to automatically recognize categorical columns
target = ["C{}".format(i) for i in iris.target]
target = ["C{}".format(i) for i in iris.target] # pyright: ignore[reportAttributeAccessIssue]

# Prepare data
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
X = pd.DataFrame(data=iris.data, columns=iris.feature_names) # pyright: ignore[reportAttributeAccessIssue]
y = pd.DataFrame(data=target, columns=["target"])
df = pd.concat([X, y], axis=1)

Expand Down Expand Up @@ -151,8 +151,8 @@ def split_hist_example():

# Load data and convert to DataFrame
data = datasets.load_breast_cancer()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df["malignant"] = [not bool(x) for x in data.target]
df = pd.DataFrame(data=data.data, columns=data.feature_names) # pyright: ignore[reportAttributeAccessIssue]
df["malignant"] = [not bool(x) for x in data.target] # pyright: ignore[reportAttributeAccessIssue]

# Plot histogram
return split_hist(df, "mean radius", "malignant", bins=20, figsize=(15, 7))
Expand All @@ -167,7 +167,7 @@ def ks_abc_example():
# Load and split data
data = datasets.load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
data.data, data.target, test_size=0.5, random_state=0
data.data, data.target, test_size=0.5, random_state=0 # pyright: ignore[reportAttributeAccessIssue]
)

# Train model and predict
Expand Down
Loading