-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path29_imbalanced_classification.py
More file actions
71 lines (61 loc) · 2.38 KB
/
29_imbalanced_classification.py
File metadata and controls
71 lines (61 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Imbalanced Classification: A classification predictive modeling problem
# where the distribution of examples across the classes is not equal.
from numpy import where
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
# Generate a balanced two-class dataset and plot each class in turn.
# make_blobs arguments used below:
#   centers     - number of clusters (one per class) to generate; the samples
#                 of a class tend to gather close to that class's center.
#   cluster_std - standard deviation of the clusters.
X, Y = make_blobs(n_samples=1000, centers=2, n_features=2, random_state=1, cluster_std=3)
for class_value in range(2):
    # numpy.where() called with only a condition returns a tuple holding one
    # index array per dimension. Take element 0 so row_ix is a plain 1-D array
    # of row indexes and the column selections below stay 1-D.
    row_ix = where(Y == class_value)[0]
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# Display the figure once both classes have been drawn.
plt.show()
# When n_samples is an integer, make_blobs divides the samples equally among the classes.
# The code below builds blobs with an unequal class distribution instead:
# it creates and plots a synthetic dataset with a given class distribution.
from numpy import unique
from numpy import hstack
from numpy import vstack
from numpy import where
from matplotlib import pyplot
from sklearn.datasets import make_blobs
# create a dataset with a given class distribution
def get_dataset(proportions):
    """Create a two-feature blob dataset with a chosen number of examples per class.

    A balanced dataset is generated with make_blobs, holding as many examples
    per class as the largest requested count, and then each class is trimmed
    to its requested size.

    Args:
        proportions: Mapping of class label to the number of examples wanted
            for that class. The labels must be exactly 0 .. n_classes - 1,
            because those are the labels make_blobs assigns to its centers.

    Returns:
        A tuple ``(X, y)``: the selected feature rows stacked vertically and
        the matching class labels, grouped by class in the iteration order of
        ``proportions``.

    Raises:
        ValueError: If ``proportions`` is empty, or if its keys are not
            exactly the labels 0 .. n_classes - 1.
    """
    if not proportions:
        raise ValueError("proportions must define at least one class")
    # determine the number of classes
    n_classes = len(proportions)
    # A key that is not one of the generated labels would match no rows and
    # the class would silently vanish from the result, so reject it up front.
    if set(proportions) != set(range(n_classes)):
        raise ValueError(
            "proportions keys must be exactly the class labels 0..%d, got %r"
            % (n_classes - 1, sorted(proportions, key=repr))
        )
    # Generate enough examples that every class can supply the largest
    # requested count (an integer n_samples is split equally among centers).
    largest = max(proportions.values())
    n_samples = largest * n_classes
    # create dataset
    X, y = make_blobs(n_samples=n_samples, centers=n_classes, n_features=2, random_state=1, cluster_std=3)
    # collect the requested number of examples for each class
    X_list, y_list = [], []
    for label, count in proportions.items():
        row_ix = where(y == label)[0]
        selected = row_ix[:count]
        X_list.append(X[selected, :])
        y_list.append(y[selected])
    return vstack(X_list), hstack(y_list)
# scatter plot of dataset, different color for each class
def plot_dataset(X, y):
    """Show a scatter plot of the first two feature columns, one series per class.

    Args:
        X: Feature array; column 0 is plotted on the x-axis and column 1 on
            the y-axis.
        y: Array of class labels, one per row of ``X``.

    Each distinct label in ``y`` is drawn with its own ``pyplot.scatter`` call
    and labelled with ``str(label)`` in the legend. The figure is then
    displayed with ``pyplot.show()``.
    """
    # Iterate over the labels actually present rather than range(n_classes),
    # so non-contiguous labels (e.g. 0 and 2) are all plotted and no empty
    # series is added for a label that does not occur.
    for class_value in unique(y):
        # get row indexes for samples with this class
        row_ix = where(y == class_value)[0]
        # create scatter of these samples
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(class_value))
    # show a legend
    pyplot.legend()
    # show the plot
    pyplot.show()
# Requested class distribution: 10000 examples of class 0 and 10 of class 1.
proportions = {
    0: 10000,
    1: 10,
}
# Build the imbalanced dataset from that distribution, then plot it.
X, y = get_dataset(proportions)
plot_dataset(X, y)