MLMastery/29_imbalanced_classification.py at master · pknotfound/MLMastery · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Imbalanced Classification: A classification predictive modeling problem
# where the distribution of examples across the classes is not equal.

from numpy import where
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs

# cluster_std = the standard deviation of the clusters.
# centers = the number of classes (clusters) to generate; it is called
# "centers" because the samples of each class tend to gather around a center.
# Generate 1000 two-feature samples spread over 2 blobs (classes).
X, Y = make_blobs(n_samples=1000, centers=2, n_features=2, random_state=1, cluster_std=3)

# Draw one scatter series per class.
for class_value in range(2):
    # numpy.where(condition) returns a tuple holding one index array per
    # dimension; take element [0] so the rows of X are selected with a plain
    # 1-D index array (indexing with the tuple itself would add an extra axis).
    row_ix = where(Y == class_value)[0]

    plt.scatter(X[row_ix, 0], X[row_ix, 1])

plt.show()
# When n_samples is an integer (as above), make_blobs divides the samples
# equally among the classes, so the resulting dataset is balanced.


# make blobs with unequal class distribution
# create and plot synthetic dataset with a given class distribution
from numpy import unique
from numpy import hstack
from numpy import vstack
from numpy import where
from matplotlib import pyplot
from sklearn.datasets import make_blobs

# create a dataset with a given class distribution
def get_dataset(proportions):
    """Generate a two-feature blob dataset with a chosen number of examples per class.

    A balanced dataset is drawn first with ``make_blobs`` (every class gets as
    many examples as the largest requested class); each class is then cut
    down to the number of examples requested for it.

    Args:
        proportions: dict mapping each class label to the number of examples
            wanted for that class. The labels must be exactly
            ``0 .. n_classes - 1``, because the rows are selected by comparing
            these keys with the labels produced by ``make_blobs``.

    Returns:
        Tuple ``(X, y)``: the selected feature rows stacked vertically and the
        matching class labels, grouped class by class in the iteration order
        of ``proportions``.

    Raises:
        ValueError: if ``proportions`` is empty, or if its keys are not
            exactly ``0 .. n_classes - 1``.
    """
    if not proportions:
        raise ValueError("proportions must contain at least one class")
    # determine the number of classes
    n_classes = len(proportions)
    # A key outside 0..n_classes-1 would match no generated label and that
    # class would silently come back empty, so reject it up front.
    if set(proportions) != set(range(n_classes)):
        raise ValueError(
            "proportions keys must be the class labels 0..%d, got %r"
            % (n_classes - 1, list(proportions))
        )
    # generate enough examples for every class to cover the largest request
    largest = max(proportions.values())
    n_samples = largest * n_classes
    # create dataset
    X, y = make_blobs(n_samples=n_samples, centers=n_classes, n_features=2, random_state=1, cluster_std=3)
    # collect the examples: keep only the first `count` rows of each class
    X_list, y_list = [], []
    for label, count in proportions.items():
        row_ix = where(y == label)[0]
        selected = row_ix[:count]
        X_list.append(X[selected, :])
        y_list.append(y[selected])
    return vstack(X_list), hstack(y_list)

# scatter plot of dataset, different color for each class
def plot_dataset(X, y):
    """Show a scatter plot of the dataset with one labelled series per class.

    Args:
        X: 2-D array of samples; column 0 is drawn on the x-axis and
            column 1 on the y-axis.
        y: array of class labels, one per row of ``X`` (expected to be 1-D,
            since only the first index array returned by ``where`` is used).
    """
    # Iterate over the labels that are actually present rather than assuming
    # they are 0..n-1, so non-contiguous labels (e.g. 0 and 2) are all drawn.
    for class_value in unique(y):
        # get row indexes for samples with this class
        row_ix = where(y == class_value)[0]
        # create scatter of these samples
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(class_value))
    # show a legend
    pyplot.legend()
    # show the plot
    pyplot.show()

# Requested class sizes, keyed by class label: class 0 is the majority
# (10000 examples) and class 1 the minority (10 examples).
proportions = {
    0: 10000,
    1: 10,
}
# Build the dataset with that class distribution ...
X, y = get_dataset(proportions)
# ... and draw it, one series per class.
plot_dataset(X, y)