Automation_scripts/clustering.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# This script clusters various componenets of the data and then uses these
# clusters in downstream processes.

# To start, we will import required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as rfs
from sklearn.tree import export_graphviz
import pydot
from sklearn import metrics
from sklearn import tree
from dtreeviz.trees import *
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score)
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
import skopt


# In the first case, we will simply cluster the frequency column in order to
# determine the bin centers for frequency (heteroplasmy). The first step is
# simply reading in the data as a dataframe, and assigning a variable to the
# column of interest 'Frequency'
df = pd.read_csv('concat_data.csv')
x = df['Frequency']

# Python requires the column to be in the shape of (-1,1) when it's a single
# column that is being clustered or (1,-1) when it's a single sample. * need to
# read more on why *, which requires the data in the column to be converted
# from the current state, which is float, to a numpy array.
x = np.array(x)
x = x.reshape(-1,1)

# Determine the number for k * will need to follow up more thouroughly about
# various automation methods here. For now, went with the highest value for K
# possible for this dataset because that splits up the highly skewed data into
# individual clusters. This way there will be more bins, and  minor changes in
# heteroplasmy would be easier to detect.
K = range(1,17)

# Create the clusters, using the number of clusters as determined in the
# previous step, set an integer for random state so that each time this is run,
# the same cluster centers result. Finally, append the distortion list using
# the inertia from running the kmeans algorithm.
for k in K:
    kmeans = KMeans(n_clusters=k,random_state=0)
    kmeans.fit(x)

# Each time the algorithm is run, it spits out an unsorted list of cluster
# centers Furthermore, the clusters are floats. In the code below, if the
# clusters are not sorted, the following error appears (ValueError: Buffer has
# wrong number of dimensions (expected 1, got 2)), and if the cluster centers
# are not rounded here, not all datapoints will get their bin lables.
# Therefore, the cluster centers will both be sorted and rounded to
# the nearest integer here.
kmeans.cluster_centers_ = np.sort(kmeans.cluster_centers_,axis=None).round()

# The bins will now be created using the sorted/rounded kmeans clusters from
# above.
bins = kmeans.cluster_centers_

# The bin widths are calculated using the np.diff function. This width can be
# used downstream if we make a graph of the data.
width = np.diff(bins)

# The bin centers are defined by taking half of the sum of consecutive bins.
# The resulting bins are put into a new column in the dataframe called
# 'Binned_Heteroplasmy(%)'.
center = (bins[:-1] + bins[1:]) / 2
df['Binned_Heteroplasmy(%)'] = pd.cut(df['Frequency'], bins, labels=center)

# Now we will be moving into the machine learning portion of this work. The
# overall question to answer here is how well do animals cluster within groups
# (based on age, gender, and genotype), and how are they different from each
# other. The data in question, for now, will be the same dataframe df from
# above. ** will eventually use a different dataframe which will contain all
# the animals including the ones ommitted in the Concaten_clc_data.py script.
#**
df = pd.read_csv('df2.csv', usecols=['Reference Position', 'Type',
    'Length', 'Reference', 'Allele', 'Frequency', 'Strain', 'Gene_name',
    'Gene_type', 'Gene_element' ,'Gender'])

# Since the 'Reference' and 'Allele' columns represent what should be in the
# reference genome, and what is actually there respectively, they can be
# combined into a single column. This combined column called 'Mut' will be
# created and the original 'Reference' and 'Allele' columns will be dropped.
# This is done to make the dataset simpler and removing unnecessary columns.
df['Mut'] = df['Reference'] + df['Allele']
df.drop(['Reference', 'Allele'], axis=1, inplace=True)

# Convert the data to binary using pandas get_dummies. This converts
# categorical data (such as names of samples, mutations, etc) into numerical
# data without ordering them. It does so using one-hot encoding. This is
# acheived by creating columns for each and putting 0s in rows where something
# doesn't apply, and 1s in rows where things do apply.
features = pd.get_dummies(df)

# Create variables 'S', 'G', 'M', 'N', 'T', 'E', and 'Y' to represent Strain,
# Gender, Mut, Gene_name, ' Gene_type, Gene_element, and  Type respectively.
# This way, these can be generalized, and the new columns created for them from
# the get_dummies command above do not need to be called explictly individually.
S = [col for col in features if col.startswith('Strain')]
G = [col for col in features if col.startswith('Gender')]
M = [col for col in features if col.startswith('Mut')]
N = [col for col in features if col.startswith('Gene_name')]
T = [col for col in features if col.startswith('Gene_type')]
E = [col for col in features if col.startswith('Gene_element')]
Y = [col for col in features if col.startswith('Type')]

#print(features.columns)
# Labels are the values we want to predict. These are turned into an np array
# as scikit learn is built upon numpy arrays.
y = np.array(features[S])

# Remove the labels from the features so that the algorithm does not do math on
# it as these are the dependant variables.
x = features.drop(S, axis = 1)

# Saving feature names for later use
feature_list = list(x.columns)

# Convert to numpy array the entire features dataframe to a numpy array.
x = np.array(x)

# Split the data into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25, random_state = 42)

# Instantiate model with 1000 decision trees
#rf = rfs(n_estimators = 10000, random_state = 42)
rf = rfs(random_state = 42, n_jobs=-1, n_estimators= 10000, oob_score=True)

# Train the model on training data
rf.fit(train_x, train_y)
print('The oob score is:' , rf.oob_score_)

#viz = dtreeviz(rf, x, y, target_name = 'Strain', orientation='LR',
        #feature_names=feature_list)
#viz.view()
# Use the forest's predict method on the test data
y_pred = rf.predict(test_x)
#y_pred = pd.DataFrame(y_pred)
#test_y = pd.DataFrame(test_y)
y_pred +=1
test_y +=1
print('Mean Abs ERR' , metrics.mean_absolute_error(test_y, y_pred))
print('Mean Sq ERR' , metrics.mean_squared_error(test_y, y_pred))
print('Mean RMS ERR' , np.sqrt(metrics.mean_absolute_error(test_y, y_pred)))

# Calculate the absolute errors
errors = (abs(y_pred - test_y))
#errors = pd.DataFrame(errors)
#print(errors)
#errors +=1
#print(errors)
#test_labels = pd.DataFrame(test_labels)
#test_labels +=1
#print(test_labels)
#print(len(predictions))
#print(test_labels.mean())
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
#print(errors/test_y )
# Calculate mean absolute percentage error (MAPE)
mape = (100 * (errors) / test_y)
#print('mape',mape)
#print(test_y)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('the accuracy is:' , round(accuracy, 2), '%')
#print('Accuracy:', round(accuracy, 2), '%.')
#print(accuracy_score(test_features, predictions))

# Import tools needed for visualization

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list,
        rounded = True, precision = 1, class_names=S, filled=True)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
graph.write_png('tree.png')

# Optimize hyperparameters. Here we will use the Bayesian optimization from
# pierpaolo28.github.io/blog/blog25/
space = {'criterion': hp.choice('criterion',
['squared_error', 'absolute_error', 'poisson']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2',
            None]),
        'min_samples_leaf': hp.uniform ('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200])
        }

def objective(space):
    model = rfs(criterion = space['criterion'], max_depth = space['max_depth'],
            max_features = space['max_features'], min_samples_leaf =
            space['min_samples_leaf'], min_samples_split =
            space['min_samples_split'], n_estimators = space['n_estimators'])

    accuracy = cross_val_score(model, train_x, train_y, cv =4).mean()
    return{'loss': -accuracy, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space = space, algo = tpe.suggest, max_evals = 80,
        trials = trials)
crit = {0: 'squared_error',  1: 'absolute_error', 2: 'poisson'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200}
tff = rfs(criterion = crit[best['criterion']], max_depth = best['max_depth'],
        max_features = feat[best['max_features']], min_samples_leaf =
        best['min_samples_leaf'], min_samples_split =
        best['min_samples_split'], n_estimators =
        est[best['n_estimators']]).fit(train_x, train_y)

# Get numerical feature importances
importances = list(rf.feature_importances_)
importances1 = [i for i in importances if i >= 0.01]

# Limit depth of tree to 3 levels
rf_small = rfs(n_estimators=10, max_depth = 3)
rf_small.fit(train_x, train_y)

# Extract the small tree
tree_small = rf_small.estimators_[5]

# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names =
        feature_list, rounded = True, precision = 1)

(graph, ) = pydot.graph_from_dot_file('small_tree.dot')

graph.write_png('small_tree.png');

#overlap = list((set(feature_list).intersection(importances1)))
#print(overlap)
# List of tuples with variable and importance
feature_importances = [(x, round(importance, 2)) for x, importance
        in zip(feature_list, importances1)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse
        = True)

# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in
        feature_importances];

# New random forest with only the two most important variables
rf_most_important = rfs(n_estimators= 50000, random_state=42)

# Extract the two most important features
important_indices = [feature_list.index('Length')]
      #  feature_list.index('Length')]

train_important = train_x[:, important_indices]
test_important = test_x[:, important_indices]

# Train the random forest
rf_most_important.fit(train_important, train_y)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - test_y)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

#mape = (100 * (errors) / test_y)
mape = np.mean(100 * (errors / test_y))
accuracy = 100 - np.mean(mape)
accuracy1 = 100 - mape

print('the accuracy is:' , accuracy1)
# Set the style
plt.style.use('fivethirtyeight')

# list of x locations for plotting
x_values = list(range(len(importances)))

# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')

# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable'
' Importances')
plt.savefig('importance.png', bbox_inches="tight")

#print(Strain = features[:, feature_list.index('Strain_WT')])