-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeature_normalize.py
More file actions
63 lines (50 loc) · 2.02 KB
/
Copy pathfeature_normalize.py
File metadata and controls
63 lines (50 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
from glob import glob

import numpy as np
from chemprop.data.scaler import StandardScaler
from chemprop.features import load_features
from tqdm import tqdm
def get_dist(dirpath):
    """Collect per-file feature statistics for every file matching a glob.

    Args:
        dirpath: Glob pattern (despite the name, it is handed straight to
            ``glob``) selecting the feature files to scan.

    Returns:
        Three parallel lists, ordered by sorted file path: the NaN-aware
        mean along axis 0 of each file, the NaN-aware standard deviation
        along axis 0 of each file, and each file's size along axis 0.
    """
    file_means, file_stds, file_counts = [], [], []

    for path in tqdm(sorted(glob(dirpath))):
        matrix = np.array(load_features(path)).astype(float)

        col_mean = np.nanmean(matrix, axis=0)
        col_std = np.nanstd(matrix, axis=0)

        # Undefined (NaN) means fall back to 0; undefined or zero spreads
        # fall back to 1.
        # NOTE(review): substituting 1 for a zero per-file std happens before
        # the statistics are pooled, so it feeds into the pooled value --
        # confirm this is intended.
        col_mean = np.where(np.isnan(col_mean), 0.0, col_mean)
        col_std = np.where(np.isnan(col_std) | (col_std == 0), 1.0, col_std)

        file_means.append(col_mean)
        file_stds.append(col_std)
        # The count is the full axis-0 length, NaN entries included.
        file_counts.append(matrix.shape[0])

    return file_means, file_stds, file_counts
def get_overall_dist(mean, std, count):
    """Pool per-group means and standard deviations into overall statistics.

    The overall mean is the count-weighted average of the group means. The
    overall variance is the count-weighted average of each group's variance
    plus the squared offset of its mean from the overall mean.

    Args:
        mean: Sequence of per-group mean arrays, all of the same shape.
        std: Sequence of per-group (population) standard deviation arrays,
            same shapes as ``mean``.
        count: Sequence of per-group sample counts.

    Returns:
        A ``(total_mean, total_std)`` tuple with the shape of one group entry.

    Raises:
        ValueError: If the inputs are empty, differ in length, or the total
            count is not positive.
    """
    if len(count) == 0:
        raise ValueError("get_overall_dist needs at least one group of statistics")
    if not (len(mean) == len(std) == len(count)):
        raise ValueError("mean, std and count must all have the same length")

    counts = np.asarray(count, dtype=float)
    total = counts.sum()
    if total <= 0:
        # Dividing by a zero total would silently yield NaN statistics.
        raise ValueError("total sample count must be positive")

    means = np.stack([np.asarray(m, dtype=float) for m in mean])
    stds = np.stack([np.asarray(s, dtype=float) for s in std])

    # Shape the counts so they broadcast across every feature dimension.
    weights = counts.reshape((-1,) + (1,) * (means.ndim - 1))

    total_mean = (weights * means).sum(axis=0) / total

    within = (weights * stds ** 2).sum(axis=0)
    between = (weights * (means - total_mean) ** 2).sum(axis=0)
    total_std = np.sqrt((within + between) / total)

    return total_mean, total_std
def normalize(dirpath, savepath, mean, std):
    """Standardize every feature file matching a glob and save the results.

    Each matching file is loaded with ``load_features``, transformed by a
    ``StandardScaler`` built from ``mean`` and ``std`` (with
    ``replace_nan_token=0``), and written with ``np.save`` to
    ``savepath + <input file's base name>``.

    Args:
        dirpath: Glob pattern selecting the feature files to transform.
        savepath: Prefix prepended to each input file's base name to form the
            output path; normally a directory path ending in a separator.
        mean: Means handed to ``StandardScaler``.
        std: Standard deviations handed to ``StandardScaler``.
    """
    sc = StandardScaler(means=mean, stds=std, replace_nan_token=0)

    # Create the destination directory up front so a missing folder does not
    # abort the run at the first save.
    out_dir = os.path.dirname(savepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Iterate the list directly (no enumerate) so tqdm can read its length
    # and display a total.
    for fname in tqdm(sorted(glob(dirpath))):
        # basename handles the platform's path separator, unlike split('/').
        name = os.path.basename(fname)
        feats = sc.transform(load_features(fname))
        np.save(savepath + name, feats)
if __name__ == "__main__":
    # Gather per-file statistics from the training feature files, pool them,
    # then apply the pooled statistics to the FDA feature file.
    per_file_mean, per_file_std, per_file_count = get_dist('./data/new_data_feats/train/df*.npy')
    overall_mean, overall_std = get_overall_dist(per_file_mean, per_file_std, per_file_count)
    normalize('./data/fda.npy', './data/new_data_norm_feats/test/', overall_mean, overall_std)
    # Alternative target kept from the original script: normalize every file
    # matching './data/new_data_feats/*.npy' into './data/new_data_norm_feats/'.