-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
203 lines (153 loc) · 7.26 KB
/
Copy pathutils.py
File metadata and controls
203 lines (153 loc) · 7.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import numpy as np
import pandas as pd
import quantipy as qp
def load_data(user_table, population_table):
    """Read the user and population CSV tables, indexed and sorted by ``cnty``.

    Args:
        user_table: Path or buffer passed to ``pandas.read_csv`` for the
            user-level table; must contain a ``cnty`` column.
        population_table: Path or buffer passed to ``pandas.read_csv`` for
            the population table; must contain a ``cnty`` column.

    Returns:
        Tuple ``(user_df, population_df)``. Each frame uses its ``cnty``
        column as the index and is sorted by that index.
    """
    # Both files are read (first row as header) before either is re-indexed.
    raw_frames = (
        pd.read_csv(user_table, header=0),
        pd.read_csv(population_table, header=0),
    )
    user_df, population_df = (
        frame.set_index('cnty').sort_index() for frame in raw_frames
    )
    return user_df, population_df
def apply_redistribution(user_table, demographics, redist_dem_bins, redist_dem_percents):
    """Linearly remap each demographic column from percentile bins to target bins.

    For every demographic, the cumulative ``redist_dem_percents`` are
    converted to percentiles of the observed column. The observed bin edges
    are ``[0] + percentiles``; they are paired, in order, with consecutive
    edges of ``redist_dem_bins`` and every value inside an observed bin is
    interpolated linearly onto the paired target bin.

    Args:
        user_table: DataFrame holding one column per demographic. It is
            modified in place.
        demographics: Iterable of column names to redistribute.
        redist_dem_bins: Mapping of demographic name to the sequence of
            target bin edges.
        redist_dem_percents: Mapping of demographic name to the fraction of
            values that belongs in each target bin.

    Returns:
        The same ``user_table`` object, with the demographic columns
        overwritten by their redistributed values.
    """
    # Upper bin edges are clamped to this value so an unbounded last edge
    # (e.g. infinity) still gives a finite interpolation.
    edge_cap = 1e9

    for dem in demographics:
        # Work from a snapshot of the column: masks and interpolation must
        # use the original values, not values already remapped by an
        # earlier bin of this same loop.
        original = user_table[dem].copy()
        redist_bins = redist_dem_bins[dem]

        cumulative_percents = np.cumsum(redist_dem_percents[dem]) * 100
        # Guard against float round-off pushing the last percentile past 100.
        cumulative_percents[-1] = 100
        bins = [0] + list(np.percentile(original, cumulative_percents))

        for redist_min_bin, redist_max_bin, min_bin, max_bin in zip(
                redist_bins, redist_bins[1:], bins, bins[1:]):
            redist_max_bin = min(redist_max_bin, edge_cap)
            max_bin = min(max_bin, edge_cap)
            mask = original.between(min_bin, max_bin)

            if max_bin == min_bin:
                # Zero-width source bin: every selected value equals min_bin,
                # so interpolation would be 0/0. Pin to the target lower edge.
                user_table.loc[mask, dem] = redist_min_bin
                continue

            # Only the demographic column is rescaled. Assigning a plain
            # array keeps the write positional, so duplicate index labels
            # cannot break alignment.
            source = original[mask].to_numpy()
            user_table.loc[mask, dem] = (
                (redist_max_bin - redist_min_bin) * (source - min_bin)
                / (max_bin - min_bin)
                + redist_min_bin
            )
    return user_table
def bin_demographics(user_table, demographics, user_dem_bins, dem_bins):
    """Replace each demographic column with the integer label of its bin.

    Args:
        user_table: DataFrame with one column per demographic; modified in
            place.
        demographics: Iterable of column names to discretise.
        user_dem_bins: Mapping of demographic name to the bin edges handed
            to ``pandas.cut``.
        dem_bins: Mapping of demographic name to a sequence whose entries,
            except the last one, are used as the bin labels.

    Returns:
        The same ``user_table`` object with the binned columns cast to int.
    """
    for column in demographics:
        edges = user_dem_bins[column]
        # One label per bin: every entry except the final one.
        bin_labels = dem_bins[column][:-1]
        categorised = pd.cut(
            user_table[column],
            bins=edges,
            labels=bin_labels,
            include_lowest=True,
        )
        user_table[column] = categorised.astype(int)
    return user_table
def collapse_bins(bin_boundaries, bin_counts, smoothing=0, min_bin_num=0, smooth_before_binning=False, population_percents=None):
    """Merge the smallest bin into a neighbour until every bin is large enough.

    While more than one bin remains and some count is below ``min_bin_num``,
    the bin with the smallest count (first one on ties) is merged into a
    neighbour: the only neighbour for an end bin, otherwise the neighbour
    with the smaller count (the right one on ties). The boundary shared by
    the two merged bins is removed.

    None of the inputs is modified; the function works on copies.

    Args:
        bin_boundaries: Sequence of bin edges (one more than ``bin_counts``).
        bin_counts: Sequence of per-bin counts.
        smoothing: Weight applied to ``population_percents`` when
            ``smooth_before_binning`` is true.
        min_bin_num: Minimum count every remaining bin must reach.
        smooth_before_binning: When true, ``smoothing * population_percents[i]``
            is added to count ``i`` before any merging.
        population_percents: Per-bin population shares, indexable by bin
            position. Only read when ``smooth_before_binning`` is true.

    Returns:
        Tuple ``(boundaries, counts)`` as new lists.

    Raises:
        IndexError: If smoothing is requested and ``population_percents``
            has fewer entries than ``bin_counts``.
    """
    boundaries = list(bin_boundaries)
    counts = list(bin_counts)

    if smooth_before_binning:
        percents = () if population_percents is None else population_percents
        counts = [count + smoothing * percents[idx] for idx, count in enumerate(counts)]

    while len(counts) > 1 and not all(count >= min_bin_num for count in counts):
        smallest = counts.index(min(counts))
        if smallest == 0:
            neighbour = 1
        elif smallest == len(counts) - 1:
            neighbour = smallest - 1
        elif counts[smallest - 1] >= counts[smallest + 1]:
            neighbour = smallest + 1
        else:
            neighbour = smallest - 1

        # Bin i spans boundaries[i]..boundaries[i + 1], so the edge shared by
        # two adjacent bins has the index of the right-hand bin.
        shared_edge = max(smallest, neighbour)

        counts[neighbour] += counts[smallest]
        del counts[smallest]
        del boundaries[shared_edge]

    return boundaries, counts
def get_population_percents(population_data, population_table_cols, user_dem_bins, bins):
    """Return the population share that falls into each of ``bins``.

    Args:
        population_data: Object indexable by a list of column names; the
            selection must support ``tolist()`` and ``numpy.sum``
            (presumably one row of the population table as a Series;
            verify against the caller).
        population_table_cols: Population column names, ordered so that
            position ``i`` corresponds to the bin starting at
            ``user_dem_bins[i]``.
        user_dem_bins: Full list of bin edges; must provide ``.index``.
        bins: Bin edges actually in use, each of them present in
            ``user_dem_bins``.

    Returns:
        ``numpy.array([1])`` when ``bins`` describes fewer than two bins;
        the raw two column values as a list for a two-column demographic;
        otherwise an array of per-bin column sums divided by their total,
        so the entries sum to 1.
    """
    if len(bins) < 3:
        return np.array([1])

    if len(population_table_cols) == 2:
        # binary demographic: values are returned as stored, not normalised
        return population_data[population_table_cols].tolist()

    # Each upper edge marks where the bin's column slice stops; the slice
    # starts where the previous bin stopped.
    stops = [user_dem_bins.index(edge) for edge in bins[1:]]
    starts = [0] + stops[:-1]
    totals = [
        np.sum(population_data[population_table_cols[lo:hi]])
        for lo, hi in zip(starts, stops)
    ]
    # normalise so the shares sum to 1
    return totals / np.sum(totals)
def get_bins(user_data, population_data, dem, population_table_cols, user_dem_bins, smoothing=0, min_bin_num=0, smooth_before_binning=False):
    """Compute bin edges plus user and population shares for one demographic.

    Args:
        user_data: Table indexable by ``dem`` that yields the user values.
        population_data: Population record forwarded to
            ``get_population_percents``.
        dem: Name of the demographic column in ``user_data``.
        population_table_cols: Population column names for this demographic.
        user_dem_bins: Initial bin edges for this demographic.
        smoothing: Smoothing weight forwarded to ``collapse_bins``.
        min_bin_num: Minimum user count per bin; bins are only collapsed on
            raw counts when this is greater than 0.
        smooth_before_binning: When true, ``collapse_bins`` is run a second
            time with smoothing enabled and the population shares are
            recomputed for the resulting bins.

    Returns:
        Tuple ``(bins, user_percents, population_percents)``.
    """
    values = user_data[dem]
    bins = user_dem_bins

    # Histogram of the user values over the initial edges; edges discarded.
    bins_counts = np.histogram(values, bins=bins)[0]
    if min_bin_num > 0:
        bins, bins_counts = collapse_bins(bins, bins_counts, smoothing, min_bin_num)

    # Share of user rows per bin.
    n_values = len(values)
    user_percents = np.array([count / n_values for count in bins_counts])

    population_percents = get_population_percents(
        population_data, population_table_cols, user_dem_bins, bins)
    if not smooth_before_binning:
        return bins, user_percents, population_percents

    # NOTE(review): this smoothed pass starts from bins that were already
    # collapsed on raw counts above, and user_percents is not recomputed if
    # the bins change here. Confirm this ordering is intended.
    bins, _ = collapse_bins(
        bins, bins_counts, smoothing, min_bin_num,
        smooth_before_binning, population_percents)
    population_percents = get_population_percents(
        population_data, population_table_cols, user_dem_bins, bins)
    return bins, user_percents, population_percents
def create_banded_dataset(user_data, population_data, demographics, smoothing, min_bin_num, smooth_before_binning, user_dem_bins, population_dem_cols):
    """Build a quantipy ``DataSet`` with one banded variable per demographic.

    Args:
        user_data: DataFrame containing ``user_id`` and every demographic
            column.
        population_data: Population record forwarded to ``get_bins``.
        demographics: List of demographic column names.
        smoothing: Forwarded to ``get_bins``.
        min_bin_num: Forwarded to ``get_bins``.
        smooth_before_binning: Forwarded to ``get_bins``.
        user_dem_bins: Mapping of demographic name to its bin edges.
        population_dem_cols: Mapping of demographic name to its population
            column names.

    Returns:
        Tuple ``(dataset, user_dem_percents, population_dem_percents)``. The
        two percent mappings are keyed by demographic and then by 1-based
        bin number.
    """
    dataset = qp.DataSet(name="_".join(demographics)+'_dataset', dimensions_comp=False)
    bands = {}
    user_dem_percents = {}
    population_dem_percents = {}

    for dem in demographics:
        bins, user_percents, population_percents = get_bins(
            user_data=user_data,
            population_data=population_data,
            dem=dem,
            population_table_cols=population_dem_cols[dem],
            user_dem_bins=user_dem_bins[dem],
            smoothing=smoothing,
            min_bin_num=min_bin_num,
            smooth_before_binning=smooth_before_binning,
        )

        # Each band is (lower edge, next edge - 1). The last band is then
        # overwritten so it ends on the final edge itself.
        ranges = [(lower, upper - 1) for lower, upper in zip(bins, bins[1:])]
        ranges[-1] = (bins[-2], bins[-1])
        bands[dem] = ranges

        # Bin numbers start at 1.
        user_dem_percents[dem] = dict(enumerate(user_percents, start=1))
        population_dem_percents[dem] = dict(enumerate(population_percents, start=1))

    dataset.from_components(user_data[['user_id'] + demographics])
    for variable, ranges in bands.items():
        dataset.band(variable, ranges)
    return dataset, user_dem_percents, population_dem_percents
def rakeonvar(df, dem, bin_marginals):
    """Rescale ``perc`` within each class of ``dem`` to sum to its marginal.

    Args:
        df: DataFrame with a ``perc`` column and a ``dem`` column; modified
            in place.
        dem: Name of the column whose classes are raked.
        bin_marginals: Mapping of class value to the target total of
            ``perc`` for the rows in that class.

    Returns:
        The same ``df`` object. Classes whose current ``perc`` total is 0
        (including classes with no rows) are left unchanged.
    """
    for bin_class, bin_perc in bin_marginals.items():
        in_class = df[dem] == bin_class
        current = df[in_class]['perc']
        total = sum(current)
        # A zero total cannot be rescaled; keep those weights as they are.
        rescaled = current if total == 0 else current * (bin_perc / total)
        df.loc[in_class, 'perc'] = rescaled
    return df
def rake(df, population_marginals, convcrit=0.01, max_iterations=1000):
    """Iteratively rake ``perc`` to the given marginals and return weight ratios.

    Each iteration calls ``rakeonvar`` once per demographic in
    ``population_marginals``. Iteration continues while the total absolute
    change in ``perc`` keeps shrinking to less than ``1 - convcrit`` times
    the previous iteration's change, up to ``max_iterations`` passes. At the
    end, ``perc`` is replaced by the raked value divided by its initial value.

    Args:
        df: DataFrame with a ``perc`` column and one column per demographic
            in ``population_marginals``; modified in place.
        population_marginals: Mapping of demographic name to a mapping of
            class value to target marginal.
        convcrit: Minimum relative improvement required to keep iterating.
        max_iterations: Upper bound on the number of raking passes.

    Returns:
        The same ``df`` object, with ``perc`` holding raked / initial weights.
    """
    pct_still = 1 - convcrit
    # Large sentinels so the first convergence check always passes.
    diff_error = 999999
    diff_error_old = 99999999999
    initial_weights = df['perc'].copy()

    # run the raking: for each iteration, rake over each key in population_marginals
    for _ in range(max_iterations):
        if not diff_error < pct_still * diff_error_old:
            break
        old_weights = df['perc'].copy()
        for dem, bin_marginals in population_marginals.items():
            rakeonvar(df, dem, bin_marginals)
        diff_error_old = diff_error
        diff_error = sum(abs(df['perc'] - old_weights))
    else:
        # Reached only when the loop ran out of iterations without a break.
        print('Convergence did not occur in %s iterations' % max_iterations)

    # Row-wise ratio of raked to initial weights. ``axis`` is passed by
    # keyword because the second positional argument of Series.div is
    # ``level``.
    df['perc'] = df['perc'].div(initial_weights, axis=0)
    return df