-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplotpubltype.py
More file actions
78 lines (70 loc) · 3.05 KB
/
Copy pathplotpubltype.py
File metadata and controls
78 lines (70 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import argparse
import pandas as pd
pd.set_option('display.max_rows',200)
import numpy as np
import matplotlib.pyplot as plt
def _build_parser():
    """Return the argument parser for the publication-type plotting script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('inputfile',
                        help='delimited text file with year, publ_type and n columns')
    parser.add_argument('-l', '--limit', type=int, default=None,
                        help='read at most this many rows from inputfile')
    parser.add_argument('--quiet', action='store_true',
                        help='do not print the intermediate tables')
    parser.add_argument('--plot-percentage', action='store_true',
                        help='plot each year as percentages instead of counts')
    parser.add_argument('-d', '--delimiter', type=str, default='\t',
                        help='field delimiter of inputfile (default: tab)')
    parser.add_argument('outputfile',
                        help='path the figure is saved to')
    return parser


def _pivot_counts(df):
    """Return a year x publication-type table holding the summed ``n`` values.

    Missing ``publ_type`` values are labelled 'n/a'.  Year/type combinations
    that do not occur in ``df`` are reported as 0.  ``df`` is not modified.
    """
    # rename() returns a new frame, so the caller's data is left untouched.
    counts = df.rename(columns={'publ_type': 'Publication type'})
    # Assign the filled column back explicitly: a chained
    # ``df.col.fillna(..., inplace=True)`` is deprecated and does nothing
    # under pandas Copy-on-Write.
    counts['Publication type'] = (
        counts['Publication type'].fillna('n/a').astype(str)
    )
    totals = counts.groupby(['year', 'Publication type'])['n'].sum().reset_index()
    # Each (year, type) pair is unique after the groupby, so the default
    # aggregation of pivot_table leaves the summed values unchanged.
    return totals.pivot_table(index='year', columns='Publication type',
                              values='n', fill_value=0)


def _to_percentages(table):
    """Return ``table`` with every row rescaled so that it sums to 100.

    Rows whose total is zero become NaN.
    """
    return table.div(table.sum(axis=1), axis=0) * 100


def main(argv=None):
    """Plot nomenclatural acts per year, stacked by publication type.

    Reads the delimited input file, sums ``n`` per year and publication
    type, keeps the 'book' and 'serial' categories, optionally converts the
    yearly values to percentages, and saves a stacked bar chart to the
    output file.

    Args:
        argv: Optional list of command-line arguments.  When None (the
            default) argparse reads them from ``sys.argv``.
    """
    args = _build_parser().parse_args(argv)

    ###########################################################################
    # 1. Read data files
    ###########################################################################
    df = pd.read_csv(args.inputfile, sep=args.delimiter, nrows=args.limit)
    df = df.replace({np.nan: None})
    if not args.quiet:
        print('Read {} of {} grouped IPNI name rows'.format(args.inputfile, len(df)))

    ###########################################################################
    # 2. Preparation
    ###########################################################################
    # 2.1 - 2.3 Label NULL publication types, rename the column and reshape
    # to one row per year and one column per publication type.
    dfg = _pivot_counts(df)
    if not args.quiet:
        print(dfg)

    # Only these categories are plotted.  reindex() (unlike dfg[cats]) does
    # not raise KeyError when a category is absent from the input; the
    # missing column is created and filled with zeros.
    cats = ['book', 'serial']
    dfg = dfg.reindex(columns=cats, fill_value=0)
    if not args.quiet:
        print(dfg)

    # 2.4 Convert to a percentage data structure
    if args.plot_percentage:
        dfg = _to_percentages(dfg)

    ###########################################################################
    # 3. Plot and save figure to outputfile
    ###########################################################################
    colour_mapper = {'book': '#79be78', 'serial': '#c5c5c5', 'n/a': '#ffffff'}
    colours = [colour_mapper[cat] for cat in cats]
    dfg.plot(kind='bar', stacked=True, linewidth=1, edgecolor='k', color=colours)
    # The legend is created exactly once per mode: a second plt.legend()
    # call would replace the first and silently drop its title.
    if args.plot_percentage:
        plt.ylim((0, 100))
        plt.ylabel("Percentage of nomenclatural acts")
        plt.legend(title='Publication type', bbox_to_anchor=(1.0, 1.0))
    else:
        plt.ylim((0, 12000))
        plt.ylabel("Number of nomenclatural acts")
        plt.legend(title='Publication type', loc='upper right')
    plt.title("Numbers of nomenclatural acts by publication type")
    plt.xlabel("Year")
    plt.tight_layout()
    plt.savefig(args.outputfile, dpi=300)
# Script entry point: run the plotting routine only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()