This repository was archived by the owner on Sep 23, 2022. It is now read-only.
forked from hacetin/KeyDev-PROMISE20
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_commenters.py
More file actions
143 lines (114 loc) · 4.36 KB
/
extract_commenters.py
File metadata and controls
143 lines (114 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from data_manager import DataManager, SlidingNotPossible
from preprocess import pig_author_mapping, hive_author_mapping, hadoop_author_mapping
from util import execute_db_query, sort_dict, highest_k, date_to_str
from collections import defaultdict
# Pig, Hive and Hadoop are all Apache projects tracked in the same Jira system,
# so their author mappings are merged into a single lookup table.
# Later entries override earlier ones when the same key appears more than once.
combined_author_mapping = {
    **pig_author_mapping,
    **hive_author_mapping,
    **hadoop_author_mapping,
    # The entries below do not come from Git names; they were found while
    # inspecting the comment data.
    "linte": "alexandre linte",
    "aniket mokashi": "aniket namadeo mokashi",
}
# Some comments are generated automatically (bots, CI and review tools).
# Commenters whose normalized (stripped, lower-cased) name is listed here
# are ignored.
ignored_commenters = {
    "asf github bot",
    "jiraposter@reviews.apache.org",
    "hbase review board",
    "hadoop qa",
    "hudson",
    "hive qa",
    "phabricator",
}
def generate_issue_to_commenters(project_name):
    """
    Build a mapping from issue ids to the commenters of those issues.

    Reads every (issue_id, display_name) row of the ``issue_comment`` table in
    ``data/<project_name>.sqlite3``. Each display name is stripped and
    lower-cased, dropped if it is in ``ignored_commenters``, and otherwise
    translated through ``combined_author_mapping`` before being recorded.
    A commenter is appended once per returned row, so the same name can occur
    several times for one issue.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    Returns
    --------
    defaultdict(list):
        Mapping from issue ids to the list of commenter names of the issue.
        Issues whose rows are all ignored get no entry.
    """
    query = (
        "\n"
        "SELECT issue_id, display_name\n"
        "FROM issue_comment\n"
    )
    rows = execute_db_query("data/{}.sqlite3".format(project_name), query)

    commenters_by_issue = defaultdict(list)
    for issue_id, display_name in rows:
        # Normalize the name so it can be compared with the ignore list
        # and the author mapping.
        name = display_name.strip().lower()

        # Keep only commenters that are not on the ignore list.
        if name not in ignored_commenters:
            # Use the canonical author name when a mapping exists.
            canonical_name = combined_author_mapping.get(name, name)
            commenters_by_issue[issue_id].append(canonical_name)

    return commenters_by_issue
def generate_date_to_top_commenters(project_name):
    """
    Generate a mapping from date to the commenters active in the sliding window
    ending that date, together with their comment counts.

    Large change sets are not excluded because the comments made to the issues
    related to the large change sets still exist.

    The counts are maintained incrementally: comments on issues of change sets
    entering the window are added, comments on issues of change sets leaving
    the window are subtracted, and a sorted snapshot is stored for every date
    the window stops at.

    Parameters
    ----------
    project_name (str):
        Name of the project

    Returns
    --------
    dict:
        Mapping from date to top commenters and their numbers of comments in the
        sliding window ending that date (as returned by ``sort_dict`` with
        ``by_value=True, reverse=True``).
    """
    commenters_by_issue = generate_issue_to_commenters(project_name)
    # NOTE(review): 365 is presumably the sliding window size in days -- confirm
    # against DataManager.
    manager = DataManager("data/{}_change_sets.json".format(project_name), 365)

    def commenters_of(change_sets):
        # Yield one commenter name per recorded comment on every issue
        # referenced by the given change sets.
        for change_set in change_sets:
            for issue_id in change_set.issues:
                yield from commenters_by_issue.get(issue_id, [])

    comment_counts = defaultdict(int)
    snapshots = {}

    # The first iteration only adds the initial window; nothing leaves yet.
    entering = manager.get_initial_window()
    leaving = {}

    while True:
        # Count the comments of the change sets entering the window.
        for name in commenters_of(entering):
            comment_counts[name] += 1

        # Discount the comments of the change sets leaving the window and
        # forget commenters that have no comments left.
        for name in commenters_of(leaving):
            comment_counts[name] -= 1
            if comment_counts[name] <= 0:
                del comment_counts[name]

        window_end = manager.get_last_included_date()
        snapshots[window_end] = sort_dict(
            comment_counts, by_value=True, reverse=True
        )

        # Slide the window; stop when it cannot move any further.
        try:
            entering, leaving = manager.forward_one_day()
        except SlidingNotPossible:
            break

    return snapshots
if __name__ == "__main__":
    # Extract the top 10 commenters of every date into one CSV file per project
    # so the results can be inspected by eye. Each line has the form
    # "<date>,<commenter 1>,...,<commenter k>".
    for project_name in ["pig", "hive", "hadoop"]:
        date_to_top_commenters = generate_date_to_top_commenters(project_name)

        # Collect the lines in a list instead of growing one string with "+=",
        # which re-copies the accumulated text on every iteration.
        lines = [
            date_to_str(date) + "," + ",".join(highest_k(counter, 10)) + "\n"
            for date, counter in date_to_top_commenters.items()
        ]

        with open(
            "data/{}_top_commenters.csv".format(project_name), "w", encoding="utf8"
        ) as f:
            f.writelines(lines)