KeyDev-PROMISE20/extract_commenters.py at master · bilsengroup/KeyDev-PROMISE20 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from data_manager import DataManager, SlidingNotPossible
from preprocess import pig_author_mapping, hive_author_mapping, hadoop_author_mapping
from util import execute_db_query, sort_dict, highest_k, date_to_str
from collections import defaultdict

# Since all 3 projects belong to the Apache community and all of them use the same
# Jira system, the author mappings of all projects are combined into one.
# Start from a copy of the pig mapping so the imported dict is not mutated,
# then layer the other projects on top (later mappings win on key clashes).
combined_author_mapping = dict(pig_author_mapping)
combined_author_mapping.update(hive_author_mapping)
combined_author_mapping.update(hadoop_author_mapping)
# The entries below do not come from Git names; they were found while
# inspecting the comment data.
combined_author_mapping.update(
    {"linte": "alexandre linte", "aniket mokashi": "aniket namadeo mokashi"}
)


# Some comments are generated automatically.
# The following commenters are therefore ignored.
# Names are stored lower-cased because commenter names are lower-cased
# before they are checked against this set.
ignored_commenters = {
    "asf github bot",
    "jiraposter@reviews.apache.org",
    "hbase review board",
    "hadoop qa",
    "hudson",
    "hive qa",
    "phabricator",
}


def generate_issue_to_commenters(project_name):
    """
    Collect, for every issue, the names of the people who commented on it.

    Every row of the ``issue_comment`` table contributes one entry, so a person
    who commented several times on an issue appears several times in its list.
    Names are stripped and lower-cased, commenters listed in
    ``ignored_commenters`` are dropped, and the remaining names are normalized
    through ``combined_author_mapping``.

    Parameters
    ----------
    project_name (str):
        Name of the project; the database is read from
        ``data/<project_name>.sqlite3``.

    Returns
    --------
    dict:
        ``defaultdict(list)`` mapping issue ids to the list of commenter names.
    """

    db_path = "data/{}.sqlite3".format(project_name)
    rows = execute_db_query(
        db_path,
        """
        SELECT issue_id, display_name
        FROM issue_comment
        """,
    )

    commenters_by_issue = defaultdict(list)
    for issue_id, display_name in rows:
        # Normalize before any lookup: both the ignore set and the author
        # mapping are keyed by lower-case names.
        name = display_name.strip().lower()

        if name not in ignored_commenters:
            # Fall back to the name itself when no canonical mapping exists.
            canonical_name = combined_author_mapping.get(name, name)
            commenters_by_issue[issue_id].append(canonical_name)

    return commenters_by_issue


def generate_date_to_top_commenters(project_name):
    """
    Generate a mapping from date to the comment counts of the sliding window
    ending on that date.

    A commenter's count is the number of comments they made on issues referenced
    by the change sets currently inside the window. Large change sets are not
    excluded because the comments made to the issues related to the large change
    sets still exist.

    Parameters
    ----------
    project_name (str):
        Name of the project; change sets are read from
        ``data/<project_name>_change_sets.json``.

    Returns
    --------
    dict:
        Mapping from date to the result of ``sort_dict`` over the commenter
        counts of the window ending on that date (presumably ordered by count,
        highest first -- see ``sort_dict``).
    """

    commenters_by_issue = generate_issue_to_commenters(project_name)

    # NOTE(review): 365 is presumably the window length in days -- confirm
    # against DataManager.
    window = DataManager("data/{}_change_sets.json".format(project_name), 365)

    def commenters_of(change_sets):
        # Yield one name per comment on every issue touched by the change sets.
        for change_set in change_sets:
            for issue_id in change_set.issues:
                yield from commenters_by_issue.get(issue_id, [])

    # The first iteration only adds the initial window; nothing leaves yet.
    entering = window.get_initial_window()
    leaving = ()
    comment_counts = defaultdict(int)

    counts_by_date = {}
    while True:
        # Count comments of change sets that entered the window.
        for name in commenters_of(entering):
            comment_counts[name] += 1

        # Discount comments of change sets that left the window and drop
        # commenters that have no comment left inside it.
        for name in commenters_of(leaving):
            comment_counts[name] -= 1
            if comment_counts[name] <= 0:
                del comment_counts[name]

        window_end = window.get_last_included_date()
        counts_by_date[window_end] = sort_dict(
            comment_counts, by_value=True, reverse=True
        )

        try:
            entering, leaving = window.forward_one_day()
        except SlidingNotPossible:
            # The window reached the end of the data.
            return counts_by_date


if __name__ == "__main__":
    # Extract the top 10 commenters of every window into CSV files so that
    # they can be inspected manually.
    for project_name in ["pig", "hive", "hadoop"]:
        date_to_top_commenters = generate_date_to_top_commenters(project_name)

        # Build all rows up front (one "<date>,<name>,<name>,..." line per
        # window) instead of growing a single string with "+=", which is
        # quadratic in the worst case. Building before opening also keeps an
        # existing file untouched if anything fails while formatting.
        lines = [
            date_to_str(date) + "," + ",".join(highest_k(counter, 10)) + "\n"
            for date, counter in date_to_top_commenters.items()
        ]

        with open(
            "data/{}_top_commenters.csv".format(project_name), "w", encoding="utf8"
        ) as f:
            f.writelines(lines)