# java-meta-tracker/collect_workflow_metrics.py at main · jaydeluca/java-meta-tracker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
"""
Collects workflow run duration metrics from GitHub.

This script tracks workflow run durations using deduplication to prevent
double-counting when using overlapping lookback windows. This allows catching
builds that take longer than the collection interval to complete.
"""

import os
import time
from datetime import datetime, timedelta
from github import Github, Auth

from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider, Histogram
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader, AggregationTemporality
from opentelemetry.sdk.metrics.view import View
from opentelemetry.sdk.metrics._internal.aggregation import ExplicitBucketHistogramAggregation
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource

from workflow_state import load_processed_runs, save_processed_runs, get_state_file_path


# OpenTelemetry setup: resource -> histogram views -> OTLP reader -> meter provider

# Explicit bucket boundaries for the two duration histograms (both record minutes).
_WORKFLOW_DURATION_BOUNDARIES = (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 90, 120, 180)
_JOB_DURATION_BOUNDARIES = (1, 2, 3, 5, 7, 10, 15, 20, 25, 30, 40, 50, 60, 90, 120)

resource = Resource.create({"service.name": "github-workflow-metrics"})

# Each view binds one instrument name to its explicit-bucket aggregation.
workflow_duration_view = View(
    instrument_name="workflow.run.duration",
    aggregation=ExplicitBucketHistogramAggregation(boundaries=_WORKFLOW_DURATION_BOUNDARIES),
)
job_duration_view = View(
    instrument_name="workflow.job.duration",
    aggregation=ExplicitBucketHistogramAggregation(boundaries=_JOB_DURATION_BOUNDARIES),
)

# Histograms are exported with cumulative temporality (for Prometheus/Mimir in Grafana).
otlp_exporter = OTLPMetricExporter(
    preferred_temporality={Histogram: AggregationTemporality.CUMULATIVE},
)
# Periodic export every 5000 ms.
otlp_reader = PeriodicExportingMetricReader(otlp_exporter, export_interval_millis=5000)

provider = MeterProvider(
    resource=resource,
    metric_readers=[otlp_reader],
    views=[workflow_duration_view, job_duration_view],
)
metrics.set_meter_provider(provider)

meter = metrics.get_meter("github.workflow.metrics.meter")

# Instruments; their names must match the view instrument names above.
workflow_duration_histogram = meter.create_histogram(
    name="workflow.run.duration",
    description="Duration of workflow runs in minutes",
    unit="minutes",
)
job_duration_histogram = meter.create_histogram(
    name="workflow.job.duration",
    description="Duration of individual workflow jobs in minutes",
    unit="minutes",
)


def fetch_job_metrics(run, base_attributes: dict):
    """
    Fetches and records metrics for individual jobs within a workflow run.

    Every job whose status is "completed" and that has both a start and a
    completion timestamp is recorded in ``job_duration_histogram`` (in minutes).
    The recorded attributes are a copy of ``base_attributes`` extended with
    ``job_name`` and ``job_conclusion``; ``base_attributes`` itself is not
    modified.

    Errors are handled best-effort: a warning is printed and the remaining
    jobs of the run are not processed.

    Args:
        run: WorkflowRun object from GitHub API
        base_attributes: Base attributes to include with each job metric (repo, workflow, event, etc.)

    Returns:
        The number of job durations that were recorded. Jobs recorded before
        an error occurred are still included in the count, because their
        metrics have already been emitted.
    """
    # Initialised outside the try so the except path can report the real count.
    jobs_recorded = 0

    try:
        for job in run.jobs():
            # Skip jobs that haven't completed
            if job.status != "completed":
                continue

            # A duration can only be computed when both timestamps are present
            if not (job.started_at and job.completed_at):
                continue

            duration_seconds = (job.completed_at - job.started_at).total_seconds()
            duration_minutes = duration_seconds / 60

            # Per-job attributes; copy so the caller's dict stays untouched
            job_attributes = base_attributes.copy()
            job_attributes["job_name"] = job.name
            job_attributes["job_conclusion"] = job.conclusion or "unknown"

            job_duration_histogram.record(duration_minutes, job_attributes)
            jobs_recorded += 1

    except Exception as e:
        print(f"    Warning: Could not fetch jobs for run {run.id}: {e}")

    return jobs_recorded


# PR whose builds are tagged with is_build_test="true" in the recorded attributes.
_BUILD_TEST_PR_NUMBER = 15213


def _is_build_workflow(workflow):
    """Return True for the "Build" and "Build pull request" workflows (matched by name or path)."""
    return (
        workflow.name == "Build"
        or "build.yml" in workflow.path
        or workflow.name == "Build pull request"
        or "build-pull-request.yml" in workflow.path
    )


def _classify_run(run):
    """Return ``(build_type, is_test_pr)`` for a tracked run, or ``None`` if the run is not tracked."""
    if run.event == "pull_request":
        # Track all PR builds, but mark the build-test PR specially
        is_test_pr = False
        try:
            if hasattr(run, 'pull_requests') and len(run.pull_requests) > 0:
                is_test_pr = run.pull_requests[0].number == _BUILD_TEST_PR_NUMBER
        except Exception:
            # Deliberately best-effort: if the PR number cannot be determined,
            # the run is still tracked as a regular PR build.
            pass
        return "pr", is_test_pr

    if run.event == "push" and run.head_branch == "main":
        # Main branch builds (merged PRs)
        return "main", False

    # All other events/branches (e.g., release branches, scheduled runs)
    return None


def fetch_workflow_run_metrics(github_client: Github, lookback_hours: int = 3):
    """
    Fetches workflow run duration metrics for main branch builds and PR builds.
    Uses deduplication to avoid counting the same run multiple times.

    For every new, completed, non-cancelled run of the build workflows that is
    either a pull_request build or a push to main, the run duration (minutes)
    is recorded in ``workflow_duration_histogram`` and the run's job durations
    are recorded via ``fetch_job_metrics``. The ids of recorded runs are added
    to the persisted processed-run state.

    The lookback window is computed in UTC and sent with an explicit UTC
    designator, so it does not depend on the local timezone of the host.

    Errors are reported with ``print`` rather than raised. If an error
    interrupts processing after some runs were already recorded, those run ids
    are still persisted so the same runs are not recorded again next time.

    Args:
        github_client: Authenticated GitHub client
        lookback_hours: Number of hours to look back for workflow runs
    """
    # Function-scope import: the module-level import only provides datetime/timedelta.
    from datetime import timezone

    print(f"Fetching workflow run metrics (last {lookback_hours} hours)...")

    # Load previously processed runs
    state_file = get_state_file_path()
    processed_runs = load_processed_runs(state_file)
    repo_name = "open-telemetry/opentelemetry-java-instrumentation"

    # Defined before the try so the error path can persist partial progress.
    newly_processed = set()

    try:
        repo = github_client.get_repo(repo_name)

        # Timezone-aware UTC timestamp with an explicit "Z" suffix
        since_date = datetime.now(timezone.utc) - timedelta(hours=lookback_hours)
        date_filter = since_date.strftime("%Y-%m-%dT%H:%M:%SZ")

        # Find both "Build" and "Build pull request" workflows
        build_workflows = []
        for wf in repo.get_workflows():
            if _is_build_workflow(wf):
                build_workflows.append(wf)
                print(f"  Found workflow: {wf.name} ({wf.path})")

        if not build_workflows:
            print("  Warning: No build workflows found")
            return

        runs_processed = 0
        runs_total = 0
        runs_incomplete = 0
        runs_skipped_branch = 0
        runs_skipped_duplicate = 0
        runs_skipped_cancelled = 0
        jobs_recorded = 0

        for build_workflow in build_workflows:
            print(f"  Processing workflow: {build_workflow.name}")

            # Get workflow runs (both push and pull_request events)
            runs = build_workflow.get_runs(
                created=f">={date_filter}"
            )

            for run in runs:
                runs_total += 1

                if run.id in processed_runs:
                    runs_skipped_duplicate += 1
                    continue

                # Track main branch builds (push events) and all PR builds (pull_request events)
                classification = _classify_run(run)
                if classification is None:
                    runs_skipped_branch += 1
                    continue
                build_type, is_test_pr = classification

                # Only process completed runs
                if run.status != "completed":
                    runs_incomplete += 1
                    continue

                # Skip cancelled runs - they didn't complete the full build
                if run.conclusion == "cancelled":
                    runs_skipped_cancelled += 1
                    continue

                try:
                    timing_data = run.timing()

                    if not timing_data:
                        continue

                    if not hasattr(timing_data, 'run_duration_ms'):
                        continue

                    duration_ms = timing_data.run_duration_ms
                    duration_minutes = duration_ms / 1000 / 60

                    attributes = {
                        "repo": "opentelemetry-java-instrumentation",
                        "workflow": "build",
                        "conclusion": run.conclusion or "unknown",
                        "event": run.event,
                        "build_type": build_type,
                        "is_build_test": "true" if is_test_pr else "false"
                    }

                    workflow_duration_histogram.record(duration_minutes, attributes)

                    if runs_processed == 0:
                        print(f"  Debug: Recording histogram value {duration_minutes} minutes with attributes {attributes}")

                    # Fetch and record job-level metrics
                    jobs_count = fetch_job_metrics(run, attributes)
                    jobs_recorded += jobs_count

                    # Mark this run as processed
                    newly_processed.add(run.id)
                    runs_processed += 1

                    if runs_processed <= 5:  # Print first 5 for debugging
                        print(f"  - Run #{run.run_number} (event={run.event}): {duration_minutes:.1f} minutes, conclusion={run.conclusion}")

                except Exception as e:
                    print(f"  Warning: Could not fetch timing for run {run.id}: {e}")
                    continue

        print(f"  Total runs found: {runs_total}")
        print(f"  Duplicate runs skipped: {runs_skipped_duplicate}")
        print(f"  Non-main branch runs skipped: {runs_skipped_branch}")
        print(f"  Incomplete runs skipped: {runs_incomplete}")
        print(f"  Cancelled runs skipped: {runs_skipped_cancelled}")
        print(f"  Processed {runs_processed} new workflow runs")
        print(f"  Recorded {jobs_recorded} job-level metrics")

        # Update and save state
        all_processed = processed_runs.union(newly_processed)
        save_processed_runs(all_processed, state_file)

    except Exception as e:
        print(f"Error fetching workflow run metrics for {repo_name}: {e}")

        # Metrics for the runs in newly_processed have already been recorded.
        # Persist them even though collection was interrupted; otherwise the
        # next invocation would record the same runs a second time.
        if newly_processed:
            try:
                save_processed_runs(processed_runs.union(newly_processed), state_file)
            except Exception as save_error:
                print(f"Error saving processed run state: {save_error}")


if __name__ == "__main__":
    # Script entry point: authenticate, collect the metrics once, then flush.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        raise ValueError("GITHUB_TOKEN environment variable not set.")

    client = Github(auth=Auth.Token(token))

    # Lookback window in hours; WORKFLOW_LOOKBACK_HOURS overrides the default of 3.
    lookback = int(os.environ.get("WORKFLOW_LOOKBACK_HOURS", "3"))

    banner = "=" * 60
    print(banner)
    print("GitHub Workflow Metrics Collection")
    print(banner)

    fetch_workflow_run_metrics(client, lookback_hours=lookback)

    print("\nAll workflow metrics collected. Flushing metrics before exit...")
    provider.force_flush()

    # Short pause after the explicit flush before the process exits.
    print("Metrics flushed. The script will exit after 5 seconds.")
    time.sleep(5)