-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmerge_jsonl_to_csv.py
More file actions
executable file
·97 lines (79 loc) · 2.88 KB
/
merge_jsonl_to_csv.py
File metadata and controls
executable file
·97 lines (79 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import csv
import json
from pathlib import Path
from typing import Any
# Column names of the output CSV, in the order they are written.
FIELDNAMES = [
    # identity
    "name",
    "operator_name",
    # postal address
    "country",
    "city",
    "address",
    # coordinates
    "latitude",
    "longitude",
    # links and specs
    "external_website",
    "services",
    "power_mw",
    "whitespace",
    # source page of the record
    "detail_url",
]
def get_path(record: dict[str, Any], path: str) -> Any:
    """Look up a dot-separated key path inside nested dictionaries.

    Each segment of *path* is used as a key into the mapping reached so far.

    Returns:
        The value found at the end of the path, or an empty string when a
        segment has to be applied to something that is not a dict, or when
        the final value is ``None`` (a missing key also yields ``None``).
    """
    node: Any = record
    for key in path.split("."):
        if isinstance(node, dict):
            node = node.get(key)
        else:
            # Cannot descend any further: the path does not exist.
            return ""
    if node is None:
        return ""
    return node
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file and return its decoded records.

    Lines that are empty or contain only whitespace are skipped; every other
    line is decoded with ``json.loads`` and appended in file order.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The decoded values, one per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
        OSError: If the file cannot be opened.
    """
    records: list[dict[str, Any]] = []
    # "utf-8-sig" drops a leading byte-order mark when present and otherwise
    # behaves like plain UTF-8; without it json.loads rejects the first line
    # of a BOM-prefixed file.
    with path.open("r", encoding="utf-8-sig") as handle:
        for line_number, line in enumerate(handle, start=1):
            text = line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as err:
                # Same exception type, but say which file and line failed.
                raise json.JSONDecodeError(
                    f"{err.msg} (in {path} at file line {line_number})",
                    err.doc,
                    err.pos,
                ) from err
            records.append(record)
    return records
def row_from_record(record: dict[str, Any]) -> dict[str, Any]:
    """Flatten one nested record into a flat row keyed by column name.

    Every column is read from a fixed dotted path via ``get_path``. The
    ``services`` column is special: a list is joined into a single string
    with ``" | "`` between items, and any other value is converted with
    ``str`` (falsy values become an empty string).
    """
    raw_services = get_path(record, "specs.services")
    services_text = (
        " | ".join(map(str, raw_services))
        if isinstance(raw_services, list)
        else str(raw_services or "")
    )

    # (column name, dotted path inside the record), in output order.
    column_sources = (
        ("name", "identity.name"),
        ("operator_name", "identity.operator_name"),
        ("country", "address.country"),
        ("city", "address.city"),
        ("address", "address.address"),
        ("latitude", "location.latitude"),
        ("longitude", "location.longitude"),
        ("external_website", "links.external_website"),
        ("services", "specs.services"),
        ("power_mw", "specs.capacity.power_mw"),
        ("whitespace", "specs.capacity.whitespace"),
        ("detail_url", "source.detail_url"),
    )
    return {
        # The services text was already computed above; reuse it as-is.
        column: services_text if column == "services" else get_path(record, source)
        for column, source in column_sources
    }
def main() -> None:
    """Command-line entry point: merge JSONL files into a single CSV.

    Reads every input file in the order given, drops records whose
    ``source.detail_url`` was already seen (records with no URL are always
    kept), and writes the remaining rows to ``--output``. Missing parent
    directories of the output file are created. A summary line is printed
    when done.
    """
    cli = argparse.ArgumentParser(description="Merge DataCenterMap JSONL files into a CSV.")
    cli.add_argument("jsonl_paths", type=Path, nargs="+")
    cli.add_argument("--output", type=Path, required=True)
    args = cli.parse_args()

    rows: list[dict[str, Any]] = []
    known_urls: set[str] = set()
    for source_file in args.jsonl_paths:
        for entry in load_jsonl(source_file):
            url = str(get_path(entry, "source.detail_url") or "")
            if url:
                # Only records that carry a URL take part in de-duplication.
                if url in known_urls:
                    continue
                known_urls.add(url)
            rows.append(row_from_record(entry))

    args.output.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line-ending handling to the csv module.
    with args.output.open("w", encoding="utf-8-sig", newline="") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=FIELDNAMES)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
    print(f"Wrote {len(rows)} rows -> {args.output}")


if __name__ == "__main__":
    main()