-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithub.py
More file actions
77 lines (59 loc) · 2.77 KB
/
Copy pathgithub.py
File metadata and controls
77 lines (59 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import base64
import logging
from typing import Optional
import httpx
from fastapi import HTTPException
from config import GITHUB_API, GITHUB_TOKEN
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def parse_github_url(url: str) -> tuple[str, str]:
    """Extract (owner, repo) from a GitHub URL.

    Accepts ``http``/``https`` URLs on ``github.com`` (optionally with a
    ``www.`` prefix). Anything after the repository segment -- extra path
    components, a query string, or a fragment -- is ignored, and a trailing
    ``.git`` on the repository name is dropped.

    Args:
        url: The repository URL, e.g. ``https://github.com/owner/repo.git``.

    Returns:
        An ``(owner, repo)`` tuple.

    Raises:
        ValueError: If ``url`` does not look like a GitHub repository URL.
    """
    import re

    # Exclude "?" and "#" from both segments so that a query string or
    # fragment never leaks into the owner/repo names.
    m = re.match(r"https?://(?:www\.)?github\.com/([^/?#]+)/([^/?#]+)", url)
    if m is None:
        # Explicit raise instead of `assert`: asserts vanish under `python -O`.
        raise ValueError(f"Not a GitHub repository URL: {url!r}")
    owner, repo = m.groups()
    return owner, repo.removesuffix(".git")
def github_headers() -> dict:
    """Build the request headers for GitHub API calls.

    Always sends the Accept and API-version headers. When GITHUB_TOKEN is
    set, a Bearer Authorization header is added as well -- this raises the
    rate limit from 60 to 5000 req/hr.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    if not GITHUB_TOKEN:
        # Unauthenticated request: nothing more to add.
        return headers
    headers["Authorization"] = f"Bearer {GITHUB_TOKEN}"
    return headers
async def get_repo_tree(client: httpx.AsyncClient, owner: str, repo: str) -> list[dict]:
    """
    Fetch the complete recursive file tree from GitHub.

    Issues one GET to ``{GITHUB_API}/repos/{owner}/{repo}/git/trees/HEAD``
    with ``recursive=1`` -- GitHub handles the recursion -- and keeps only the
    entries whose ``type`` is ``"blob"``.

    Args:
        client: Async HTTP client used to perform the request.
        owner: Repository owner (user or organisation).
        repo: Repository name.

    Returns:
        A flat list of blob objects: [{path, size, sha, ...}, ...].
        If GitHub marks the response as truncated, a warning is logged and
        the partial list is returned.

    Raises:
        HTTPException: 404 when GitHub answers 404; 429 when GitHub answers
            403 or 429 (rate limiting); 502 for any other non-200 status.
    """
    url = f"{GITHUB_API}/repos/{owner}/{repo}/git/trees/HEAD"
    resp = await client.get(url, headers=github_headers(), params={"recursive": "1"})
    status = resp.status_code

    if status == 404:
        raise HTTPException(404, {"status": "error", "message": "Repository not found or is private."})
    # GitHub reports an exceeded rate limit with either 403 or 429.
    if status in (403, 429):
        raise HTTPException(429, {"status": "error", "message": "GitHub rate limit exceeded. Set GITHUB_TOKEN to increase it."})
    if status != 200:
        raise HTTPException(502, {"status": "error", "message": f"GitHub API error: {status}"})

    data = resp.json()
    if data.get("truncated"):
        logger.warning("GitHub tree truncated for %s/%s — repo is very large.", owner, repo)

    # .get() so an entry lacking a "type" key is skipped rather than raising.
    return [item for item in data.get("tree", []) if item.get("type") == "blob"]
async def fetch_file(client: httpx.AsyncClient, owner: str, repo: str, path: str) -> Optional[str]:
    """
    Fetch a single file's content. GitHub returns it base64-encoded.

    Args:
        client: Async HTTP client used to perform the request.
        owner: Repository owner (user or organisation).
        repo: Repository name.
        path: Repository-relative file path; it is percent-encoded before
            being placed in the URL.

    Returns:
        The decoded text (invalid UTF-8 sequences are replaced), or None if
        the file is binary or unreachable. This function is best-effort: any
        exception raised while fetching or decoding is logged at debug level
        and turned into None.
    """
    from urllib.parse import quote

    # Escape characters such as "#", "?" and "%" that would otherwise be read
    # as URL syntax; "/" stays literal because it separates path components.
    url = f"{GITHUB_API}/repos/{owner}/{repo}/contents/{quote(path, safe='/')}"
    try:
        resp = await client.get(url, headers=github_headers())
        if resp.status_code != 200:
            return None
        data = resp.json()
        if data.get("encoding") != "base64" or not data.get("content"):
            return None
        raw = base64.b64decode(data["content"])
        # Heuristic binary detection: a null byte in the first 512 bytes
        # means the file is treated as binary.
        if b"\x00" in raw[:512]:
            return None
        return raw.decode("utf-8", errors="replace")
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the caller.
        logger.debug("Could not fetch %s: %s", path, e)
        return None