Skip to content

Commit 127da2e

Browse files
author
farfarfun
committed
Improve batch upload handling for large files with splitting logic
1 parent b127136 commit 127da2e

3 files changed

Lines changed: 77 additions & 13 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "funread"
3-
version = "1.1.78"
3+
version = "1.1.79"
44
description = "一个用于管理和处理阅读源(Legado 阅读 APP 的书源和 RSS 源)的 Python 工具库"
55
readme = "README.md"
66
requires-python = ">=3.8"

src/funread/legado/manage/download/generate.py

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
DEFAULT_DIR_PATH = "funread/legado/snapshot/lasted"
2323
EXPORT_BATCH_SIZE = 500
2424
INITIAL_COUNTER = 1000
25+
# Lower bound for automatic batch splitting in _upload_batch: a batch is only
# split when it holds more than this many sources, and no split chunk is made
# smaller than this size (except the final remainder chunk).
MIN_UPLOAD_BATCH_SIZE = 20
26+
2527

2628
# 组织仓库列表
2729
ORG_REPOS = [
@@ -257,24 +259,46 @@ def _export_and_upload(self, path: str) -> None:
257259
counter = INITIAL_COUNTER
258260
for data in runner.export_sources(size=EXPORT_BATCH_SIZE):
259261
if data:
260-
self._upload_batch(data, counter)
261-
counter += 1
262+
counter = self._upload_batch(data, counter)
262263
except Exception as e:
263264
logger.error(f"Failed to export and upload: {e}")
264265
raise
265266

266-
def _upload_batch(self, data: List[Dict[str, Any]], counter: int) -> None:
267-
"""上传数据批次"""
267+
@staticmethod
268+
def _is_file_too_large_error(error: Exception) -> bool:
269+
"""判断是否为远端文件过大错误"""
270+
message = str(error).lower()
271+
return "too large" in message or "422" in message
272+
273+
def _upload_single_batch(self, data: List[Dict[str, Any]], counter: int) -> None:
    """Upload one batch as a single ``progress-<counter>.json`` file.

    The batch is serialized with ``json.dumps`` and handed to
    ``self.drive.upload_file`` with ``self.dir_path`` as the target folder id.
    Any exception raised by the drive propagates to the caller.

    Args:
        data: The source records to serialize into the file.
        counter: Number used to build the file name.
    """
    file_name = f"progress-{counter}.json"
    # Only used for the log line below.
    git_path = f"{self.dir_path}/{file_name}"
    serialized = json.dumps(data)
    self.drive.upload_file(
        content=serialized,
        fid=self.dir_path,
        filepath=None,
        filename=file_name,
    )
    logger.info(f"Uploaded {len(data)} sources to {git_path}")
283+
284+
def _upload_batch(self, data: List[Dict[str, Any]], counter: int) -> int:
    """Upload a batch, splitting it into smaller files when the remote rejects it as too large.

    Args:
        data: The source records to upload.
        counter: File counter to use for the first file written.

    Returns:
        The next unused counter: ``counter + 1`` after a plain upload, or the
        counter following the last chunk when the batch had to be split.

    Raises:
        Exception: Re-raises the upload error when it is not a "too large"
            error or when the batch is already at or below
            ``MIN_UPLOAD_BATCH_SIZE``.
    """
    try:
        self._upload_single_batch(data, counter)
    except Exception as e:
        can_split = self._is_file_too_large_error(e) and len(data) > MIN_UPLOAD_BATCH_SIZE
        if not can_split:
            logger.error(f"Failed to upload batch {counter}: {e}")
            raise

        # Halve the batch, but never go below the configured minimum chunk size.
        split_size = max(len(data) // 2, MIN_UPLOAD_BATCH_SIZE)
        logger.warning(
            f"Batch {counter} too large with {len(data)} sources, split into chunks of {split_size}"
        )
        # Each chunk is uploaded recursively so it can be split again; the
        # counter returned by one chunk is the starting counter of the next.
        next_counter = counter
        offset = 0
        while offset < len(data):
            chunk = data[offset : offset + split_size]
            next_counter = self._upload_batch(chunk, next_counter)
            offset += split_size
        return next_counter
    else:
        return counter + 1
280304

tests/test_download_base.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from pathlib import Path
22

33
import funread.legado.manage.download.book as book_module
4+
import funread.legado.manage.download.generate as generate_module
45
import funread.legado.manage.download.rss as rss_module
56

67
from funread.legado.manage.download.base import DownloadSource
@@ -103,3 +104,42 @@ def test_rss_loader_reads_source_download_iterator(monkeypatch, tmp_path: Path)
103104

104105
exported = next(source.export_sources(size=10))
105106
assert exported[0]["sourceUrl"].startswith("https://rss.example.com/feed#")
107+
108+
109+
class _FakeDrive:
110+
def __init__(self, fail_threshold=None):
111+
self.fail_threshold = fail_threshold
112+
self.calls = []
113+
114+
def upload_file(self, content, fid, filepath, filename):
115+
payload = generate_module.json.loads(content)
116+
self.calls.append((filename, len(payload)))
117+
if self.fail_threshold is not None and len(payload) > self.fail_threshold:
118+
raise RuntimeError("GitHub API返回422: Sorry, the file is too large to be processed.")
119+
120+
121+
def test_upload_batch_splits_large_payloads(monkeypatch) -> None:
    """A rejected oversized batch is re-uploaded as smaller, consecutively numbered files."""
    # _upload_batch only splits when len(data) > MIN_UPLOAD_BATCH_SIZE. With the
    # module default of 20 this 5-item fixture would never be split and the
    # upload error would simply be re-raised, so lower the floor for the test.
    monkeypatch.setattr(generate_module, "MIN_UPLOAD_BATCH_SIZE", 1)

    # Bypass __init__ and set only the attributes this test needs.
    generator = generate_module.GenerateSourceType.__new__(generate_module.GenerateSourceType)
    generator.dir_path = "funread/legado/snapshot/lasted/book"
    generator.drive = _FakeDrive(fail_threshold=2)

    next_counter = generator._upload_batch([{"i": i} for i in range(5)], 1000)

    # 5 items -> rejected; split size is max(5 // 2, 1) == 2 -> chunks of 2, 2, 1.
    assert next_counter == 1003
    assert generator.drive.calls == [
        ("progress-1000.json", 5),
        ("progress-1000.json", 2),
        ("progress-1001.json", 2),
        ("progress-1002.json", 1),
    ]
135+
136+
137+
def test_upload_batch_increments_counter_on_success() -> None:
    """A batch that uploads cleanly produces one file and advances the counter by one."""
    source_type = generate_module.GenerateSourceType
    # Bypass __init__ and set only the attributes this test needs.
    generator = source_type.__new__(source_type)
    generator.dir_path = "funread/legado/snapshot/lasted/book"
    fake_drive = _FakeDrive()
    generator.drive = fake_drive
    batch = [{"i": 1}, {"i": 2}]

    next_counter = generator._upload_batch(batch, 1000)

    assert next_counter == 1001
    assert fake_drive.calls == [("progress-1000.json", 2)]

0 commit comments

Comments
 (0)