From ba7c0f54d643c3942b59775ef0951a64284e67ff Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 6 May 2026 23:32:47 +0000 Subject: [PATCH 1/2] add controller launch_hooks to `BrowserPool` --- .../browser_pool_launch_hooks_example.py | 48 +++++++++++ docs/guides/playwright_crawler.mdx | 9 +- src/crawlee/browsers/_browser_pool.py | 36 +++++++- tests/unit/browsers/test_browser_pool.py | 86 ++++++++++++++++++- 4 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 docs/guides/code_examples/playwright_crawler/browser_pool_launch_hooks_example.py diff --git a/docs/guides/code_examples/playwright_crawler/browser_pool_launch_hooks_example.py b/docs/guides/code_examples/playwright_crawler/browser_pool_launch_hooks_example.py new file mode 100644 index 0000000000..b4379f50ed --- /dev/null +++ b/docs/guides/code_examples/playwright_crawler/browser_pool_launch_hooks_example.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import asyncio +import logging +from typing import TYPE_CHECKING + +from crawlee.browsers import BrowserPool +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +if TYPE_CHECKING: + from crawlee.browsers._browser_controller import BrowserController + from crawlee.browsers._browser_plugin import BrowserPlugin + +logger = logging.getLogger(__name__) + + +async def main() -> None: + async with BrowserPool() as browser_pool: + + @browser_pool.pre_launch_hook + async def log_browser_launch(page_id: str, plugin: BrowserPlugin) -> None: + """Log before a new browser instance is launched.""" + logger.info(f'Launching {plugin.browser_type} browser for page {page_id}...') + + @browser_pool.post_launch_hook + async def log_browser_launched( + page_id: str, controller: BrowserController + ) -> None: + """Log after a new browser instance has been launched.""" + logger.info(f'Browser launched for page {page_id}, controller: {controller}') + + crawler = PlaywrightCrawler( + browser_pool=browser_pool, + max_requests_per_crawl=5, + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + await context.enqueue_links() + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/playwright_crawler.mdx b/docs/guides/playwright_crawler.mdx index 17eebcc465..907b912f95 100644 --- a/docs/guides/playwright_crawler.mdx +++ b/docs/guides/playwright_crawler.mdx @@ -11,6 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py'; import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py'; import NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py'; +import BrowserPoolLaunchHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_launch_hooks_example.py'; import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py'; import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py'; @@ -57,6 +58,12 @@ You can also configure each plugin used by `Brow For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with `PlaywrightCrawler`. +The `BrowserPool` also exposes `pre_launch_hook` and `post_launch_hook` - called once per browser instance, before and after it is launched. Use them for logging, metrics, or any setup at the browser level. Note that these hooks are not called when a new page is created in an already-running browser. + + + {BrowserPoolLaunchHooksExample} + + ## Page configuration with lifecycle page hooks For additional setup or event-driven actions around page creation and closure, the `BrowserPool` exposes four lifecycle hooks: `pre_page_create_hook`, `post_page_create_hook`, `pre_page_close_hook`, and `post_page_close_hook`. To use them, create a `BrowserPool` instance and pass it to `PlaywrightCrawler` via the `browser_pool` argument. @@ -75,4 +82,4 @@ Navigation hooks allow for additional configuration at specific points during pa ## Conclusion -This guide introduced the `PlaywrightCrawler` and explained how to configure it using `BrowserPool` and `PlaywrightBrowserPlugin`. You learned how to launch multiple browsers, configure browser and context settings, use `BrowserPool` lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +This guide introduced the `PlaywrightCrawler` and explained how to configure it using `BrowserPool` and `PlaywrightBrowserPlugin`. You learned how to launch multiple browsers, configure browser and context settings, use `BrowserPool` lifecycle hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 90f7027cb8..884ec39bcd 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -99,6 +99,9 @@ def __init__( self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins + # Hooks for custom behavior at different stages of the browser and page lifecycles. + self._pre_launch_hooks: list[Callable[[str, BrowserPlugin], Awaitable[None]]] = [] + self._post_launch_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = [] self._pre_page_create_hooks: list[ Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]] ] = [] @@ -307,7 +310,7 @@ async def _get_new_page( try: if not browser_controller: - browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout) + browser_controller = await asyncio.wait_for(self._launch_new_browser(page_id, plugin), timeout) browser_new_context_options = dict(plugin.browser_new_context_options) await self._execute_hooks( @@ -356,9 +359,17 @@ def _retire_browser(self, browser: BrowserController) -> None: self._active_browsers.remove(browser) self._inactive_browsers.append(browser) - async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController: + async def _launch_new_browser(self, page_id: str, plugin: BrowserPlugin) -> BrowserController: """Launch a new browser instance using the specified plugin.""" + await self._execute_hooks(self._pre_launch_hooks, page_id, plugin) browser = await plugin.new_browser() + + try: + await self._execute_hooks(self._post_launch_hooks, page_id, browser) + except Exception: + await browser.close(force=True) + raise + self._active_browsers.append(browser) return browser @@ -395,6 +406,27 @@ async def close_with_hooks(*args: Any, **kwargs: Any) -> None: crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks + def pre_launch_hook( + self, hook: Callable[[str, BrowserPlugin], Awaitable[None]] + ) -> Callable[[str, BrowserPlugin], Awaitable[None]]: + """Register a hook to be called just before a new browser is launched. + + The hook receives the page ID that triggered the launch and the `BrowserPlugin` being used. + Mutating `plugin.browser_launch_options` affects all future launches, not just the current one. + """ + self._pre_launch_hooks.append(hook) + return hook + + def post_launch_hook( + self, hook: Callable[[str, BrowserController], Awaitable[None]] + ) -> Callable[[str, BrowserController], Awaitable[None]]: + """Register a hook to be called right after a new browser is launched. + + The hook receives the page ID that triggered the launch and the newly created `BrowserController`. + """ + self._post_launch_hooks.append(hook) + return hook + def pre_page_create_hook( self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]] ) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]: diff --git a/tests/unit/browsers/test_browser_pool.py b/tests/unit/browsers/test_browser_pool.py index a8e054379f..15990cfec1 100644 --- a/tests/unit/browsers/test_browser_pool.py +++ b/tests/unit/browsers/test_browser_pool.py @@ -17,6 +17,7 @@ from yarl import URL + from crawlee.browsers._browser_plugin import BrowserPlugin from crawlee.proxy_configuration import ProxyInfo @@ -309,11 +310,19 @@ async def hook(page_id: str, controller: BrowserController) -> None: assert isinstance(controller, BrowserController) -async def test_page_hooks_execution_order() -> None: +async def test_hooks_execution_order() -> None: call_order: list[str] = [] async with BrowserPool() as browser_pool: + @browser_pool.pre_launch_hook + async def pre_launch(_page_id: str, _plugin: BrowserPlugin) -> None: + call_order.append('pre_launch') + + @browser_pool.post_launch_hook + async def post_launch(_page_id: str, _controller: BrowserController) -> None: + call_order.append('post_launch') + @browser_pool.pre_page_create_hook async def pre_create( _page_id: str, @@ -338,7 +347,7 @@ async def post_close(_page_id: str, _controller: BrowserController) -> None: page = await browser_pool.new_page() await page.page.close() - assert call_order == ['pre_create', 'post_create', 'pre_close', 'post_close'] + assert call_order == ['pre_launch', 'post_launch', 'pre_create', 'post_create', 'pre_close', 'post_close'] async def test_multiple_hooks_all_called() -> None: @@ -358,3 +367,76 @@ async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> await page.page.close() assert call_order == ['first', 'second'] + + +async def test_pre_launch_hook_is_called() -> None: + call_mock = AsyncMock() + + async with BrowserPool() as browser_pool: + + @browser_pool.pre_launch_hook + async def hook(page_id: str, plugin: BrowserPlugin) -> None: + await call_mock(page_id, plugin) + + test_page = await browser_pool.new_page() + await test_page.page.close() + + call_mock.assert_awaited_once() + page_id, plugin = call_mock.call_args[0] + + assert isinstance(page_id, str) + assert test_page.id == page_id + assert isinstance(plugin, PlaywrightBrowserPlugin) + + +async def test_post_launch_hook_is_called() -> None: + call_mock = AsyncMock() + + async with BrowserPool() as browser_pool: + + @browser_pool.post_launch_hook + async def hook(page_id: str, controller: BrowserController) -> None: + await call_mock(page_id, controller) + + test_page = await browser_pool.new_page() + await test_page.page.close() + + call_mock.assert_awaited_once() + page_id, controller = call_mock.call_args[0] + + assert isinstance(page_id, str) + assert test_page.id == page_id + assert isinstance(controller, BrowserController) + + +async def test_post_launch_hook_error_closes_browser() -> None: + async with BrowserPool() as browser_pool: + + @browser_pool.post_launch_hook + async def hook(_page_id: str, _controller: BrowserController) -> None: + raise ValueError('Hook failed') + + with pytest.raises(ValueError, match='Hook failed'): + await browser_pool.new_page() + + assert len(browser_pool.active_browsers) == 0 + assert len(browser_pool.inactive_browsers) == 0 + + +async def test_launch_hooks_not_called_for_existing_browser() -> None: + launch_hook_calls = 0 + + async with BrowserPool() as browser_pool: + + @browser_pool.pre_launch_hook + async def hook(_page_id: str, _plugin: BrowserPlugin) -> None: + nonlocal launch_hook_calls + launch_hook_calls += 1 + + page_1 = await browser_pool.new_page() + page_2 = await browser_pool.new_page() + + await page_1.page.close() + await page_2.page.close() + + assert launch_hook_calls == 1 From af528cae741015c5528dd2b3130c421ddf5cc74d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 7 May 2026 10:29:18 +0200 Subject: [PATCH 2/2] Polishment --- docs/guides/playwright_crawler.mdx | 12 +++++++++--- src/crawlee/browsers/_browser_pool.py | 11 ++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/guides/playwright_crawler.mdx b/docs/guides/playwright_crawler.mdx index 907b912f95..effa8e99c2 100644 --- a/docs/guides/playwright_crawler.mdx +++ b/docs/guides/playwright_crawler.mdx @@ -58,15 +58,21 @@ You can also configure each plugin used by `Brow For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with `PlaywrightCrawler`. -The `BrowserPool` also exposes `pre_launch_hook` and `post_launch_hook` - called once per browser instance, before and after it is launched. Use them for logging, metrics, or any setup at the browser level. Note that these hooks are not called when a new page is created in an already-running browser. +## Browser pool lifecycle hooks + +The `BrowserPool` exposes lifecycle hooks for both browser launches and page creation/closure. To use them, create a `BrowserPool` instance and pass it to `PlaywrightCrawler` via the `browser_pool` argument. + +### Browser launch hooks + +The `pre_launch_hook` and `post_launch_hook` are called once per browser instance, before and after it is launched. Use them for logging, metrics, or any setup at the browser level. Note that these hooks are not called when a new page is created in an already-running browser. {BrowserPoolLaunchHooksExample} -## Page configuration with lifecycle page hooks +### Page lifecycle hooks -For additional setup or event-driven actions around page creation and closure, the `BrowserPool` exposes four lifecycle hooks: `pre_page_create_hook`, `post_page_create_hook`, `pre_page_close_hook`, and `post_page_close_hook`. To use them, create a `BrowserPool` instance and pass it to `PlaywrightCrawler` via the `browser_pool` argument. +For additional setup or event-driven actions around page creation and closure, the `BrowserPool` exposes four hooks: `pre_page_create_hook`, `post_page_create_hook`, `pre_page_close_hook`, and `post_page_close_hook`. {BrowserPoolPageHooksExample} diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 884ec39bcd..0571dc8055 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -366,8 +366,13 @@ async def _launch_new_browser(self, page_id: str, plugin: BrowserPlugin) -> Brow try: await self._execute_hooks(self._post_launch_hooks, page_id, browser) - except Exception: - await browser.close(force=True) + except BaseException: + # Catch BaseException to also clean up on CancelledError raised by the outer + # asyncio.wait_for(operation_timeout) wrapping this call. + try: + await browser.close(force=True) + except Exception: + logger.exception('Failed to close browser after post_launch_hook error.') raise self._active_browsers.append(browser) @@ -412,7 +417,7 @@ def pre_launch_hook( """Register a hook to be called just before a new browser is launched. The hook receives the page ID that triggered the launch and the `BrowserPlugin` being used. - Mutating `plugin.browser_launch_options` affects all future launches, not just the current one. + Use it for logging, metrics, or other side effects scoped to the browser launch. """ self._pre_launch_hooks.append(hook) return hook