Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING

from crawlee.browsers import BrowserPool
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

if TYPE_CHECKING:
from crawlee.browsers._browser_controller import BrowserController
from crawlee.browsers._browser_plugin import BrowserPlugin

logger = logging.getLogger(__name__)


async def main() -> None:
    """Run a short crawl that logs every browser launch through `BrowserPool` hooks.

    A `BrowserPool` with a pre-launch and a post-launch hook is handed to
    a `PlaywrightCrawler`, which is then run against a single start URL.
    """
    start_urls = ['https://crawlee.dev']

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_launch_hook
        async def log_browser_launch(page_id: str, plugin: BrowserPlugin) -> None:
            """Report the browser type that is about to be launched."""
            logger.info(f'Launching {plugin.browser_type} browser for page {page_id}...')

        @browser_pool.post_launch_hook
        async def log_browser_launched(
            page_id: str, controller: BrowserController
        ) -> None:
            """Report the controller of the browser that has just been launched."""
            logger.info(f'Browser launched for page {page_id}, controller: {controller}')

        # The hooks only take effect because this very pool is passed to the crawler.
        crawler = PlaywrightCrawler(
            max_requests_per_crawl=5,
            browser_pool=browser_pool,
        )

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            """Log the visited URL and enqueue the links found on the page."""
            context.log.info(f'Processing {context.request.url} ...')

            await context.enqueue_links()

        # Start crawling from the initial list of URLs.
        await crawler.run(start_urls)


if __name__ == '__main__':
asyncio.run(main())
19 changes: 16 additions & 3 deletions docs/guides/playwright_crawler.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
import NavigationHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/navigation_hooks_example.py';
import BrowserPoolLaunchHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_launch_hooks_example.py';
import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';

Expand Down Expand Up @@ -57,9 +58,21 @@ You can also configure each plugin used by <ApiLink to="class/BrowserPool">`Brow

For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.

## Page configuration with lifecycle page hooks
## Browser pool lifecycle hooks

For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
The <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes lifecycle hooks for both browser launches and page creation/closure. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.

### Browser launch hooks

The <ApiLink to="class/BrowserPool#pre_launch_hook">`pre_launch_hook`</ApiLink> and <ApiLink to="class/BrowserPool#post_launch_hook">`post_launch_hook`</ApiLink> hooks are called once per browser instance: the former runs just before the browser is launched, the latter right after. Both receive the ID of the page that triggered the launch. Use them for logging, metrics, or any setup at the browser level. Note that these hooks are not called when a new page is created in an already-running browser.

<RunnableCodeBlock className="language-python" language="python">
{BrowserPoolLaunchHooksExample}
</RunnableCodeBlock>

### Page lifecycle hooks

For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>.

<RunnableCodeBlock className="language-python" language="python">
{BrowserPoolPageHooksExample}
Expand All @@ -75,4 +88,4 @@ Navigation hooks allow for additional configuration at specific points during pa

## Conclusion

This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
41 changes: 39 additions & 2 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ def __init__(
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins

# Hooks for custom behavior at different stages of the browser and page lifecycles.
self._pre_launch_hooks: list[Callable[[str, BrowserPlugin], Awaitable[None]]] = []
self._post_launch_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []
self._pre_page_create_hooks: list[
Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
] = []
Expand Down Expand Up @@ -307,7 +310,7 @@ async def _get_new_page(

try:
if not browser_controller:
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
browser_controller = await asyncio.wait_for(self._launch_new_browser(page_id, plugin), timeout)
browser_new_context_options = dict(plugin.browser_new_context_options)

await self._execute_hooks(
Expand Down Expand Up @@ -356,9 +359,22 @@ def _retire_browser(self, browser: BrowserController) -> None:
self._active_browsers.remove(browser)
self._inactive_browsers.append(browser)

async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
async def _launch_new_browser(self, page_id: str, plugin: BrowserPlugin) -> BrowserController:
    """Launch a new browser through `plugin`, running the launch hooks around it.

    The pre-launch hooks run first, then the browser is created, then the post-launch hooks run.
    The browser is added to the active browsers only once the post-launch hooks have completed;
    if they fail, the browser is force-closed and the error is re-raised.

    Args:
        page_id: ID of the page whose creation triggered the launch; forwarded to the hooks.
        plugin: The plugin used to create the browser.

    Returns:
        The controller of the newly launched browser.
    """
    await self._execute_hooks(self._pre_launch_hooks, page_id, plugin)
    controller = await plugin.new_browser()

    try:
        await self._execute_hooks(self._post_launch_hooks, page_id, controller)
    except BaseException:
        # BaseException rather than Exception, so that a CancelledError raised by the outer
        # asyncio.wait_for(operation_timeout) wrapping this call triggers the cleanup as well.
        try:
            await controller.close(force=True)
        except Exception:
            # A failed cleanup is only logged, so the error from the hooks is the one that propagates.
            logger.exception('Failed to close browser after post_launch_hook error.')
        raise
    else:
        self._active_browsers.append(controller)
        return controller

Expand Down Expand Up @@ -395,6 +411,27 @@ async def close_with_hooks(*args: Any, **kwargs: Any) -> None:

crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks

def pre_launch_hook(
    self, hook: Callable[[str, BrowserPlugin], Awaitable[None]]
) -> Callable[[str, BrowserPlugin], Awaitable[None]]:
    """Register a hook that runs just before a new browser is launched.

    The hook is given the ID of the page that triggered the launch and the `BrowserPlugin` that
    is going to launch the browser. It is suited for logging, metrics, or other side effects
    scoped to the browser launch.

    Args:
        hook: The async callable to register.

    Returns:
        The same `hook`, unchanged, so that this method can be used as a decorator.
    """
    registered_hooks = self._pre_launch_hooks
    registered_hooks.append(hook)
    return hook

def post_launch_hook(
    self, hook: Callable[[str, BrowserController], Awaitable[None]]
) -> Callable[[str, BrowserController], Awaitable[None]]:
    """Register a hook that runs right after a new browser is launched.

    The hook is given the ID of the page that triggered the launch and the `BrowserController`
    of the newly created browser.

    Args:
        hook: The async callable to register.

    Returns:
        The same `hook`, unchanged, so that this method can be used as a decorator.
    """
    registered_hooks = self._post_launch_hooks
    registered_hooks.append(hook)
    return hook

def pre_page_create_hook(
self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:
Expand Down
86 changes: 84 additions & 2 deletions tests/unit/browsers/test_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from yarl import URL

from crawlee.browsers._browser_plugin import BrowserPlugin
from crawlee.proxy_configuration import ProxyInfo


Expand Down Expand Up @@ -309,11 +310,19 @@ async def hook(page_id: str, controller: BrowserController) -> None:
assert isinstance(controller, BrowserController)


async def test_page_hooks_execution_order() -> None:
async def test_hooks_execution_order() -> None:
call_order: list[str] = []

async with BrowserPool() as browser_pool:

@browser_pool.pre_launch_hook
async def pre_launch(_page_id: str, _plugin: BrowserPlugin) -> None:
call_order.append('pre_launch')

@browser_pool.post_launch_hook
async def post_launch(_page_id: str, _controller: BrowserController) -> None:
call_order.append('post_launch')

@browser_pool.pre_page_create_hook
async def pre_create(
_page_id: str,
Expand All @@ -338,7 +347,7 @@ async def post_close(_page_id: str, _controller: BrowserController) -> None:
page = await browser_pool.new_page()
await page.page.close()

assert call_order == ['pre_create', 'post_create', 'pre_close', 'post_close']
assert call_order == ['pre_launch', 'post_launch', 'pre_create', 'post_create', 'pre_close', 'post_close']


async def test_multiple_hooks_all_called() -> None:
Expand All @@ -358,3 +367,76 @@ async def second(_crawlee_page: CrawleePage, _controller: BrowserController) ->
await page.page.close()

assert call_order == ['first', 'second']


async def test_pre_launch_hook_is_called() -> None:
    """The pre-launch hook is awaited once, with the triggering page ID and the plugin in use."""
    hook_spy = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_launch_hook
        async def hook(page_id: str, plugin: BrowserPlugin) -> None:
            # Forward the arguments to the mock so they can be inspected afterwards.
            await hook_spy(page_id, plugin)

        test_page = await browser_pool.new_page()
        await test_page.page.close()

    hook_spy.assert_awaited_once()
    received_page_id, received_plugin = hook_spy.call_args.args

    assert isinstance(received_page_id, str)
    assert received_page_id == test_page.id
    assert isinstance(received_plugin, PlaywrightBrowserPlugin)


async def test_post_launch_hook_is_called() -> None:
    """The post-launch hook is awaited once, with the triggering page ID and the new controller."""
    hook_spy = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.post_launch_hook
        async def hook(page_id: str, controller: BrowserController) -> None:
            # Forward the arguments to the mock so they can be inspected afterwards.
            await hook_spy(page_id, controller)

        test_page = await browser_pool.new_page()
        await test_page.page.close()

    hook_spy.assert_awaited_once()
    received_page_id, received_controller = hook_spy.call_args.args

    assert isinstance(received_page_id, str)
    assert received_page_id == test_page.id
    assert isinstance(received_controller, BrowserController)


async def test_post_launch_hook_error_closes_browser() -> None:
    """A failing post-launch hook propagates its error and leaves no running browser behind."""
    launched: list[BrowserController] = []

    async with BrowserPool() as browser_pool:

        @browser_pool.post_launch_hook
        async def hook(_page_id: str, controller: BrowserController) -> None:
            # Keep a reference to the controller so its state can be checked after the failure.
            launched.append(controller)
            raise ValueError('Hook failed')

        with pytest.raises(ValueError, match='Hook failed'):
            await browser_pool.new_page()

        # Checked while the pool is still open; closing the pool would make these trivially true.
        assert len(browser_pool.active_browsers) == 0
        assert len(browser_pool.inactive_browsers) == 0

        # Not being tracked is not enough: the launched browser must really have been shut down.
        assert len(launched) == 1
        assert not launched[0].is_browser_connected


async def test_launch_hooks_not_called_for_existing_browser() -> None:
    """Opening a second page in an already-running browser re-triggers neither launch hook."""
    launch_calls: list[str] = []

    async with BrowserPool() as browser_pool:

        @browser_pool.pre_launch_hook
        async def pre_launch(_page_id: str, _plugin: BrowserPlugin) -> None:
            launch_calls.append('pre_launch')

        @browser_pool.post_launch_hook
        async def post_launch(_page_id: str, _controller: BrowserController) -> None:
            launch_calls.append('post_launch')

        # The second page is expected to reuse the browser launched for the first one,
        # so each launch hook should have fired exactly once.
        page_1 = await browser_pool.new_page()
        page_2 = await browser_pool.new_page()

        await page_1.page.close()
        await page_2.page.close()

    assert launch_calls == ['pre_launch', 'post_launch']
Loading