diff --git a/scrapegraph-js/src/crawl.js b/scrapegraph-js/src/crawl.js
index f8cef65..631f3f7 100644
--- a/scrapegraph-js/src/crawl.js
+++ b/scrapegraph-js/src/crawl.js
@@ -25,6 +25,7 @@ import { getMockResponse } from './utils/mockResponse.js';
  * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
  * @param {Array} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
  * @param {Array} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
+ * @param {string} [options.webhookUrl] - URL to receive webhook notifications when the crawl job completes
  * @returns {Promise} The crawl job response
  * @throws {Error} Throws an error if the HTTP request fails
  */
@@ -35,7 +36,7 @@ export async function crawl(
   schema,
   options = {}
 ) {
-  const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;
+  const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null, webhookUrl = null } = options;
 
   // Check if mock mode is enabled
   const useMock = mock !== null ? mock : isMockEnabled();
@@ -98,6 +99,10 @@ export async function crawl(
     payload.exclude_paths = excludePaths;
   }
 
+  if (webhookUrl) {
+    payload.webhook_url = webhookUrl;
+  }
+
   try {
     const response = await axios.post(endpoint, payload, { headers });
     return response.data;
diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index 4240c63..4491482 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -864,6 +864,7 @@ async def crawl(
         stealth: bool = False,
         include_paths: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
+        webhook_url: Optional[str] = None,
         return_toon: bool = False,
     ):
         """Send a crawl request with support for both AI extraction and
@@ -887,6 +888,7 @@
                 Supports wildcards: * matches any characters, ** matches any path segments
             exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
                 Supports wildcards and takes precedence over include_paths
+            webhook_url: URL to receive webhook notifications when the crawl completes
             return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
         """
         logger.info("🔍 Starting crawl request")
@@ -916,6 +918,8 @@
             logger.debug(f"✅ Include paths: {include_paths}")
         if exclude_paths:
             logger.debug(f"❌ Exclude paths: {exclude_paths}")
+        if webhook_url:
+            logger.debug(f"🔔 Webhook URL: {webhook_url}")
         if return_toon:
             logger.debug("🎨 TOON format output enabled")
 
@@ -945,6 +949,8 @@
             request_data["include_paths"] = include_paths
         if exclude_paths is not None:
             request_data["exclude_paths"] = exclude_paths
+        if webhook_url is not None:
+            request_data["webhook_url"] = webhook_url
 
         request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index fa0e179..5a04311 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -874,6 +874,7 @@ def crawl(
         stealth: bool = False,
         include_paths: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
+        webhook_url: Optional[str] = None,
         return_toon: bool = False,
     ):
         """Send a crawl request with support for both AI extraction and
@@ -897,6 +898,7 @@
                 Supports wildcards: * matches any characters, ** matches any path segments
             exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
                 Supports wildcards and takes precedence over include_paths
+            webhook_url: URL to receive webhook notifications when the crawl completes
             return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
         """
         logger.info("🔍 Starting crawl request")
@@ -926,6 +928,8 @@
             logger.debug(f"✅ Include paths: {include_paths}")
         if exclude_paths:
             logger.debug(f"❌ Exclude paths: {exclude_paths}")
+        if webhook_url:
+            logger.debug(f"🔔 Webhook URL: {webhook_url}")
         if return_toon:
             logger.debug("🎨 TOON format output enabled")
 
@@ -955,6 +959,8 @@
             request_data["include_paths"] = include_paths
         if exclude_paths is not None:
             request_data["exclude_paths"] = exclude_paths
+        if webhook_url is not None:
+            request_data["webhook_url"] = webhook_url
 
         request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py
index 72d6471..7f4638f 100644
--- a/scrapegraph-py/scrapegraph_py/models/crawl.py
+++ b/scrapegraph-py/scrapegraph_py/models/crawl.py
@@ -94,6 +94,12 @@ class CrawlRequest(BaseModel):
         "Takes precedence over include_paths.",
         example=["/admin/*", "/api/**"]
     )
+    webhook_url: Optional[str] = Field(
+        default=None,
+        description="URL to receive webhook notifications when the crawl job completes. "
+        "The webhook will receive a POST request with the crawl results.",
+        example="https://example.com/webhook"
+    )
 
     @model_validator(mode="after")
     def validate_url(self) -> "CrawlRequest":
@@ -169,6 +175,21 @@ def validate_path_patterns(self) -> "CrawlRequest":
         return self
 
+    @model_validator(mode="after")
+    def validate_webhook_url(self) -> "CrawlRequest":
+        """Validate webhook URL format if provided"""
+        if self.webhook_url is not None:
+            if not self.webhook_url.strip():
+                raise ValueError("Webhook URL cannot be empty")
+            if not (
+                self.webhook_url.startswith("http://")
+                or self.webhook_url.startswith("https://")
+            ):
+                raise ValueError(
+                    "Invalid webhook URL - must start with http:// or https://"
+                )
+        return self
+
 
 class GetCrawlRequest(BaseModel):
     """Request model for get_crawl endpoint"""