Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion scrapegraph-js/src/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { getMockResponse } from './utils/mockResponse.js';
* @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
* @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
* @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
* @param {string} [options.webhookUrl] - URL to receive webhook notifications when the crawl job completes
* @returns {Promise<Object>} The crawl job response
* @throws {Error} Throws an error if the HTTP request fails
*/
Expand All @@ -35,7 +36,7 @@ export async function crawl(
schema,
options = {}
) {
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null, webhookUrl = null } = options;

// Check if mock mode is enabled
const useMock = mock !== null ? mock : isMockEnabled();
Expand Down Expand Up @@ -98,6 +99,10 @@ export async function crawl(
payload.exclude_paths = excludePaths;
}

if (webhookUrl) {
payload.webhook_url = webhookUrl;
Comment on lines +102 to +103
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (webhookUrl) silently ignores explicitly provided empty-string values and also allows whitespace-only strings through (both are likely invalid URLs). Prefer treating “provided” as webhookUrl != null, validating it’s a non-empty string after trim and starts with http:// or https://, and then setting payload.webhook_url (or throwing a clear error) to avoid surprising no-op behavior.

Suggested change
if (webhookUrl) {
payload.webhook_url = webhookUrl;
if (webhookUrl != null) {
if (typeof webhookUrl !== 'string') {
throw new Error('webhookUrl must be a string starting with "http://" or "https://".');
}
const trimmedWebhookUrl = webhookUrl.trim();
if (!trimmedWebhookUrl) {
throw new Error('webhookUrl must be a non-empty string.');
}
if (!trimmedWebhookUrl.startsWith('http://') && !trimmedWebhookUrl.startsWith('https://')) {
throw new Error('webhookUrl must start with "http://" or "https://".');
}
payload.webhook_url = trimmedWebhookUrl;

Copilot uses AI. Check for mistakes.
}

try {
const response = await axios.post(endpoint, payload, { headers });
return response.data;
Expand Down
6 changes: 6 additions & 0 deletions scrapegraph-py/scrapegraph_py/async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,7 @@ async def crawl(
stealth: bool = False,
include_paths: Optional[list[str]] = None,
exclude_paths: Optional[list[str]] = None,
webhook_url: Optional[str] = None,
return_toon: bool = False,
):
"""Send a crawl request with support for both AI extraction and
Expand All @@ -887,6 +888,7 @@ async def crawl(
Supports wildcards: * matches any characters, ** matches any path segments
exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
Supports wildcards and takes precedence over include_paths
webhook_url: URL to receive webhook notifications when the crawl completes
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
"""
logger.info("🔍 Starting crawl request")
Expand Down Expand Up @@ -916,6 +918,8 @@ async def crawl(
logger.debug(f"✅ Include paths: {include_paths}")
if exclude_paths:
logger.debug(f"❌ Exclude paths: {exclude_paths}")
if webhook_url:
logger.debug(f"🔔 Webhook URL: {webhook_url}")
if return_toon:
logger.debug("🎨 TOON format output enabled")

Expand Down Expand Up @@ -945,6 +949,8 @@ async def crawl(
request_data["include_paths"] = include_paths
if exclude_paths is not None:
request_data["exclude_paths"] = exclude_paths
if webhook_url is not None:
request_data["webhook_url"] = webhook_url
Comment on lines 949 to +953
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Async crawl currently supports webhook_url, but the async client tests don’t verify the outgoing request payload includes webhook_url when set. Consider adding an async test that asserts the POST body contains webhook_url (and omits it when None) to prevent regressions.

Copilot uses AI. Check for mistakes.

request = CrawlRequest(**request_data)
logger.debug("✅ Request validation passed")
Expand Down
6 changes: 6 additions & 0 deletions scrapegraph-py/scrapegraph_py/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,7 @@ def crawl(
stealth: bool = False,
include_paths: Optional[list[str]] = None,
exclude_paths: Optional[list[str]] = None,
webhook_url: Optional[str] = None,
return_toon: bool = False,
):
"""Send a crawl request with support for both AI extraction and
Expand All @@ -897,6 +898,7 @@ def crawl(
Supports wildcards: * matches any characters, ** matches any path segments
exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
Supports wildcards and takes precedence over include_paths
webhook_url: URL to receive webhook notifications when the crawl completes
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
"""
logger.info("🔍 Starting crawl request")
Expand Down Expand Up @@ -926,6 +928,8 @@ def crawl(
logger.debug(f"✅ Include paths: {include_paths}")
if exclude_paths:
logger.debug(f"❌ Exclude paths: {exclude_paths}")
if webhook_url:
logger.debug(f"🔔 Webhook URL: {webhook_url}")
if return_toon:
logger.debug("🎨 TOON format output enabled")

Expand Down Expand Up @@ -955,6 +959,8 @@ def crawl(
request_data["include_paths"] = include_paths
if exclude_paths is not None:
request_data["exclude_paths"] = exclude_paths
if webhook_url is not None:
request_data["webhook_url"] = webhook_url
Comment on lines 961 to +963
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The client now adds webhook_url to request_data, but there are no assertions in the existing crawl request-body tests to ensure this field is actually sent when provided (and omitted when None). Please extend the existing crawl tests that inspect request JSON (e.g., in scrapegraph-py/tests/test_crawl_polling.py) to cover webhook_url.

Copilot uses AI. Check for mistakes.

request = CrawlRequest(**request_data)
logger.debug("✅ Request validation passed")
Expand Down
21 changes: 21 additions & 0 deletions scrapegraph-py/scrapegraph_py/models/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ class CrawlRequest(BaseModel):
"Takes precedence over include_paths.",
example=["/admin/*", "/api/**"]
)
webhook_url: Optional[str] = Field(
default=None,
description="URL to receive webhook notifications when the crawl job completes. "
"The webhook will receive a POST request with the crawl results.",
example="https://example.com/webhook"
)

@model_validator(mode="after")
def validate_url(self) -> "CrawlRequest":
Expand Down Expand Up @@ -169,6 +175,21 @@ def validate_path_patterns(self) -> "CrawlRequest":

return self

@model_validator(mode="after")
def validate_webhook_url(self) -> "CrawlRequest":
    """Reject a supplied webhook_url that is blank or not an http(s) URL.

    A ``None`` webhook_url is allowed (the field is optional); any other
    value must be a non-whitespace string beginning with ``http://`` or
    ``https://``.
    """
    url = self.webhook_url
    if url is None:
        # Field not provided — nothing to validate.
        return self
    if not url.strip():
        raise ValueError("Webhook URL cannot be empty")
    # str.startswith accepts a tuple of candidate prefixes.
    if not url.startswith(("http://", "https://")):
        raise ValueError(
            "Invalid webhook URL - must start with http:// or https://"
        )
    return self
Comment on lines +178 to +191
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing test coverage for the newly added webhook_url behavior (accepts valid http(s) URLs, rejects empty/whitespace, rejects non-http(s) schemes, and is included/excluded correctly in model_dump(exclude_none=True)). There are already CrawlRequest validation/serialization tests (e.g., scrapegraph-py/tests/test_crawl_path_filtering.py) that should be extended to cover this field.

Copilot uses AI. Check for mistakes.


class GetCrawlRequest(BaseModel):
"""Request model for get_crawl endpoint"""
Expand Down
Loading