35 changes: 27 additions & 8 deletions src/crawlee/_request.py
@@ -34,7 +34,7 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""

-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows overriding the global `max_request_retries` option of
     `BasicCrawler`."""
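
With the field frozen, a per-request retry limit now has to be supplied when the request is created instead of being assigned afterwards. A minimal sketch of the creation-time override, assuming the top-level `Request` export and an illustrative URL:

    from crawlee import Request

    # max_retries lands in the '__crawlee' section of user_data and overrides
    # BasicCrawler's max_request_retries for this request only.
    request = Request.from_url('https://example.com', max_retries=5)
    assert request.max_retries == 5
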
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]

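The extended TypedDict lets option dicts be type-checked before they reach `from_url`. A sketch that assumes `url` is a required key of `RequestOptions`, that the class is importable as shown, and that `EnqueueStrategy` accepts the string value used here; all three are assumptions, so check the actual definitions:

    from crawlee import Request
    from crawlee._request import RequestOptions  # import path assumed

    options: RequestOptions = {
        'url': 'https://example.com',
        'max_retries': 3,
        'enqueue_strategy': 'same-domain',  # assumed EnqueueStrategy value
    }
    request = Request.from_url(**options)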

@docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):

     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered to point
     to the same URL.
@@ -178,17 +180,18 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """

-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""

-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
     """HTTP request method."""

     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""

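Since `unique_key`, `url`, `method`, and `payload` are all declared with `Field(frozen=True)`, pydantic rejects assignment to them once the model exists. A sketch of the resulting behavior, with an illustrative URL:

    import pydantic

    from crawlee import Request

    request = Request.from_url('https://example.com')
    try:
        request.url = 'https://example.com/other'
    except pydantic.ValidationError:
        pass  # frozen fields raise a 'frozen_field' validation error on assignment
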
@@ -250,6 +253,8 @@ def from_url(
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ def from_url(
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows overriding the global
+                `max_request_retries` option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
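
As the docstring above notes, a custom `unique_key` cannot be combined with `always_enqueue`; the guard here raises a `ValueError`. A sketch of the rejected and the accepted call, with an illustrative URL:

    from crawlee import Request

    try:
        Request.from_url('https://example.com', unique_key='my-key', always_enqueue=True)
    except ValueError:
        pass  # custom unique_key plus always_enqueue is rejected

    # Without a custom unique_key, always_enqueue prefixes a random id
    # to the computed unique_key so the request is enqueued again:
    request = Request.from_url('https://example.com', always_enqueue=True)
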
@@ -301,12 +309,27 @@
         if always_enqueue:
             unique_key = f'{crypto_random_object_id()}|{unique_key}'

+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data_dict:
+            user_data_dict['__crawlee'] = crawlee_data
+
         request = cls(
             url=url,
             unique_key=unique_key,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
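
Caller-supplied `user_data` is popped from `kwargs` and merged with the reserved `__crawlee` section rather than overwritten. A sketch of the round trip; the keys shown are illustrative:

    from crawlee import Request

    request = Request.from_url(
        'https://example.com',
        user_data={'label': 'DETAIL'},  # preserved alongside the '__crawlee' entry
        max_retries=3,
    )
    assert request.user_data['label'] == 'DETAIL'
    assert request.max_retries == 3
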
@@ -365,10 +388,6 @@ def max_retries(self) -> int | None:
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries

-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""