# Source code for scrapy.spidermiddlewares.base

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterable

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http import Response


class BaseSpiderMiddleware:
    """Optional base class for spider middlewares.

    .. versionadded:: 2.13

    This class provides helper methods for asynchronous
    ``process_spider_output()`` and ``process_start()`` methods. Middlewares
    that don't have either of these methods don't need to use this class.

    You can override the
    :meth:`~scrapy.spidermiddlewares.base.BaseSpiderMiddleware.get_processed_request`
    method to add processing code for requests and the
    :meth:`~scrapy.spidermiddlewares.base.BaseSpiderMiddleware.get_processed_item`
    method to add processing code for items. These methods take a single
    request or item from the spider output iterable and return a request or
    item (the same or a new one), or ``None`` to remove this request or item
    from the processing.
    """

    def __init__(self, crawler: Crawler):
        """Store the crawler on the instance as ``self.crawler``.

        :param crawler: the crawler this middleware is attached to
        :type crawler: :class:`~scrapy.crawler.Crawler` object
        """
        self.crawler: Crawler = crawler

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Build the middleware by calling ``cls(crawler)``.

        :param crawler: the crawler passed on to ``__init__``
        :type crawler: :class:`~scrapy.crawler.Crawler` object

        :return: a new instance of ``cls``
        """
        return cls(crawler)

    def process_start_requests(
        self, start: Iterable[Any], spider: Spider
    ) -> Iterable[Any]:
        """Yield the processed, non-dropped objects of a synchronous ``start``.

        Every object is passed through the per-object hooks with ``None`` as
        the response; objects for which the hook returns ``None`` are
        skipped.

        :param start: the iterable of start objects
        :param spider: accepted as part of the signature; not used here
        """
        for o in start:
            processed = self._get_processed(o, None)
            # Only ``None`` means "drop"; other falsy values are kept.
            if processed is not None:
                yield processed

    async def process_start(self, start: AsyncIterator[Any]) -> AsyncIterator[Any]:
        """Yield the processed, non-dropped objects of an asynchronous ``start``.

        Asynchronous counterpart of :meth:`process_start_requests`: every
        object is passed through the per-object hooks with ``None`` as the
        response, and ``None`` results are skipped.

        :param start: the asynchronous iterator of start objects
        """
        async for o in start:
            processed = self._get_processed(o, None)
            if processed is not None:
                yield processed

    def process_spider_output(
        self, response: Response, result: Iterable[Any], spider: Spider
    ) -> Iterable[Any]:
        """Yield the processed, non-dropped objects of a synchronous ``result``.

        Every object is passed through the per-object hooks together with
        ``response``; objects for which the hook returns ``None`` are
        skipped.

        :param response: the response handed to the per-object hooks
        :param result: the iterable of objects to process
        :param spider: accepted as part of the signature; not used here
        """
        for o in result:
            processed = self._get_processed(o, response)
            if processed is not None:
                yield processed

    async def process_spider_output_async(
        self, response: Response, result: AsyncIterator[Any], spider: Spider
    ) -> AsyncIterator[Any]:
        """Yield the processed, non-dropped objects of an asynchronous ``result``.

        Asynchronous counterpart of :meth:`process_spider_output`.

        :param response: the response handed to the per-object hooks
        :param result: the asynchronous iterator of objects to process
        :param spider: accepted as part of the signature; not used here
        """
        async for o in result:
            processed = self._get_processed(o, response)
            if processed is not None:
                yield processed

    def _get_processed(self, o: Any, response: Response | None) -> Any:
        """Dispatch ``o`` to the request hook or the item hook.

        :class:`~scrapy.Request` instances go to
        :meth:`get_processed_request`; everything else goes to
        :meth:`get_processed_item`.

        :return: whatever the selected hook returns
        """
        if isinstance(o, Request):
            return self.get_processed_request(o, response)
        return self.get_processed_item(o, response)

    def get_processed_request(
        self, request: Request, response: Response | None
    ) -> Request | None:
        """Return a processed request from the spider output.

        This method is called with a single request from the start seeds or the
        spider output. It should return the same or a different request, or
        ``None`` to ignore it.

        :param request: the input request
        :type request: :class:`~scrapy.Request` object

        :param response: the response being processed
        :type response: :class:`~scrapy.http.Response` object or ``None`` for
            start seeds

        :return: the processed request or ``None``
        """
        return request

    def get_processed_item(self, item: Any, response: Response | None) -> Any:
        """Return a processed item from the spider output.

        This method is called with a single item from the start seeds or the
        spider output. It should return the same or a different item, or
        ``None`` to ignore it.

        :param item: the input item
        :type item: item object

        :param response: the response being processed
        :type response: :class:`~scrapy.http.Response` object or ``None`` for
            start seeds

        :return: the processed item or ``None``
        """
        return item