Source code for scrapy.spidermiddlewares.depth

"""
Depth Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""

import logging

from scrapy.http import Request

logger = logging.getLogger(__name__)


[docs]class DepthMiddleware: def __init__(self, maxdepth, stats, verbose_stats=False, prio=1): self.maxdepth = maxdepth self.stats = stats self.verbose_stats = verbose_stats self.prio = prio @classmethod def from_crawler(cls, crawler): settings = crawler.settings maxdepth = settings.getint("DEPTH_LIMIT") verbose = settings.getbool("DEPTH_STATS_VERBOSE") prio = settings.getint("DEPTH_PRIORITY") return cls(maxdepth, crawler.stats, verbose, prio) def process_spider_output(self, response, result, spider): self._init_depth(response, spider) return (r for r in result or () if self._filter(r, response, spider)) async def process_spider_output_async(self, response, result, spider): self._init_depth(response, spider) async for r in result or (): if self._filter(r, response, spider): yield r def _init_depth(self, response, spider): # base case (depth=0) if "depth" not in response.meta: response.meta["depth"] = 0 if self.verbose_stats: self.stats.inc_value("request_depth_count/0", spider=spider) def _filter(self, request, response, spider): if not isinstance(request, Request): return True depth = response.meta["depth"] + 1 request.meta["depth"] = depth if self.prio: request.priority -= depth * self.prio if self.maxdepth and depth > self.maxdepth: logger.debug( "Ignoring link (depth > %(maxdepth)d): %(requrl)s ", {"maxdepth": self.maxdepth, "requrl": request.url}, extra={"spider": spider}, ) return False if self.verbose_stats: self.stats.inc_value(f"request_depth_count/{depth}", spider=spider) self.stats.max_value("request_depth_max", depth, spider=spider) return True