Source code for scrapy.dupefilters

from __future__ import annotations

import logging
import warnings
from pathlib import Path
from typing import TYPE_CHECKING
from warnings import warn

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.job import job_dir
from scrapy.utils.request import (
    RequestFingerprinter,
    RequestFingerprinterProtocol,
    referer_str,
)

if TYPE_CHECKING:
    from twisted.internet.defer import Deferred

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http.request import Request
    from scrapy.settings import BaseSettings
    from scrapy.spiders import Spider



class BaseDupeFilter:
    """Dummy duplicate request filtering class (:setting:`DUPEFILTER_CLASS`)
    that does not filter out any request."""

    @classmethod
    def from_settings(cls, settings: BaseSettings) -> Self:
        warnings.warn(
            f"{cls.__name__}.from_settings() is deprecated, use from_crawler() instead.",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        return cls()

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls()

    def request_seen(self, request: Request) -> bool:
        return False

    def open(self) -> Deferred[None] | None:
        pass

    def close(self, reason: str) -> Deferred[None] | None:
        pass

    def log(self, request: Request, spider: Spider) -> None:
        """Log that a request has been filtered"""
        warn(
            "Calling BaseDupeFilter.log() is deprecated.",
            ScrapyDeprecationWarning,
            stacklevel=2,
        )
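
# --- Usage sketch (not part of the Scrapy source) ---------------------------
# A custom dupe filter is wired in through the DUPEFILTER_CLASS setting. The
# subclass below is a minimal, hypothetical example: it inherits the default
# from_crawler() and only overrides request_seen(), treating any request whose
# URL carries a (made-up) "session" query parameter as already seen.
#
#     from scrapy.dupefilters import BaseDupeFilter
#     from scrapy.http.request import Request
#
#     class NoSessionDupeFilter(BaseDupeFilter):
#         def request_seen(self, request: Request) -> bool:
#             # Filter out requests that carry a session token; let the
#             # rest through (BaseDupeFilter never filters anything).
#             return "session=" in request.url
#
#     # settings.py (module path is illustrative):
#     # DUPEFILTER_CLASS = "myproject.dupefilters.NoSessionDupeFilter"
# -----------------------------------------------------------------------------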

class RFPDupeFilter(BaseDupeFilter):
    """Duplicate request filtering class (:setting:`DUPEFILTER_CLASS`) that
    filters out requests with the canonical
    (:func:`w3lib.url.canonicalize_url`) :attr:`~scrapy.http.Request.url`,
    :attr:`~scrapy.http.Request.method` and
    :attr:`~scrapy.http.Request.body`.
    """

    def __init__(
        self,
        path: str | None = None,
        debug: bool = False,
        *,
        fingerprinter: RequestFingerprinterProtocol | None = None,
    ) -> None:
        self.file = None
        self.fingerprinter: RequestFingerprinterProtocol = (
            fingerprinter or RequestFingerprinter()
        )
        self.fingerprints: set[str] = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = Path(path, "requests.seen").open("a+", encoding="utf-8")
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(
        cls,
        settings: BaseSettings,
        *,
        fingerprinter: RequestFingerprinterProtocol | None = None,
    ) -> Self:
        warnings.warn(
            f"{cls.__name__}.from_settings() is deprecated, use from_crawler() instead.",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        return cls._from_settings(settings, fingerprinter=fingerprinter)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        assert crawler.request_fingerprinter
        return cls._from_settings(
            crawler.settings,
            fingerprinter=crawler.request_fingerprinter,
        )

    @classmethod
    def _from_settings(
        cls,
        settings: BaseSettings,
        *,
        fingerprinter: RequestFingerprinterProtocol | None = None,
    ) -> Self:
        debug = settings.getbool("DUPEFILTER_DEBUG")
        return cls(job_dir(settings), debug, fingerprinter=fingerprinter)

    def request_seen(self, request: Request) -> bool:
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + "\n")
        return False

    def request_fingerprint(self, request: Request) -> str:
        """Returns a string that uniquely identifies the specified request."""
        return self.fingerprinter.fingerprint(request).hex()

    def close(self, reason: str) -> None:
        if self.file:
            self.file.close()

    def log(self, request: Request, spider: Spider) -> None:
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {"request": request, "referer": referer_str(request)}
            self.logger.debug(msg, args, extra={"spider": spider})
        elif self.logdupes:
            msg = (
                "Filtered duplicate request: %(request)s"
                " - no more duplicates will be shown"
                " (see DUPEFILTER_DEBUG to show all duplicates)"
            )
            self.logger.debug(msg, {"request": request}, extra={"spider": spider})
            self.logdupes = False

        assert spider.crawler.stats
        spider.crawler.stats.inc_value("dupefilter/filtered", spider=spider)
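
# --- Usage sketch (not part of the Scrapy source) ---------------------------
# RFPDupeFilter is Scrapy's default DUPEFILTER_CLASS. This standalone sketch
# shows the core behaviour: two requests that differ only in query-parameter
# order canonicalize to the same fingerprint, so the second is reported as
# already seen. The example URLs are made up.
#
#     from scrapy.dupefilters import RFPDupeFilter
#     from scrapy.http.request import Request
#
#     df = RFPDupeFilter()  # no path given: fingerprints stay in memory
#     r1 = Request("https://example.com/page?b=2&a=1")
#     r2 = Request("https://example.com/page?a=1&b=2")  # canonically identical
#     assert df.request_seen(r1) is False  # first sighting, recorded
#     assert df.request_seen(r2) is True   # same canonical URL: filtered
#     df.close(reason="finished")
#
# In a project, passing a path (derived from the JOBDIR setting via job_dir())
# makes the filter persist fingerprints to <path>/requests.seen, so a resumed
# job skips requests that were already crawled before the interruption.
# -----------------------------------------------------------------------------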