from __future__ import annotations
import logging
import warnings
from pathlib import Path
from typing import TYPE_CHECKING
from warnings import warn
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.job import job_dir
from scrapy.utils.request import (
RequestFingerprinter,
RequestFingerprinterProtocol,
referer_str,
)
if TYPE_CHECKING:
from twisted.internet.defer import Deferred
# typing.Self requires Python 3.11
from typing_extensions import Self
from scrapy.crawler import Crawler
from scrapy.http.request import Request
from scrapy.settings import BaseSettings
from scrapy.spiders import Spider
[docs]
class BaseDupeFilter:
"""Dummy duplicate request filtering class (:setting:`DUPEFILTER_CLASS`)
that does not filter out any request."""
@classmethod
def from_settings(cls, settings: BaseSettings) -> Self:
warnings.warn(
f"{cls.__name__}.from_settings() is deprecated, use from_crawler() instead.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
return cls()
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
return cls()
def request_seen(self, request: Request) -> bool:
return False
def open(self) -> Deferred[None] | None:
pass
def close(self, reason: str) -> Deferred[None] | None:
pass
def log(self, request: Request, spider: Spider) -> None:
"""Log that a request has been filtered"""
warn(
"Calling BaseDupeFilter.log() is deprecated.",
ScrapyDeprecationWarning,
stacklevel=2,
)
[docs]
class RFPDupeFilter(BaseDupeFilter):
"""Duplicate request filtering class (:setting:`DUPEFILTER_CLASS`) that
filters out requests with the canonical
(:func:`w3lib.url.canonicalize_url`) :attr:`~scrapy.http.Request.url`,
:attr:`~scrapy.http.Request.method` and :attr:`~scrapy.http.Request.body`.
"""
def __init__(
self,
path: str | None = None,
debug: bool = False,
*,
fingerprinter: RequestFingerprinterProtocol | None = None,
) -> None:
self.file = None
self.fingerprinter: RequestFingerprinterProtocol = (
fingerprinter or RequestFingerprinter()
)
self.fingerprints: set[str] = set()
self.logdupes = True
self.debug = debug
self.logger = logging.getLogger(__name__)
if path:
self.file = Path(path, "requests.seen").open("a+", encoding="utf-8")
self.file.seek(0)
self.fingerprints.update(x.rstrip() for x in self.file)
@classmethod
def from_settings(
cls,
settings: BaseSettings,
*,
fingerprinter: RequestFingerprinterProtocol | None = None,
) -> Self:
warnings.warn(
f"{cls.__name__}.from_settings() is deprecated, use from_crawler() instead.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
return cls._from_settings(settings, fingerprinter=fingerprinter)
@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
assert crawler.request_fingerprinter
return cls._from_settings(
crawler.settings,
fingerprinter=crawler.request_fingerprinter,
)
@classmethod
def _from_settings(
cls,
settings: BaseSettings,
*,
fingerprinter: RequestFingerprinterProtocol | None = None,
) -> Self:
debug = settings.getbool("DUPEFILTER_DEBUG")
return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
def request_seen(self, request: Request) -> bool:
fp = self.request_fingerprint(request)
if fp in self.fingerprints:
return True
self.fingerprints.add(fp)
if self.file:
self.file.write(fp + "\n")
return False
def request_fingerprint(self, request: Request) -> str:
"""Returns a string that uniquely identifies the specified request."""
return self.fingerprinter.fingerprint(request).hex()
def close(self, reason: str) -> None:
if self.file:
self.file.close()
def log(self, request: Request, spider: Spider) -> None:
if self.debug:
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
args = {"request": request, "referer": referer_str(request)}
self.logger.debug(msg, args, extra={"spider": spider})
elif self.logdupes:
msg = (
"Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)"
)
self.logger.debug(msg, {"request": request}, extra={"spider": spider})
self.logdupes = False
assert spider.crawler.stats
spider.crawler.stats.inc_value("dupefilter/filtered", spider=spider)