from __future__ import annotations

from email.utils import formatdate
from typing import TYPE_CHECKING

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.misc import load_object

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http.request import Request
    from scrapy.http.response import Response
    from scrapy.settings import Settings
    from scrapy.spiders import Spider
    from scrapy.statscollectors import StatsCollector


class HttpCacheMiddleware:
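    """Downloader middleware providing low-level HTTP caching.

    Behaviour is delegated to the cache policy and storage backend loaded
    from ``HTTPCACHE_POLICY`` and ``HTTPCACHE_STORAGE``; the middleware is
    only active when ``HTTPCACHE_ENABLED`` is set.
    """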

    DOWNLOAD_EXCEPTIONS = (
        defer.TimeoutError,
        TimeoutError,
        DNSLookupError,
        ConnectionRefusedError,
        ConnectionDone,
        ConnectError,
        ConnectionLost,
        TCPTimedOutError,
        ResponseFailed,
        OSError,
    )

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        if not settings.getbool("HTTPCACHE_ENABLED"):
            raise NotConfigured
        self.policy = load_object(settings["HTTPCACHE_POLICY"])(settings)
        self.storage = load_object(settings["HTTPCACHE_STORAGE"])(settings)
        self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING")
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        assert crawler.stats
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider: Spider) -> None:
        self.storage.open_spider(spider)

    def spider_closed(self, spider: Spider) -> None:
        self.storage.close_spider(spider)

    def process_request(
        self, request: Request, spider: Spider
    ) -> Request | Response | None:
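        """Return a fresh cached response if one exists; otherwise return
        ``None`` so the request goes to the downloader, keeping any stale
        cached response in ``request.meta`` for later revalidation."""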
        if request.meta.get("dont_cache", False):
            return None

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta["_dont_cache"] = True  # flag as uncacheable
            return None

        # Look for cached response and check if expired
        cachedresponse: Response | None = self.storage.retrieve_response(
            spider, request
        )
        if cachedresponse is None:
            self.stats.inc_value("httpcache/miss", spider=spider)
            if self.ignore_missing:
                self.stats.inc_value("httpcache/ignore", spider=spider)
                raise IgnoreRequest(f"Ignored request not in cache: {request}")
            return None  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append("cached")
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value("httpcache/hit", spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta["cached_response"] = cachedresponse
        return None

    def process_response(
        self, request: Request, response: Response, spider: Spider
    ) -> Request | Response:
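        """Store cacheable downloaded responses and, when a cached copy was
        kept in ``request.meta``, let the policy decide whether that copy is
        still valid (revalidation) or must be replaced."""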
        if request.meta.get("dont_cache", False):
            return response

        # Skip cached responses and uncacheable requests
        if "cached" in response.flags or "_dont_cache" in request.meta:
            request.meta.pop("_dont_cache", None)
            return response

        # RFC2616 requires origin server to set Date header,
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if "Date" not in response.headers:
            response.headers["Date"] = formatdate(usegmt=True)

        # Do not validate first-hand responses
        cachedresponse: Response | None = request.meta.pop("cached_response", None)
        if cachedresponse is None:
            self.stats.inc_value("httpcache/firsthand", spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response, request):
            self.stats.inc_value("httpcache/revalidate", spider=spider)
            return cachedresponse

        self.stats.inc_value("httpcache/invalidate", spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def process_exception(
        self, request: Request, exception: Exception, spider: Spider
    ) -> Request | Response | None:
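        """Fall back to the stale cached response when the download fails
        with one of the network errors listed in ``DOWNLOAD_EXCEPTIONS``."""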
        cachedresponse: Response | None = request.meta.pop("cached_response", None)
        if cachedresponse is not None and isinstance(
            exception, self.DOWNLOAD_EXCEPTIONS
        ):
            self.stats.inc_value("httpcache/errorrecovery", spider=spider)
            return cachedresponse
        return None

    def _cache_response(
        self,
        spider: Spider,
        response: Response,
        request: Request,
        cachedresponse: Response | None,
    ) -> None:
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value("httpcache/store", spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value("httpcache/uncacheable", spider=spider)
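

# Illustrative usage sketch, not part of the middleware itself: the middleware is
# driven entirely by project settings (the setting names below are the ones read
# in __init__ above). The policy/storage class paths are shown only as example
# values, assuming Scrapy's bundled scrapy.extensions.httpcache backends.
#
#   HTTPCACHE_ENABLED = True
#   HTTPCACHE_IGNORE_MISSING = False
#   HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
#   HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"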