# Source code for scrapy.downloadermiddlewares.httpcompression

import warnings
from logging import getLogger

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils._compression import (
    _DecompressionMaxSizeExceeded,
    _inflate,
    _unbrotli,
    _unzstd,
)
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip

# Module-level logger for this middleware.
logger = getLogger(__name__)

# Content codings this module is willing to advertise. gzip and deflate are
# always listed; the optional codecs below are added only when their package
# can be imported. Order matters to anything that joins this list.
ACCEPTED_ENCODINGS = [b"gzip", b"deflate"]

# Brotli ("br") support is optional.
try:
    import brotli  # noqa: F401
except ImportError:
    pass
else:
    ACCEPTED_ENCODINGS += [b"br"]

# Zstandard ("zstd") support is optional.
try:
    import zstandard  # noqa: F401
except ImportError:
    pass
else:
    ACCEPTED_ENCODINGS += [b"zstd"]


class HttpCompressionMiddleware:
    """Downloader middleware that handles compressed HTTP response bodies.

    Requests get a default ``Accept-Encoding`` header built from
    ``ACCEPTED_ENCODINGS``. Responses carrying a ``Content-Encoding`` header
    have their body decompressed (gzip/x-gzip, deflate, and br/zstd when those
    are listed in ``ACCEPTED_ENCODINGS``), subject to a maximum decompressed
    size and a warning threshold.
    """

    def __init__(self, stats=None, *, crawler=None):
        """Initialise the middleware.

        :param stats: stats collector used for the ``httpcompression/*``
            counters; only used when *crawler* is not given.
        :param crawler: when given, stats and the size limits
            (``DOWNLOAD_MAXSIZE`` / ``DOWNLOAD_WARNSIZE``) are read from it and
            ``open_spider`` is connected to the ``spider_opened`` signal.
        """
        if not crawler:
            # Crawler-less construction: fall back to fixed limits.
            self.stats = stats
            self._max_size = 1073741824  # 1 GiB
            self._warn_size = 33554432  # 32 MiB
            return
        self.stats = crawler.stats
        self._max_size = crawler.settings.getint("DOWNLOAD_MAXSIZE")
        self._warn_size = crawler.settings.getint("DOWNLOAD_WARNSIZE")
        crawler.signals.connect(self.open_spider, signals.spider_opened)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from *crawler*.

        :raises NotConfigured: if the ``COMPRESSION_ENABLED`` setting is false.

        Subclasses whose ``__init__`` does not accept ``crawler`` raise
        ``TypeError`` on the first attempt; they are then built without
        arguments and configured by hand, with a deprecation warning.
        """
        if not crawler.settings.getbool("COMPRESSION_ENABLED"):
            raise NotConfigured
        try:
            return cls(crawler=crawler)
        except TypeError:
            warnings.warn(
                "HttpCompressionMiddleware subclasses must either modify "
                "their '__init__' method to support a 'crawler' parameter or "
                "reimplement their 'from_crawler' method.",
                ScrapyDeprecationWarning,
            )
            mw = cls()
            mw.stats = crawler.stats
            mw._max_size = crawler.settings.getint("DOWNLOAD_MAXSIZE")
            mw._warn_size = crawler.settings.getint("DOWNLOAD_WARNSIZE")
            crawler.signals.connect(mw.open_spider, signals.spider_opened)
            return mw

    def open_spider(self, spider):
        """Let spider attributes override the configured size limits."""
        if hasattr(spider, "download_maxsize"):
            self._max_size = spider.download_maxsize
        if hasattr(spider, "download_warnsize"):
            self._warn_size = spider.download_warnsize

    def process_request(self, request, spider):
        """Advertise the accepted encodings unless the request already set them."""
        request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))

    def process_response(self, request, response, spider):
        """Return *response* with its body decompressed when it is encoded.

        The response is returned untouched for ``HEAD`` requests, for objects
        that are not ``Response`` instances, and when there is no
        ``Content-Encoding`` header. Per-request ``download_maxsize`` and
        ``download_warnsize`` meta keys override the middleware limits.

        :raises IgnoreRequest: if decompression exceeds the maximum size.
        """
        if request.method == "HEAD":
            return response
        if not isinstance(response, Response):
            return response

        content_encoding = response.headers.getlist("Content-Encoding")
        if not content_encoding:
            return response

        # Only the last listed encoding is decoded here.
        # NOTE(review): presumably the outermost coding applied by the
        # server; confirm against the HTTP Content-Encoding semantics.
        encoding = content_encoding.pop()
        max_size = request.meta.get("download_maxsize", self._max_size)
        warn_size = request.meta.get("download_warnsize", self._warn_size)
        try:
            # NOTE(review): for an encoding _decode does not recognise, the
            # body comes back unchanged, yet the encoding has already been
            # popped and the stats below are still incremented; confirm this
            # is intended.
            decoded_body = self._decode(response.body, encoding.lower(), max_size)
        except _DecompressionMaxSizeExceeded as exc:
            # Chain the cause so the traceback shows this is a deliberate
            # translation of the size-limit error, not a failure in handling.
            raise IgnoreRequest(
                f"Ignored response {response} because its body "
                f"({len(response.body)} B) exceeded DOWNLOAD_MAXSIZE "
                f"({max_size} B) during decompression."
            ) from exc

        # Warn only when decompression is what pushed the body over the
        # threshold (the compressed body was still below it).
        if len(response.body) < warn_size <= len(decoded_body):
            logger.warning(
                f"{response} body size after decompression "
                f"({len(decoded_body)} B) is larger than the "
                f"download warning size ({warn_size} B)."
            )
        if self.stats:
            self.stats.inc_value(
                "httpcompression/response_bytes",
                len(decoded_body),
                spider=spider,
            )
            self.stats.inc_value("httpcompression/response_count", spider=spider)

        # Re-guess the response class now that the real body is available.
        respcls = responsetypes.from_args(
            headers=response.headers, url=response.url, body=decoded_body
        )
        kwargs = {"cls": respcls, "body": decoded_body}
        if issubclass(respcls, TextResponse):
            # force recalculating the encoding until we make sure the
            # responsetypes guessing is reliable
            kwargs["encoding"] = None
        response = response.replace(**kwargs)
        if not content_encoding:
            # No encodings remain after popping: drop the header entirely.
            del response.headers["Content-Encoding"]
        return response

    def _decode(self, body, encoding, max_size):
        """Decompress *body* according to *encoding* (lower-case bytes).

        *max_size* is forwarded to the decompression helper. The body is
        returned unchanged when *encoding* is not handled, including ``br`` or
        ``zstd`` when they are not listed in ``ACCEPTED_ENCODINGS``.
        """
        if encoding in (b"gzip", b"x-gzip"):
            return gunzip(body, max_size=max_size)
        if encoding == b"deflate":
            return _inflate(body, max_size=max_size)
        if encoding == b"br" and b"br" in ACCEPTED_ENCODINGS:
            return _unbrotli(body, max_size=max_size)
        if encoding == b"zstd" and b"zstd" in ACCEPTED_ENCODINGS:
            return _unzstd(body, max_size=max_size)
        return body