Source code for scrapy.downloadermiddlewares.httpauth

"""
HTTP basic auth downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

from w3lib.http import basic_auth_header

from scrapy import Request, Spider, signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.settings import SETTINGS_PRIORITIES
from scrapy.utils.decorators import _warn_spider_arg
from scrapy.utils.url import url_is_from_any_domain

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http import Response



[docs]
class HttpAuthMiddleware:
    """Set Basic HTTP Authorization header."""

    def __init__(self) -> None:
        self._auth: bytes | None = None
        self._domain: str | None = None

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        o = cls()
        usr = crawler.settings.get("HTTPAUTH_USER", "")
        pwd = crawler.settings.get("HTTPAUTH_PASS", "")
        if usr or pwd:
            domain_priority = crawler.settings.getpriority("HTTPAUTH_DOMAIN") or 0
            if domain_priority <= SETTINGS_PRIORITIES["default"]:
                raise ValueError(
                    "HTTPAUTH_DOMAIN must be set when HTTPAUTH_USER or HTTPAUTH_PASS "
                    "is configured. Set it to a domain (e.g. 'example.com') to restrict "
                    "credentials to that domain, or set it to None to send credentials "
                    "with all requests."
                )
            o._auth = basic_auth_header(usr, pwd)
            o._domain = crawler.settings.get("HTTPAUTH_DOMAIN")
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider: Spider) -> None:
        usr = getattr(spider, "http_user", "")
        pwd = getattr(spider, "http_pass", "")
        if usr or pwd:
            warnings.warn(
                "Use the HTTPAUTH_USER, HTTPAUTH_PASS, and HTTPAUTH_DOMAIN settings "
                "instead of the http_user, http_pass, and http_auth_domain spider "
                "attributes. Support for the spider attributes will be removed in a "
                "future version of Scrapy.",
                category=ScrapyDeprecationWarning,
                stacklevel=2,
            )
            self._auth = basic_auth_header(usr, pwd)
            self._domain = spider.http_auth_domain  # type: ignore[attr-defined]

    @_warn_spider_arg
    def process_request(
        self, request: Request, spider: Spider | None = None
    ) -> Request | Response | None:
        if b"Authorization" in request.headers:
            return None
        # Per-request meta overrides
        usr = request.meta.get("http_user", "")
        pwd = request.meta.get("http_pass", "")
        if usr or pwd:
            domain = request.meta.get("http_auth_domain")
            if not domain or url_is_from_any_domain(request.url, [domain]):
                request.headers[b"Authorization"] = basic_auth_header(usr, pwd)
            return None
        # Middleware-level auth
        if self._auth and (
            not self._domain or url_is_from_any_domain(request.url, [self._domain])
        ):
            request.headers[b"Authorization"] = self._auth
        return None