"""
HTTP basic auth downloader middleware
See documentation in docs/topics/downloader-middleware.rst
"""
import warnings
from w3lib.http import basic_auth_header
from scrapy import signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.url import url_is_from_any_domain
class HttpAuthMiddleware:
    """Set Basic HTTP Authorization header
    (http_user and http_pass spider class attributes)"""

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and hook it to ``spider_opened`` so the
        credentials can be read from the spider once it exists."""
        o = cls()
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        """Read ``http_user``/``http_pass`` from the spider and precompute
        the Authorization header; also record which domain(s) the header
        may be sent to (``http_auth_domain``)."""
        usr = getattr(spider, "http_user", "")
        pwd = getattr(spider, "http_pass", "")
        if usr or pwd:
            self.auth = basic_auth_header(usr, pwd)
            if not hasattr(spider, "http_auth_domain"):
                # Legacy spiders that never declared http_auth_domain used to
                # leak credentials to every domain; warn and fall back to
                # pinning the domain of the first request seen.
                warnings.warn(
                    "Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security "
                    "problems if the spider makes requests to several different domains. http_auth_domain "
                    "will be set to the domain of the first request, please set it to the correct value "
                    "explicitly.",
                    category=ScrapyDeprecationWarning,
                )
                self.domain_unset = True
            else:
                # May be None/empty, which means "send auth to any domain".
                self.domain = spider.http_auth_domain
                self.domain_unset = False

    def process_request(self, request, spider):
        """Attach the precomputed Authorization header to *request* when
        credentials exist, the request does not already carry one, and the
        request's domain matches the configured ``http_auth_domain``."""
        auth = getattr(self, "auth", None)
        if auth and b"Authorization" not in request.headers:
            domain = urlparse_cached(request).hostname
            if self.domain_unset:
                # Deprecated fallback: lock auth to the first request's host.
                self.domain = domain
                self.domain_unset = False
            if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
                request.headers[b"Authorization"] = auth