Source code for scrapy.spiderloader

from __future__ import annotations

import traceback
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING

from zope.interface import implementer

from scrapy.interfaces import ISpiderLoader
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes

if TYPE_CHECKING:
    from types import ModuleType

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy import Request, Spider
    from scrapy.settings import BaseSettings


[docs]@implementer(ISpiderLoader) class SpiderLoader: """ SpiderLoader is a class which locates and loads spiders in a Scrapy project. """ def __init__(self, settings: BaseSettings): self.spider_modules: list[str] = settings.getlist("SPIDER_MODULES") self.warn_only: bool = settings.getbool("SPIDER_LOADER_WARN_ONLY") self._spiders: dict[str, type[Spider]] = {} self._found: defaultdict[str, list[tuple[str, str]]] = defaultdict(list) self._load_all_spiders() def _check_name_duplicates(self) -> None: dupes = [] for name, locations in self._found.items(): dupes.extend( [ f" {cls} named {name!r} (in {mod})" for mod, cls in locations if len(locations) > 1 ] ) if dupes: dupes_string = "\n\n".join(dupes) warnings.warn( "There are several spiders with the same name:\n\n" f"{dupes_string}\n\n This can cause unexpected behavior.", category=UserWarning, ) def _load_spiders(self, module: ModuleType) -> None: for spcls in iter_spider_classes(module): self._found[spcls.name].append((module.__name__, spcls.__name__)) self._spiders[spcls.name] = spcls def _load_all_spiders(self) -> None: for name in self.spider_modules: try: for module in walk_modules(name): self._load_spiders(module) except (ImportError, SyntaxError): if self.warn_only: warnings.warn( f"\n{traceback.format_exc()}Could not load spiders " f"from module '{name}'. " "See above traceback for details.", category=RuntimeWarning, ) else: raise self._check_name_duplicates()
[docs] @classmethod def from_settings(cls, settings: BaseSettings) -> Self: return cls(settings)
[docs] def load(self, spider_name: str) -> type[Spider]: """ Return the Spider class for the given spider name. If the spider name is not found, raise a KeyError. """ try: return self._spiders[spider_name] except KeyError: raise KeyError(f"Spider not found: {spider_name}")
[docs] def find_by_request(self, request: Request) -> list[str]: """ Return the list of spider names that can handle the given request. """ return [ name for name, cls in self._spiders.items() if cls.handles_request(request) ]
[docs] def list(self) -> list[str]: """ Return a list with the names of all spiders available in the project. """ return list(self._spiders.keys())