Source code for scrapy.extensions.periodic_log

import logging
from datetime import datetime, timezone

from twisted.internet import task

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

logger = logging.getLogger(__name__)


class PeriodicLog:
    """Log basic scraping stats periodically"""

    def __init__(
        self,
        stats,
        interval=60.0,
        ext_stats={},
        ext_delta={},
        ext_timing_enabled=False,
    ):
        self.stats = stats
        self.interval = interval
        self.multiplier = 60.0 / self.interval
        self.task = None
        self.encoder = ScrapyJSONEncoder(sort_keys=True, indent=4)
        self.ext_stats_enabled = bool(ext_stats)
        self.ext_stats_include = ext_stats.get("include", [])
        self.ext_stats_exclude = ext_stats.get("exclude", [])
        self.ext_delta_enabled = bool(ext_delta)
        self.ext_delta_include = ext_delta.get("include", [])
        self.ext_delta_exclude = ext_delta.get("exclude", [])
        self.ext_timing_enabled = ext_timing_enabled

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getfloat("LOGSTATS_INTERVAL")
        if not interval:
            raise NotConfigured
        try:
            ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS")
        except (TypeError, ValueError):
            ext_stats = (
                {"enabled": True}
                if crawler.settings.getbool("PERIODIC_LOG_STATS")
                else None
            )
        try:
            ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA")
        except (TypeError, ValueError):
            ext_delta = (
                {"enabled": True}
                if crawler.settings.getbool("PERIODIC_LOG_DELTA")
                else None
            )

        ext_timing_enabled = crawler.settings.getbool(
            "PERIODIC_LOG_TIMING_ENABLED", False
        )
        if not (ext_stats or ext_delta or ext_timing_enabled):
            raise NotConfigured

        o = cls(
            crawler.stats,
            interval,
            ext_stats,
            ext_delta,
            ext_timing_enabled,
        )
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.time_prev = datetime.now(tz=timezone.utc)
        self.delta_prev = {}
        self.stats_prev = {}

        self.task = task.LoopingCall(self.log)
        self.task.start(self.interval)

    def log(self):
        data = {}
        if self.ext_timing_enabled:
            data.update(self.log_timing())
        if self.ext_delta_enabled:
            data.update(self.log_delta())
        if self.ext_stats_enabled:
            data.update(self.log_crawler_stats())
        logger.info(self.encoder.encode(data))

    def log_delta(self):
        num_stats = {
            k: v
            for k, v in self.stats._stats.items()
            if isinstance(v, (int, float))
            and self.param_allowed(k, self.ext_delta_include, self.ext_delta_exclude)
        }
        delta = {k: v - self.delta_prev.get(k, 0) for k, v in num_stats.items()}
        self.delta_prev = num_stats
        return {"delta": delta}

    def log_timing(self):
        now = datetime.now(tz=timezone.utc)
        time = {
            "log_interval": self.interval,
            "start_time": self.stats._stats["start_time"],
            "utcnow": now,
            "log_interval_real": (now - self.time_prev).total_seconds(),
            "elapsed": (now - self.stats._stats["start_time"]).total_seconds(),
        }
        self.time_prev = now
        return {"time": time}

    def log_crawler_stats(self):
        stats = {
            k: v
            for k, v in self.stats._stats.items()
            if self.param_allowed(k, self.ext_stats_include, self.ext_stats_exclude)
        }
        return {"stats": stats}

    def param_allowed(self, stat_name, include, exclude):
        if not include and not exclude:
            return True
        for p in exclude:
            if p in stat_name:
                return False
        if exclude and not include:
            return True
        for p in include:
            if p in stat_name:
                return True
        return False

    def spider_closed(self, spider, reason):
        self.log()
        if self.task and self.task.running:
            self.task.stop()
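
Below is a minimal sketch of how this extension might be enabled from a project's settings.py. The extension path and the setting names come from the source above; the interval value and the stat-name substrings passed to "include"/"exclude" are illustrative assumptions, not values the source prescribes.

# Minimal sketch: enabling PeriodicLog in settings.py (illustrative values).
EXTENSIONS = {
    "scrapy.extensions.periodic_log.PeriodicLog": 0,
}
LOGSTATS_INTERVAL = 60.0  # from_crawler() raises NotConfigured when this is falsy

# A dict opts in with optional substring filters; see param_allowed() above.
PERIODIC_LOG_STATS = {
    "include": ["downloader/", "scheduler/"],  # assumed stat-key substrings
    "exclude": ["start_time"],
}
PERIODIC_LOG_DELTA = {"enabled": True}  # per-interval differences of numeric stats
PERIODIC_LOG_TIMING_ENABLED = True

Per param_allowed(), filtering is by substring match against the stat key: with no filters everything passes, any "exclude" hit rejects a key, an exclude-only configuration accepts whatever survives the exclusions, and otherwise a key must match at least one "include" entry.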