Source code for torvend.client

# Copyright (c) 2017 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import inspect

from . import (const, meta, spiders,)

import scrapy.crawler
import scrapy.signals


class TorvendClient(meta.Loggable):
    """ The client for discovering torrents.
    """

    def __init__(self, settings={}, ignored=[], allowed=[], verbose=False):
        """ Initializes the client.

        :param settings: Any additional settings for the scrapy crawler
        :type settings: dict[str,...]
        :param ignored: Any ignored spiders
        :type ignored: list[torvend.spiders._common.BaseSpider]
        :param allowed: Any allowed spiders
        :type allowed: list[torvend.spiders._common.BaseSpider]
        :param bool verbose: A flag to indicate if verbose logging is enabled
        """

        if len(ignored) > 0 and len(allowed) > 0:
            raise ValueError((
                "usage of both 'ignored' and 'allowed' in client '{self}' "
                "is not supported"
            ).format(**locals()))
        (self.settings, self.ignored, self.allowed, self.verbose,) = \
            (settings, ignored, allowed, verbose,)

    @property
    def settings(self):
        """ Overrides for default client scrapy settings.

        :getter: Returns overriding dictionary of client scrapy settings
        :setter: Sets the overriding settings
        :rtype: dict[str,...]
        """

        if not hasattr(self, '_settings'):
            self._settings = {}
        return self._settings

    @settings.setter
    def settings(self, settings):
        """ Sets the overriding client scrapy settings.

        :param settings: The new overriding scrapy settings
        :type settings: dict[str,...]
        :rtype: None
        """

        assert isinstance(settings, dict), (
            "settings must be a dictionary, received '{settings}'"
        ).format(**locals())
        self._settings = settings

    @property
    def ignored(self):
        """ A list of ignored spider classes.

        :getter: Returns a list of ignored spider classes
        :setter: Sets the list of ignored spider classes
        :rtype: list[torvend.spiders._common.BaseSpider]
        """

        if not hasattr(self, '_ignored'):
            self._ignored = []
        return self._ignored

    @ignored.setter
    def ignored(self, ignored):
        """ Sets the list of ignored spider classes.

        :param ignored: A list of ignored spider classes
        :type ignored: list[torvend.spiders._common.BaseSpider]
        :rtype: None
        """

        if ignored:
            assert isinstance(ignored, list) and all(
                inspect.isclass(entry)
                for entry in ignored
            ) and all(
                issubclass(entry, spiders._common.BaseSpider)
                for entry in ignored
            ), (
                "ignored must be a list of spider classes, "
                "received '{ignored}'"
            ).format(**locals())
            if hasattr(self, '_allowed') and len(self._allowed) > 0:
                self.log.debug((
                    'setting ignored spiders clears allowed spiders, '
                    'currently allowed {self._allowed}'
                ).format(**locals()))
                self._allowed = []
        self._ignored = ignored

    @property
    def allowed(self):
        """ A list of allowed spider classes.

        :getter: Returns a list of allowed spider classes
        :setter: Sets the list of allowed spider classes
        :rtype: list[torvend.spiders._common.BaseSpider]
        """

        if not hasattr(self, '_allowed'):
            self._allowed = []
        return self._allowed

    @allowed.setter
    def allowed(self, allowed):
        """ Sets the list of allowed spider classes.

        :param allowed: A list of allowed spider classes
        :type allowed: list[torvend.spiders._common.BaseSpider]
        :rtype: None
        """

        if allowed:
            assert isinstance(allowed, list) and all(
                inspect.isclass(entry)
                for entry in allowed
            ) and all(
                issubclass(entry, spiders._common.BaseSpider)
                for entry in allowed
            ), (
                "allowed must be a list of spider classes, "
                "received '{allowed}'"
            ).format(**locals())
            if hasattr(self, '_ignored') and len(self._ignored) > 0:
                self.log.debug((
                    'setting allowed spiders clears ignored spiders, '
                    'currently ignored {self._ignored}'
                ).format(**locals()))
                self._ignored = []
        self._allowed = allowed

    @property
    def verbose(self):
        """ Indicates if verbose logging is enabled.

        :getter: Returns True if verbose logging is enabled
        :setter: Sets the verbose flag
        :rtype: bool
        """

        if not hasattr(self, '_verbose'):
            self._verbose = False
        return self._verbose

    @verbose.setter
    def verbose(self, verbose):
        """ Sets the verbose flag.

        :param bool verbose: The new verbose flag
        :rtype: None
        """

        assert isinstance(verbose, bool), (
            "verbose must be a boolean, received '{verbose}'"
        ).format(**locals())
        self._verbose = verbose
        const.verbose = verbose

    def _item_callback(self, item, **kwargs):
        """ An item callback for logging purposes.

        :param item: The yielded item
        :param kwargs: Any additional named arguments
        :type kwargs: dict[str,...]
        :rtype: None
        """

        self.log.debug((
            'client `{self}` received item `{item}`, {kwargs}'
        ).format(**locals()))
    def get_spiders(self):
        """ Yields the spider classes usable by the client.

        :returns: A generator of spider classes
        :rtype: generator[torvend.spiders._common.BaseSpider]
        """

        # NOTE: both ``allowed`` and ``ignored`` being non-empty is rejected
        # in ``__init__``, so only one filter list needs checking here
        compare_allowed = len(self.allowed) > 0
        for (_, spider_class,) in inspect.getmembers(
            spiders,
            predicate=inspect.isclass
        ):
            if compare_allowed:
                if spider_class in self.allowed:
                    yield spider_class
            else:
                if spider_class not in self.ignored:
                    yield spider_class
    def search(self, query, callback, results=30):
        """ Starts the search process for a given query.

        .. note:: The callback method must accept at least a positional
            argument named ``item``. This is the discovered torrent item.

        :param str query: The query text to search with
        :param callable callback: A callback which receives torrent items
        :param int results: The minimum number of results for each spider
            to return
        """

        # NOTE: local import to speed up module loading
        import twisted.internet.reactor

        crawler_settings = {
            'BOT_NAME': const.module_name,
            'USER_AGENT': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) '
                'Gecko/20100101 Firefox/39.0'
            ),
            'LOG_ENABLED': self.verbose,
            'DNS_TIMEOUT': 5.0,
            'DOWNLOAD_TIMEOUT': 5.0,
        }
        crawler_settings.update(self.settings)
        crawl_runner = scrapy.crawler.CrawlerRunner(crawler_settings)

        # register client available spiders
        for spider_class in self.get_spiders():
            self.log.debug((
                'registering spider `{spider_class.__name__}` to '
                'crawl runner `{crawl_runner}`'
            ).format(**locals()))
            crawl_runner.crawl(
                spider_class,
                query=query,
                results=results
            )

        for crawler in crawl_runner.crawlers:
            # subscribe crawler item scraped signal to client callback
            crawler.signals.connect(
                self._item_callback,
                scrapy.signals.item_scraped
            )
            # subscribe crawler item scraped signal to user given callback
            self.log.debug((
                'connecting item signal for spider `{crawler}` to '
                '`{callback}`'
            ).format(**locals()))
            crawler.signals.connect(
                callback,
                scrapy.signals.item_scraped
            )

        # begin parallel crawling across domains
        self.log.info((
            'starting crawl for query `{query}` with `{crawler_count}` '
            'different crawlers'
        ).format(crawler_count=len(crawl_runner.crawlers), **locals()))
        delay = crawl_runner.join()
        delay.addBoth(lambda _: twisted.internet.reactor.stop())
        twisted.internet.reactor.run()
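

# Usage sketch (illustrative, not part of the original module): drives the
# client end-to-end with a minimal item callback. The query string, result
# count, and ``print_item`` helper are arbitrary examples; spiders could
# instead be filtered by passing ``allowed=[SomeSpiderClass]`` or
# ``ignored=[SomeSpiderClass]`` with classes from ``torvend.spiders``.
if __name__ == '__main__':

    def print_item(item, **kwargs):
        # the callback must accept an argument named ``item`` (see the note
        # on ``TorvendClient.search``); extra signal arguments such as the
        # response and spider arrive as keyword arguments
        print(item)

    client = TorvendClient(verbose=True)
    # blocks until every registered spider finishes and the reactor stops
    client.search('ubuntu', print_item, results=10)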