# Source code for torvend.spiders.idope

# Copyright (c) 2017 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

from .. import (items,)
from ._common import (BaseSpider,)

import furl


class IDopeSpider(BaseSpider):
    """ The spider for idope.se.
    """

    name = 'idope'
    allowed_domains = [
        'idope.se',
    ]

    # Maps the (lowercased) category label shown on a result row to the
    # project's category enum; labels missing from this map fall back to
    # ``items.TorrentCategory.Unknown`` in ``parse``.
    _category_map = {
        'music': items.TorrentCategory.Audio,
        'tv': items.TorrentCategory.Video,
        'anime': items.TorrentCategory.Video,
        'apps': items.TorrentCategory.Application,
        'books': items.TorrentCategory.Book,
        'xxx': items.TorrentCategory.Adult,
        'images': items.TorrentCategory.Image,
        'games': items.TorrentCategory.Game,
        'others': items.TorrentCategory.Unknown,
    }

    @property
    def paging_index(self):
        """ Required property for paging indexing.

        :returns: The starting index of pages
        :rtype: int
        """

        return 1

    @property
    def paging_results(self):
        """ Required property for paging results.

        :returns: The number of results per queried page
        :rtype: int
        """

        return 10

    @property
    def query_scheme(self):
        """ Required property for query scheme.

        :returns: The scheme the query needs
        :rtype: str
        """

        return 'https'

    @property
    def query_path(self):
        """ Required property for the query path.

        :returns: The path the query needs
        :rtype: str
        """

        return '/torrent-list/{query}/?p={page}'

    def parse(self, response):
        """ Required first level page parser.

        Builds one torrent item per ``resultdiv`` element found inside the
        ``div2child`` container of the page. If that container is missing
        from the page, nothing is yielded.

        :param response: The response instance from ``start_requests``
        :type response: scrapy.http.Response
        :returns: Yields torrent items
        :rtype: collections.abc.Iterator[items.Torrent]
        """

        soup = self.get_soup(response.text)

        # ``find`` returns None when the container is absent; yield nothing
        # in that case instead of raising AttributeError on ``find_all``
        container = soup.find('div', {'id': 'div2child'})
        if container is None:
            return

        for result in container.find_all('div', {'class': 'resultdiv'}):
            torrent = items.Torrent(spider=self.name)
            torrent['name'] = result.find(
                'div', {'class': 'resultdivtopname'}
            ).contents[0].strip()
            # rebuild the detail page url from the first link of the result,
            # dropping the query arguments of the listing url
            torrent['source'] = furl.furl(response.url).set(
                path=result.find('a').attrs['href'],
                args={}
            ).url
            torrent['categories'] = [
                self._category_map.get(
                    result.find(
                        'div', {'class': 'resultdivbottoncategory'}
                    ).contents[0].strip().lower(),
                    items.TorrentCategory.Unknown
                )
            ]

            info_hash = result.find(
                'div', {'class': 'hideinfohash'}
            ).contents[0].strip()
            torrent['hash'] = info_hash.lower()
            # the magnet link keeps the hash exactly as the page reports it
            torrent['magnet'] = (
                'magnet:?xt=urn:btih:{info_hash}&dn'
            ).format(info_hash=info_hash)

            torrent['seeders'] = int(result.find(
                'div', {'class': 'resultdivbottonseed'}
            ).contents[0])
            torrent['size'] = self.parse_size(result.find(
                'div', {'class': 'resultdivbottonlength'}
            ).contents[0])
            # the page text is suffixed with "ago" before being handed to
            # ``parse_datetime``
            torrent['uploaded'] = self.parse_datetime((
                '{0} ago'
            ).format(result.find(
                'div', {'class': 'resultdivbottontime'}
            ).contents[0]))

            # handle non-reported torrent fields
            torrent['leechers'] = 0
            torrent['uploader'] = None

            yield torrent