Source code for torvend.spiders.torlock

# Copyright (c) 2017 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import re

from .. import (items,)
from ._common import (BaseSpider,)

import furl
import scrapy


class TorlockSpider(BaseSpider):

    name = 'torlock'
    allowed_domains = [
        'torlock.com',
    ]

    _category_map = {
        'movies': items.TorrentCategory.Video,
        'television': items.TorrentCategory.Video,
        'music': items.TorrentCategory.Audio,
        'anime': items.TorrentCategory.Video,
        'software': items.TorrentCategory.Application,
        'games': items.TorrentCategory.Game,
        'ebooks': items.TorrentCategory.Book,
        'audiobook': items.TorrentCategory.Audio,
        'images': items.TorrentCategory.Image,
        'adult': items.TorrentCategory.Adult,
        'other': items.TorrentCategory.Unknown,
    }

    @property
    def paging_index(self):
        """ Required property for page indexing.

        :returns: The starting index of pages
        :rtype: int
        """

        return 1

    @property
    def paging_results(self):
        """ Required property for paging results.

        :returns: The number of results per queried page
        :rtype: int
        """

        return 75

    @property
    def query_scheme(self):
        """ Required property for the query scheme.

        :returns: The scheme the query needs
        :rtype: str
        """

        return 'https'

    @property
    def query_path(self):
        """ Required property for the query path.

        :returns: The path the query needs
        :rtype: str
        """

        return '/all/torrents/{query}/{page}.html'

    def _parse_torrent(self, response):
        """ Handle parsing torrent info.

        :param response: The response instance from the torrent detail request
        :type response: scrapy.http.Response
        :returns: Yields torrent items
        :rtype: list[items.Torrent]
        """

        torrent = response.meta['torrent']
        soup = self.get_soup(response.text)
        # the magnet link lives in the detail page's article table
        torrent['magnet'] = soup\
            .find('article')\
            .find('table')\
            .find('a', {'href': re.compile(r'^magnet:.*')}).attrs['href']
        # the second "well" div holds the category and infohash definition lists
        result = soup\
            .find_all('div', {'class': 'well'})[1]\
            .find('div', {'class': 'row'})\
            .find_all('div')[1]
        (category_div, infohash_div,) = result\
            .find_all('dl', {'class': 'dl-horizontal'})[1:3]
        torrent['hash'] = infohash_div.find('dd').contents[0].lower().strip()
        torrent['categories'] = [
            self._category_map.get(
                category_div.find('dd').find('a').contents[0].strip().lower(),
                items.TorrentCategory.Unknown
            )
        ]
        torrent['uploader'] = None

        yield torrent
    def parse(self, response):
        """ Required first-level page parser.

        :param response: The response instance from ``start_requests``
        :type response: scrapy.http.Response
        :returns: Yields additional scrapy requests
        :rtype: list[scrapy.Request]
        """

        soup = self.get_soup(response.text)
        try:
            # skip the header row of the results table
            results = soup\
                .find('div', {'class': 'panel-default'})\
                .find('table')\
                .find_all('tr')[1:]
        except AttributeError:
            # no results table means no matches for this query page
            return
        for result in results:
            torrent = items.Torrent(spider=self.name)
            name_link = result.find('td').find('div').find('a')
            torrent['name'] = name_link.text.strip()
            torrent['source'] = furl.furl(response.url).set(
                path=name_link.attrs['href'],
                args={}
            ).url
            torrent['uploaded'] = self.parse_datetime(
                result.find('td', {'class': 'td'}).contents[0].strip(),
                formats=[
                    '%m/%d/%Y'
                ]
            )
            torrent['size'] = self.parse_size(
                result.find('td', {'class': 'ts'}).contents[0].strip()
            )
            torrent['seeders'] = int(result.find(
                'td', {'class': 'tul'}
            ).contents[0].strip())
            torrent['leechers'] = int(result.find(
                'td', {'class': 'tdl'}
            ).contents[0].strip())

            # hand the partially populated torrent off to the detail-page parser
            torrent_request = scrapy.Request(
                torrent['source'],
                callback=self._parse_torrent
            )
            torrent_request.meta['torrent'] = torrent
            yield torrent_request
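

For orientation, the ``query_scheme``, ``query_path``, ``paging_index``, and ``paging_results`` properties are consumed by ``BaseSpider`` when it builds search-result URLs. A minimal sketch of that assembly follows; the exact logic lives in ``BaseSpider`` and is an assumption here, as is the choice of ``furl`` for constructing the URL.

    # Hypothetical illustration of how the query properties might combine
    # into a first-page search URL; the real assembly is done by BaseSpider
    # and may differ in detail.
    import furl

    page_url = furl.furl().set(
        scheme='https',            # query_scheme
        host='torlock.com',        # allowed_domains[0]
        path='/all/torrents/{query}/{page}.html'.format(  # query_path
            query='ubuntu',        # user's search query
            page=1,                # paging_index starts pages at 1
        ),
    ).url
    # page_url == 'https://torlock.com/all/torrents/ubuntu/1.html'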
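
Since ``TorlockSpider`` is an ordinary Scrapy spider, it can in principle be driven directly with ``scrapy.crawler.CrawlerProcess``, as sketched below. Note that torvend normally runs its spiders through its own client interface, and the ``query`` keyword passed to the constructor is an assumption about what ``BaseSpider`` accepts.

    # A minimal sketch, assuming BaseSpider accepts a ``query`` keyword and
    # that yielded items flow through the standard Scrapy item pipeline;
    # torvend's own client is the supported entry point.
    from scrapy.crawler import CrawlerProcess

    from torvend.spiders.torlock import TorlockSpider

    process = CrawlerProcess(settings={'LOG_ENABLED': False})
    process.crawl(TorlockSpider, query='ubuntu')
    process.start()  # blocks until the crawl finishes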