# Source code for torvend.spiders.onethreethreesevenx

# Copyright (c) 2017 Stephen Bunn (stephen@bunn.io)
# GNU GPLv3 <https://www.gnu.org/licenses/gpl-3.0.txt>

import re

from .. import (items,)
from ._common import (BaseSpider,)

import furl
import scrapy


class OneThreeThreeSevenXSpider(BaseSpider):
    """ The spider for 1337x.to.
    """

    name = '1337x'
    allowed_domains = [
        '1337x.to',
    ]
    # maps 1337x category slugs to torvend's torrent categories
    _category_map = {
        'movies': items.TorrentCategory.Video,
        'documentaries': items.TorrentCategory.Video,
        'music': items.TorrentCategory.Audio,
        'apps': items.TorrentCategory.Application,
        'games': items.TorrentCategory.Game,
        'xxx': items.TorrentCategory.Adult,
        'tv': items.TorrentCategory.Video,
        'other': items.TorrentCategory.Unknown,
    }

    @property
    def paging_index(self):
        """ Required property for the paging index.

        :returns: The starting index of pages
        :rtype: int
        """

        return 1

    @property
    def paging_results(self):
        """ Required property for paging results.

        :returns: The number of results per queried page
        :rtype: int
        """

        return 20

    @property
    def query_scheme(self):
        """ Required property for the query scheme.

        :returns: The scheme the query needs
        :rtype: str
        """

        return 'http'

    @property
    def query_path(self):
        """ Required property for the query path.

        :returns: The path the query needs
        :rtype: str
        """

        return '/search/{query}/{page}/'

    def _parse_torrent(self, response):
        """ Handle parsing torrent info.

        :param response: The response instance for a torrent's detail page
        :type response: scrapy.http.Response
        :returns: Yields torrent items
        :rtype: list[items.Torrent]
        """

        torrent = response.meta['torrent']
        soup = self.get_soup(response.text)
        result = soup.find('div', {'class': 'box-info-detail'})
        # the category name lives in the first list entry of the
        # torrent-category-detail block
        torrent['categories'] = [
            self._category_map.get(
                result.find(
                    'div', {'class': 'torrent-category-detail'}
                ).find('ul', {'class': 'list'}).find('li').find(
                    'span'
                ).contents[0].strip().lower(),
                items.TorrentCategory.Unknown
            )
        ]
        torrent['magnet'] = result.find(
            'a', {'href': re.compile(r'^magnet:.*')}
        ).attrs['href']
        torrent['hash'] = result.find(
            'div', {'class': 'infohash-box'}
        ).find('span').contents[0].strip().lower()
        yield torrent

    def parse(self, response):
        """ Required first level page parser.

        :param response: The response instance from ``start_requests``
        :type response: scrapy.http.Response
        :returns: Yields additional scrapy requests
        :rtype: list[scrapy.Request]
        """

        soup = self.get_soup(response.text)
        try:
            results = soup\
                .find('div', {'class': 'inner-table'})\
                .find('table')\
                .find('tbody')\
                .find_all('tr')
        except AttributeError:
            # a missing results table means an empty results page
            return

        for result in results:
            torrent = items.Torrent(spider=self.name)
            name_link = result\
                .find('td', {'class': 'name'})\
                .find('a', {'href': re.compile(r'^/torrent/(?:\d+)/.*')})
            torrent['name'] = name_link.contents[0].strip()
            # rebuild the detail-page url on the same host as the response
            torrent['source'] = furl.furl(response.url).set(
                path=name_link.attrs['href'],
                args={}
            ).url
            torrent['seeders'] = int(result.find(
                'td', {'class': 'seeds'}
            ).contents[0].strip())
            torrent['leechers'] = int(result.find(
                'td', {'class': 'leeches'}
            ).contents[0].strip())
            torrent['uploaded'] = self.parse_datetime(result.find(
                'td', {'class': 'coll-date'}
            ).contents[0].strip())
            torrent['size'] = self.parse_size(result.find(
                'td', {'class': 'size'}
            ).contents[0].strip())
            torrent['uploader'] = result.find(
                'td', {'class': 'coll-5'}
            ).find('a').contents[0].strip()

            # handle the additional request for the torrent's detail page,
            # carrying the partially filled item along in the request meta
            torrent_request = scrapy.Request(
                torrent['source'],
                callback=self._parse_torrent
            )
            torrent_request.meta['torrent'] = torrent
            yield torrent_request
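

if __name__ == '__main__':
    # A minimal sketch for exercising the spider by hand (run with
    # ``python -m torvend.spiders.onethreethreesevenx`` so the relative
    # imports resolve). This is not part of torvend's public API: it assumes
    # the shared ``BaseSpider`` accepts the search term as a ``query``
    # keyword and formats it into ``query_path``, so the first request here
    # would target http://1337x.to/search/ubuntu/1/.
    from scrapy import signals
    from scrapy.crawler import CrawlerProcess

    collected = []

    def _collect(item, response, spider):
        # stash each torrent item as it is emitted by ``_parse_torrent``
        collected.append(item)

    process = CrawlerProcess(settings={'LOG_ENABLED': False})
    crawler = process.create_crawler(OneThreeThreeSevenXSpider)
    crawler.signals.connect(_collect, signal=signals.item_scraped)
    process.crawl(crawler, query='ubuntu')
    process.start()  # blocks until the crawl (and the reactor) finishes

    for torrent in collected[:5]:
        print(torrent['name'], torrent['magnet'])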