# Source code for torvend.spiders.onethreethreesevenx

# Copyright (c) 2017 Stephen Bunn (stephen@bunn.io)
# GNU GPLv3 <https://www.gnu.org/licenses/gpl-3.0.txt>

import re

from .. import (items,)
from ._common import (BaseSpider,)

import furl
import scrapy


class OneThreeThreeSevenXSpider(BaseSpider):
    """ The spider for 1337x.to.
    """

    name = '1337x'
    allowed_domains = [
        '1337x.to',
    ]
    # maps 1337x category slugs to torvend's torrent categories
    _category_map = {
        'movies': items.TorrentCategory.Video,
        'documentaries': items.TorrentCategory.Video,
        'music': items.TorrentCategory.Audio,
        'apps': items.TorrentCategory.Application,
        'games': items.TorrentCategory.Game,
        'xxx': items.TorrentCategory.Adult,
        'tv': items.TorrentCategory.Video,
        'other': items.TorrentCategory.Unknown,
    }

    @property
    def paging_index(self):
        """ Required property for the paging index.

        :returns: The starting index of pages
        :rtype: int
        """

        return 1

    @property
    def paging_results(self):
        """ Required property for paging results.

        :returns: The number of results per queried page
        :rtype: int
        """

        return 20

    @property
    def query_scheme(self):
        """ Required property for the query scheme.

        :returns: The scheme the query needs
        :rtype: str
        """

        return 'http'

    @property
    def query_path(self):
        """ Required property for the query path.

        :returns: The path the query needs
        :rtype: str
        """

        return '/search/{query}/{page}/'

    def _parse_torrent(self, response):
        """ Handle parsing torrent info.

        :param response: The response instance for a torrent's detail page
        :type response: scrapy.http.Response
        :returns: Yields torrent items
        :rtype: list[items.Torrent]
        """

        torrent = response.meta['torrent']
        soup = self.get_soup(response.text)
        result = soup.find('div', {'class': 'box-info-detail'})
        # the category name lives in the first list entry of the
        # torrent-category-detail block
        torrent['categories'] = [
            self._category_map.get(
                result.find(
                    'div', {'class': 'torrent-category-detail'}
                ).find('ul', {'class': 'list'}).find('li').find(
                    'span'
                ).contents[0].strip().lower(),
                items.TorrentCategory.Unknown
            )
        ]
        torrent['magnet'] = result.find(
            'a', {'href': re.compile(r'^magnet:.*')}
        ).attrs['href']
        torrent['hash'] = result.find(
            'div', {'class': 'infohash-box'}
        ).find('span').contents[0].strip().lower()
        yield torrent

    def parse(self, response):
        """ Required first level page parser.

        :param response: The response instance from ``start_requests``
        :type response: scrapy.http.Response
        :returns: Yields additional scrapy requests
        :rtype: list[scrapy.Request]
        """

        soup = self.get_soup(response.text)
        try:
            results = soup\
                .find('div', {'class': 'inner-table'})\
                .find('table')\
                .find('tbody')\
                .find_all('tr')
        except AttributeError:
            # a missing results table means an empty results page
            return

        for result in results:
            torrent = items.Torrent(spider=self.name)
            name_link = result\
                .find('td', {'class': 'name'})\
                .find('a', {'href': re.compile(r'^/torrent/(?:\d+)/.*')})
            torrent['name'] = name_link.contents[0].strip()
            # rebuild the detail-page url on the same host as the response
            torrent['source'] = furl.furl(response.url).set(
                path=name_link.attrs['href'],
                args={}
            ).url
            torrent['seeders'] = int(result.find(
                'td', {'class': 'seeds'}
            ).contents[0].strip())
            torrent['leechers'] = int(result.find(
                'td', {'class': 'leeches'}
            ).contents[0].strip())
            torrent['uploaded'] = self.parse_datetime(result.find(
                'td', {'class': 'coll-date'}
            ).contents[0].strip())
            torrent['size'] = self.parse_size(result.find(
                'td', {'class': 'size'}
            ).contents[0].strip())
            torrent['uploader'] = result.find(
                'td', {'class': 'coll-5'}
            ).find('a').contents[0].strip()

            # handle the additional request for the torrent's detail page,
            # carrying the partially filled item along in the request meta
            torrent_request = scrapy.Request(
                torrent['source'],
                callback=self._parse_torrent
            )
            torrent_request.meta['torrent'] = torrent
            yield torrent_request
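

if __name__ == '__main__':
    # A minimal sketch for exercising the spider by hand (run with
    # ``python -m torvend.spiders.onethreethreesevenx`` so the relative
    # imports resolve). This is not part of torvend's public API: it assumes
    # the shared ``BaseSpider`` accepts the search term as a ``query``
    # keyword and formats it into ``query_path``, so the first request here
    # would target http://1337x.to/search/ubuntu/1/.
    from scrapy import signals
    from scrapy.crawler import CrawlerProcess

    collected = []

    def _collect(item, response, spider):
        # stash each torrent item as it is emitted by ``_parse_torrent``
        collected.append(item)

    process = CrawlerProcess(settings={'LOG_ENABLED': False})
    crawler = process.create_crawler(OneThreeThreeSevenXSpider)
    crawler.signals.connect(_collect, signal=signals.item_scraped)
    process.crawl(crawler, query='ubuntu')
    process.start()  # blocks until the crawl (and the reactor) finishes

    for torrent in collected[:5]:
        print(torrent['name'], torrent['magnet'])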