Source code for torvend.spiders.skytorrents

# Copyright (c) 2017 Stephen Bunn (stephen@bunn.io)
# MIT License <https://opensource.org/licenses/MIT>

import re

from .. import (items,)
from ._common import (BaseSpider,)

import furl
import scrapy


[docs]class SkyTorrentsSpider(BaseSpider):

    name = 'skytorrents'
    allowed_domains = [
        'skytorrents.in',
    ]

    @property
    def paging_index(self):
        """ Required property for paging indexing.

        :returns: The starting index of pages
        :rtype: int
        """

        return 1

    @property
    def paging_results(self):
        """ Required property for paging results.

        :returns: The number of results per queried page
        :rtype: int
        """

        return 40

    @property
    def query_scheme(self):
        """ Required property for query scheme.

        :returns: The scheme the query needs
        :rtype: str
        """

        return 'https'

    @property
    def query_path(self):
        """ Required property for the query path.

        :returns: The path the query needs
        :rtype: str
        """

        return '/search/all/ed/{page}/{query}/?l=en-us'

[docs]    def parse(self, response):
        """ Required first level page parser.

        :param response: The response instance from ``start_requests``
        :type response: scrapy.Request
        :returns: Yields additional scrapy requests
        :rtype: list[scrapy.Request]
        """

        soup = self.get_soup(response.text)

        try:
            results = soup\
                .find('div', {'class': 'columns'})\
                .find_all('div', {'class': 'column'})[1]\
                .find('table')\
                .find_all('tr')[1:]
        except AttributeError:
            return

        for result in results:
            torrent = items.Torrent(spider=self.name)

            (name_link, magnet_link,) = result.find('td').find_all('a')[:2]
            torrent['name'] = name_link.text.strip()
            torrent['source'] = furl.furl(response.url).set(
                path=name_link.attrs['href'], args={}
            ).url
            torrent['magnet'] = magnet_link.attrs['href']
            torrent['hash'] = self.parse_infohash(torrent['magnet'])

            (size_div, _, uploaded_div, seeders_div, leechers_div,) = \
                result.find_all('td')[1:]

            torrent['size'] = self.parse_size(size_div.text.strip())
            torrent['uploaded'] = self.parse_datetime(
                uploaded_div.text.strip(),
                formats=[
                    '%m %b %Y',
                ]
            )
            torrent['seeders'] = int(seeders_div.text.strip())
            torrent['leechers'] = int(leechers_div.text.strip())

            # NOTE: skytorrents.in does not categorize torrents
            torrent['categories'] = [items.TorrentCategory.Unknown]
            torrent['uploader'] = None

            yield torrent