I have the following spider that crawls data.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MegamillionsSpider(CrawlSpider):
    name = 'megamillions'
    allowed_domains = ['nylottery.ny.gov']
    start_urls = ['https://nylottery.ny.gov/mega-millions/past-winning-numbers']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="view-content"]'), callback='parse_item',
             follow=True),
        Rule(LinkExtractor(restrict_xpaths='//a[@title="Go to next page"]'))
    )

    def parse_item(self, response):
        for r in response.xpath('//div[@class="accordion-list"]//div[contains(@class,"views-row")]'):
            data = {}
            date = r.xpath('.//*[contains(@class,"result-date")]/text()').get()
            wn = r.xpath('.//span[contains(@class,"numbers")]/text()').get()
            mega_ball = r.xpath('.//p[contains(@class,"bonus-number")]/text()').get()
            data = {
                'date': date,
                'winning_numbers': wn,
                'mega_ball': int(mega_ball)
            }
            print(data)

The spider successfully follows the links, but it only returns data from the first (start) URL.

{'date': ' 03/17/2020', 'winning_numbers': '20-27-28-58-59', 'mega_ball': 25}
{'date': ' 03/13/2020', 'winning_numbers': '07-22-37-43-44', 'mega_ball': 22}
{'date': ' 03/10/2020', 'winning_numbers': '06-17-48-54-69', 'mega_ball': 12}
{'date': ' 03/06/2020', 'winning_numbers': '15-48-56-58-70', 'mega_ball': 4}
{'date': ' 03/03/2020', 'winning_numbers': '08-12-33-56-64', 'mega_ball': 2}
{'date': ' 02/28/2020', 'winning_numbers': '02-03-14-41-64', 'mega_ball': 17}
{'date': ' 02/25/2020', 'winning_numbers': '02-09-43-49-63', 'mega_ball': 15}
{'date': ' 02/21/2020', 'winning_numbers': '04-07-13-16-60', 'mega_ball': 6}
{'date': ' 02/18/2020', 'winning_numbers': '06-12-39-61-70', 'mega_ball': 4}
{'date': ' 02/14/2020', 'winning_numbers': '10-32-48-54-55', 'mega_ball': 18}

Am I missing something? My thinking was that the spider would scrape all of the data from every URL.

ktm92, 20 March 2020, 09:19

1 answer

Best answer

I have a few years of experience with Scrapy and I advise against using LinkExtractor here; you can simply write start_requests and yield the requests you want to crawl:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.spiders import CrawlSpider


class MegamillionsSpider(CrawlSpider):
    name = 'megamillions'
    allowed_domains = ['nylottery.ny.gov']

    def start_requests(self):
        # Start the crawl directly from the first results page
        yield Request(url="https://nylottery.ny.gov/mega-millions/past-winning-numbers", callback=self.parse_item)

    def parse_item(self, response):
        for r in response.xpath('//div[@class="accordion-list"]//div[contains(@class,"views-row")]'):
            data = {}
            date = r.xpath('.//*[contains(@class,"result-date")]/text()').get()
            wn = r.xpath('.//span[contains(@class,"numbers")]/text()').get()
            mega_ball = r.xpath('.//p[contains(@class,"bonus-number")]/text()').get()
            data = {
                'date': date,
                'winning_numbers': wn,
                'mega_ball': int(mega_ball)
            }
            print(data)

        nextPage = response.xpath('//a[@title="Go to next page"]/@href').get()
        if nextPage:
            # The pager href may be relative, so build an absolute URL before requesting it
            yield Request(url=response.urljoin(nextPage), callback=self.parse_item)
        else:
            print("%s was last page" % response.url)
Umair Ayub, 20 March 2020, 07:52
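
If the results should be collected rather than just printed, parse_item can yield the item instead. A minimal sketch, assuming the same selectors as in the answer above, so that a feed export such as scrapy crawl megamillions -o results.json picks up every draw:

    def parse_item(self, response):
        for r in response.xpath('//div[@class="accordion-list"]//div[contains(@class,"views-row")]'):
            # Yielding the dict lets Scrapy's feed exports collect it
            yield {
                'date': r.xpath('.//*[contains(@class,"result-date")]/text()').get(),
                'winning_numbers': r.xpath('.//span[contains(@class,"numbers")]/text()').get(),
                'mega_ball': r.xpath('.//p[contains(@class,"bonus-number")]/text()').get(),
            }

        nextPage = response.xpath('//a[@title="Go to next page"]/@href').get()
        if nextPage:
            yield Request(url=response.urljoin(nextPage), callback=self.parse_item)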