Если проблема в том, что одновременно посещается слишком много URL-адресов, их можно посещать по одному, используя сигнал spider_idle (https://docs.scrapy.org/en/latest/topics/signals.html).
Идея состоит в следующем:
1. start_requests посещает только первый URL-адрес
2. когда паук простаивает, вызывается метод spider_idle
3. метод spider_idle удаляет первый URL и посещает второй URL
4. и так далее...
Код будет примерно таким (я не пробовал):
class SlfSpider1Spider(CrawlSpider):
    """Crawl gipfelbuch.ch listing pages strictly one after another.

    Only the first entry of ``start_urls`` is requested at start-up. Each time
    the engine signals that the spider is idle, the entry that was just
    processed is dropped and the next one is scheduled, so two listing pages
    are never requested at the same time.

    The surrounding module is expected to provide the names used here:
    ``scrapy``, ``signals`` (``from scrapy import signals``) and
    ``CrawlSpider`` (``from scrapy.spiders import CrawlSpider``).
    """

    name = 'slf_spider1'
    custom_settings = {'CONCURRENT_REQUESTS': '1'}
    allowed_domains = ['gipfelbuch.ch']
    # NOTE(review): this lists only pages 1 and 650. If every page in between
    # is wanted, range(1, 651) is presumably what was meant -- confirm.
    start_urls = ['https://www.gipfelbuch.ch/gipfelbuch/touren/seite/' + str(i) for i in [1, 650]]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and subscribe it to the ``spider_idle`` signal."""
        spider = super(SlfSpider1Spider, cls).from_crawler(crawler, *args, **kwargs)
        # spider_idle() consumes start_urls; work on a per-instance copy so the
        # class-level list is not mutated and shared between instances.
        spider.start_urls = list(spider.start_urls)
        # Register the method the crawler runs whenever the spider gets idle.
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def start_requests(self):
        """Yield a request for the first start URL only.

        The remaining URLs are scheduled one at a time from ``spider_idle``.
        """
        if not self.start_urls:
            # Nothing to crawl; avoid an IndexError on the lookup below.
            return
        url = self.start_urls[0]
        print('#### START REQUESTS: ', url)
        yield scrapy.Request(url, callback=self.parse_verhaeltnisse, dont_filter=True)

    def parse_verhaeltnisse(self, response):
        """Follow the first two links found inside table cells of a listing page."""
        links = response.xpath('//td//@href').extract()
        for link in links[:2]:
            print('##### PARSING: ', link)
            abs_link = 'https://www.gipfelbuch.ch/' + link
            yield scrapy.Request(abs_link, callback=self.parse_gipfelbuch_item, dont_filter=True)

    def parse_gipfelbuch_item(self, response):
        """Print the label/value pairs extracted from a single item page."""
        route = response.xpath('/html/body/main/div[4]/div[@class="col_f"]//div[@class="togglebox cont_item mt"]//div[@class="label_container"]')
        print('#### PARSER OUTPUT: ')
        # Map each container's label text to the text of its content div.
        fields = {
            node.xpath('string(./label)').extract()[0]:
                node.xpath('string(div[@class="label_content"])').extract()[0]
            for node in route
        }
        print('Route: ', fields['Gipfelname'])
        print('Comments: ', fields['Verhältnis-Beschreibung'])
        print('Length of dict extracted from Route: {}'.format(len(route)))

    def spider_idle(self, spider):
        """Drop the start URL that was just processed and schedule the next one.

        Connected to the ``spider_idle`` signal in ``from_crawler``. When no
        start URL is left, nothing is scheduled.
        """
        if self.start_urls:
            del self.start_urls[0]
        if self.start_urls:
            url = self.start_urls[0]
            # NOTE(review): newer Scrapy releases reportedly deprecate the
            # second ``spider`` argument of engine.crawl() -- confirm against
            # the installed version.
            self.crawler.engine.crawl(
                scrapy.Request(url, callback=self.parse_verhaeltnisse, dont_filter=True),
                spider,
            )