import scrapy
import re
from scrapy import Request
from urllib import parse

from SpiderDemo.items import SpiderdemoItem


class CeicSpider(scrapy.Spider):
    """Spider for the China Earthquake Networks Center quick-search pages.

    Starts at the speed-search listing (quakes within the period selected
    by the ``time`` query parameter), follows each result row to its
    detail page, and yields one SpiderdemoItem per earthquake.
    """

    name = 'ceic'
    # Only the bare hostname belongs here: OffsiteMiddleware compares the
    # request host against these entries, so a path/query string here
    # would cause every request to be filtered as off-site.
    allowed_domains = ['www.ceic.ac.cn']
    # The original URL had a stray trailing '/' after the query value
    # ('time=6/'), which sends a different parameter value to the server.
    start_urls = ['http://www.ceic.ac.cn/speedsearch?time=6']

    def parse(self, response):
        """Follow every result link to its detail page, then paginate."""
        post_nodes = response.xpath('//*[@align="left"]/a/@href').extract()
        for post_node in post_nodes:
            yield Request(url=parse.urljoin(response.url, post_node),
                          callback=self.parse_detail,
                          dont_filter=True)

        # li[last()-1] is the "next page" entry of the pager.
        # extract_first() returns None instead of raising IndexError
        # when the pager is absent or we are on the last page.
        next_url = response.xpath(
            '//*[@id="paging"]/div/div/ul/li[last()-1]/a/@href'
        ).extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse,
                          dont_filter=True)

    def parse_detail(self, response):
        """Parse one earthquake detail page into a SpiderdemoItem.

        The data sit at fixed odd indices (1..11) of the
        //*[@valign="middle"] text nodes — even indices are the labels.
        Assumes the detail-page table layout is stable; TODO confirm
        against a live page if the site changes.
        """
        # Run the XPath once instead of six times over the same response.
        cells = response.xpath('//*[@valign="middle"]/text()').extract()

        spider_item = SpiderdemoItem()
        spider_item["time"] = cells[1].strip()
        spider_item["latitude"] = cells[3].replace("°", "")   # drop degree sign
        spider_item["longitude"] = cells[5].replace("°", "")  # drop degree sign
        spider_item["depth"] = cells[7].replace("千米", "")    # drop "km" unit
        spider_item["level"] = cells[9]
        spider_item["area"] = cells[11]
        return spider_item