2016-04-28 14 views
-1

Ich habe es endlich geschafft, ein funktionierendes Skript zu schreiben. Es gibt nur noch ein kleines Problem: Ich kann alle Seiten crawlen und alle benötigten Informationen erhalten, außer von der ersten Seite. (Thema: Link Extractor in Scrapy)

Wo ist mein Fehler?

import scrapy.selector 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from Prijsvergelijking.items import PrijsvergelijkingItem 

class MySpider(CrawlSpider):
    """Spider for the television category listing on tvstore.be.

    The single crawl rule follows the link selected by the "next" pagination
    XPath and passes each page reached that way to ``parse_items``.
    """

    name = "coolblue"
    allowed_domains = ["tvstore.be"]
    start_urls = ["http://www.tvstore.be/category/192945/televisies.html"]

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//a[@class="pagination next secondary"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Yield one ``PrijsvergelijkingItem`` per product entry in *response*.

        This method is only wired up as the rule callback, i.e. it runs for
        responses reached through the pagination link.
        """
        listing_xpath = "//li[@class='product-list-columns--item product-list-item']"
        for entry in response.xpath(listing_xpath):
            record = PrijsvergelijkingItem()

            # Product name: text of the link inside the entry's <h2>.
            raw_ref = entry.xpath(".//h2/a/text()").extract_first()
            record["Product_ref"] = raw_ref.strip()

            # Price: comma becomes a dot and any "-" characters are removed.
            raw_price = entry.xpath(".//strong[1]/text()").extract_first()
            cleaned_price = raw_price.strip().replace(",", ".").replace("-", "")
            record["Product_price"] = cleaned_price

            yield record

Antwort

0

Ich habe nicht gründlich genug gesucht.

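Ich habe die Antwort gefunden. Alles, was ich tun musste, war, parse_items in parse_start_url umzubenennen. Ein CrawlSpider ruft für die Antworten der start_urls die Methode parse_start_url auf; der Callback der Regel wird dagegen nur für die vom Link Extractor gefundenen Links ausgeführt.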

from scrapy.spiders import CrawlSpider, Rule 
import scrapy.selector 
from scrapy.linkextractors import LinkExtractor 
from Prijsvergelijking.items import PrijsvergelijkingItem 

class MySpider(CrawlSpider):
    """Spider for the TV product listing on mediamarkt.be.

    ``parse_start_url`` is the hook a CrawlSpider calls for the responses of
    ``start_urls``. Registering the same method as the rule callback means
    the first page and every page reached through the "next" pagination link
    are parsed by the same code.
    """

    name = "msh"
    allowed_domains = ["mediamarkt.be"]
    start_urls = ["http://www.mediamarkt.be/mcs/productlist/_TV,98952,452540.html?langId=-17&searchParams=&sort=&view=&page=1"]

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//li[@class="pagination-next"]',),
            ),
            callback="parse_start_url",
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        """Yield one ``PrijsvergelijkingItem`` per complete product in *response*.

        Product nodes that lack a price or a name are skipped instead of
        raising, so one unexpected node does not discard the remaining
        products of the page.
        """
        for product in response.xpath("//ul[@class='products-list']/li/div"):
            price = product.xpath('.//aside/div/div/div/text()').extract_first()
            ref = product.xpath('.//div/h2/a/text()').extract_first()

            # extract_first() returns None when the XPath matches nothing;
            # calling str methods on it would raise AttributeError.
            if price is None or ref is None:
                self.logger.debug(
                    "Skipping incomplete product node on %s", response.url
                )
                continue

            item = PrijsvergelijkingItem()
            # Strip surrounding whitespace, use a dot as decimal separator
            # and drop any "-" characters.
            item["Product_price"] = price.strip().replace(",", ".").replace("-", "")
            item["Product_ref"] = ref.strip()
            yield item