Folgendes ist der Code, den ich verwenden, um Produktinformationen zu scraben. Es gibt viele Produkte auf einer Seite. Ich kratze sie alle und gehe dann zur nächsten Seite. Das Problem ist, dass der Scrapy nur das erste Produkt auf einer Seite auswählt, anstatt über alle Produkte auf einer Seite zu iterieren. Wo gehe ich falsch?Scraping verschiedener Produkte Informationen mit scrapy
import re
import time
import sys
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import parsedatetime
from datetime import datetime
from airline_sentiment.items import *
from airline_sentiment.spiders.crawlerhelper import *
class TripAdvisorRestaurantBaseSpider(BaseSpider):
name = "shoebuy"
allowed_domains = ["shoebuy.com"]
base_uri = "http://www.shoebuy.com"
start_urls = [
base_uri + "/womens-leather-boots/category_2493?cm_sp=cat-_-d_womensboots_tiles_b1_leather-_-092216"
]
def parse(self, response):
sel = Selector(response)
snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product\")]')
for snode_restaurant in snode_airline:
tripadvisor_item = AirlineSentimentItem()
tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()'))
tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()'))
tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()'))
tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()'))
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)
next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href'))
if next_page_url and len(next_page_url) > 0:
yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page)
def parse_next_page(self, response):
sel = Selector(response)
snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product")]')
for snode_restaurant in snode_airline:
tripadvisor_item = AirlineSentimentItem()
tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()'))
tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()'))
tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()'))
tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()'))
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)
next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href'))
if next_page_url and len(next_page_url) > 0:
yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page)
def parse_fetch_review(self, response):
tripadvisor_item = response.meta['tripadvisor_item']
sel = Selector(response)
snode_reviews = sel.xpath('//*[starts-with(@class, "product_info_wrapper")]')
for snode_review in snode_reviews:
tripadvisor_item['img'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"large_thumb")]/img/@src'))
tripadvisor_item['desc'] = clean_parsed_string(get_parsed_string(snode_review, '//*[starts-with(@class,"product_information")]/div[1]/span/text()'))
tripadvisor_item['brand'] = clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"seo_module")]/h3/text()'))
yield tripadvisor_item
Das hat perfekt funktioniert. Vielen Dank. Auch bekomme ich keine Sterne und Bewertungen Wert (ich bekomme keine). Ich bin mir nicht sicher, warum der XPath, den ich gegeben habe, nicht funktioniert. Es wäre großartig, wenn ich eine Lösung dafür finden könnte. –
@NeelShah es passiert, weil die Sterne und Bewertungen von einigen Javascriptaufrufen (Ajax) erzeugt werden und Scrapy kein Javascript ausführt. Vielleicht sollten Sie dafür ein neues Thema aufschlagen, da es nichts mit dem aktuellen zu tun hat. – Granitosaurus