2017-08-16 3 views
0

Ich versuche, eine paginierte Liste von Catalog zu durchsuchen; das funktioniert gut. (Titel: Scrapy – wie crawlt man Paginierung auf der zweiten Ebene bzw. verschachtelte Paginierung?)

Aber für jedeCatalog gibt es eine paginierte Liste derDataSet aber nur erste Seite dort im Ergebnis erscheint. Ich versuche, ein Ergebnis zu erhalten, das wie folgt aussieht, aber alle 24 Knoten sollten dort entsprechend 24 DataSets sein, die sich über Seiten mit je 6 Elementen erstrecken.

[{'data_sets_count': 24, 
    'description': 'The catalog contains data regarding various indicators of ' 
       'HMIS like Health, Abortions, Immunisation, AEFI, Adolescent, ' 
       'Bite, Sting, Disease, Diarrhoeal, Hypertension, HIV, AIDS, ' 
       'Malaria, Neurological, Stroke, Fever, Respiratory, ' 
       'Infection, suicide, Trauma, Accident, Burn, Tuberculosis, ' 
       'VHND, ASHA, JSY, CHC, PHC, SDH, DH, Hospital.', 
    'last_updated': '11/08/17', 
    'ministry_department': 'Ministry of Health and Family Welfare, Department of ' 
         'Health and Family Welfare', 
    'nodes': [{'node': '3183861', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'April-2014-15'}, 
      {'node': '3183881', 
      'title': 'Item-wise report for North Goa of Goa upto May-2014-15'}, 
      {'node': '3183981', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'October-2014-15'}, 
      {'node': '3184021', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'December-2014-15'}, 
      {'node': '3184061', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'February-2014-15'}, 
      {'node': '3183961', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'September-2014-15'}], 
    'state_department': None, 
    'title': 'HMIS sub district level item-wise monthly report of Goa', 
    'url': '/catalog/hmis-sub-district-level-item-wise-monthly-report-goa'}] 

import scrapy 
class Category(scrapy.Item):
    """Item holding one catalog entry scraped from data.gov.in.

    Populated by CatalogSpider.parse from the catalog listing page;
    ``nodes`` is appended to across the catalog's paginated dataset
    pages in CatalogSpider.parseDataSets.
    """
    # Catalog title (link text of the catalog entry).
    title = scrapy.Field()
    # Relative URL of the catalog detail page (href of the title link).
    url = scrapy.Field()
    ministry_department = scrapy.Field()
    description = scrapy.Field()
    state_department = scrapy.Field()
    last_updated = scrapy.Field()
    # Advertised dataset count, parsed from the "(N)" text on the listing page.
    data_sets_count = scrapy.Field()
    # NOTE(review): data_sets and item are never assigned in the visible
    # spider code — presumably leftovers; confirm before removing.
    data_sets = scrapy.Field()
    item = scrapy.Field()
    # List of {'node': ..., 'title': ...} dicts, one per dataset.
    nodes = scrapy.Field()

class CatalogSpider(scrapy.Spider):
    """Crawl the paginated catalog listing on data.gov.in and, for each
    catalog, follow its paginated dataset list, accumulating all dataset
    nodes into a single Category item.
    """
    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        """Parse one page of the catalog listing.

        Yields one Request per catalog (handled by parseDataSets, with
        the partially-filled Category carried in request.meta) and
        follows the catalog-level pagination.
        """
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            # Count is rendered as "(N)"; pull the number out of the parens.
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            request = scrapy.Request(response.urljoin(category['url']), callback=self.parseDataSets)
            request.meta['item'] = category
            yield request

        for next_page in response.css('li.pager-next > a'):
            yield response.follow(next_page, self.parse)

    def parseDataSets(self, response):
        """Parse one page of a catalog's dataset list.

        Appends this page's dataset nodes to the Category carried in
        response.meta.

        BUG FIX: the original built the next-page Request but never
        yielded it, so only the first page of datasets was ever crawled;
        it also yielded the still-incomplete item once per page. Now the
        pagination request is yielded when a next page exists, and the
        item itself is yielded exactly once, on the last page.
        """
        item = response.meta['item']

        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            item['nodes'].append({
                # First CSS class of the download link encodes the node id.
                'node': dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0],
                'title': dataset.css('.views-field-title .field-content .title-content::text').extract_first()
            })

        next_page = response.css('li.pager-next a::attr(href)').extract_first()
        if next_page:
            request = scrapy.Request(response.urljoin(next_page), callback=self.parseDataSets)
            request.meta['item'] = item
            yield request
        else:
            # Last page: item now holds the nodes from every page.
            yield item
+0

Könnten Sie das Crawl-Protokoll posten? Sie können es über 'scrapy crawl spider --logfile output.log' oder 'scrapy crawl spider 2>&1 | tee output.log' erzeugen (der zweite Befehl schreibt die Ausgabe gleichzeitig auf den Bildschirm und in eine Datei). – Granitosaurus

+0

@Granitosaurus Ich habe es gerade mit einigen Änderungen im Code arbeiten lassen, ich poste den Arbeitscode jetzt, aber nicht sicher, ob es der * richtige * Weg ist, es zu tun. – sabithpocker

+0

Ich füge die Elemente jeder Unterseite zu einer Meta-Variable hinzu und liefere dabei None; erst auf der letzten Seite liefere ich die Meta-Variable selbst. Klingt ein bisschen hacky, funktioniert aber jetzt. – sabithpocker

Antwort

0

Ich habe es mit dem folgenden Code zum Laufen gebracht, bin aber nicht sicher, ob das der richtige Weg ist. Ich füge jedes DataSet zur Meta-Variable category hinzu und liefere dabei None; erst auf der letzten Seite liefere ich die Meta-Variable category selbst. Klingt ein bisschen hacky, funktioniert aber jetzt.

import scrapy 
class Category(scrapy.Item):
    """Item holding one catalog entry scraped from data.gov.in.

    Populated by CatalogSpider.parse from the catalog listing page;
    ``nodes`` is appended to across the catalog's paginated dataset
    pages in CatalogSpider.parse_data_sets.
    """
    # Catalog title (link text of the catalog entry).
    title = scrapy.Field()
    # Relative URL of the catalog detail page (href of the title link).
    url = scrapy.Field()
    ministry_department = scrapy.Field()
    description = scrapy.Field()
    state_department = scrapy.Field()
    last_updated = scrapy.Field()
    # Advertised dataset count, parsed from the "(N)" text on the listing page.
    data_sets_count = scrapy.Field()
    # Number of nodes actually collected (len(nodes)), set on the last page.
    data_sets_actual_count = scrapy.Field()
    # NOTE(review): data_sets and item are never assigned in the visible
    # spider code — presumably leftovers; confirm before removing.
    data_sets = scrapy.Field()
    item = scrapy.Field()
    # List of {'node': ..., 'title': ..., 'url': ...} dicts, one per dataset.
    nodes = scrapy.Field()

class CatalogSpider(scrapy.Spider):
    """Crawl the paginated catalog listing on data.gov.in and, for each
    catalog, follow its paginated dataset list, accumulating all dataset
    nodes into a single Category item that is yielded exactly once, on
    the catalog's last dataset page.
    """
    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        """Parse one page of the catalog listing.

        Yields one Request per catalog (handled by parse_data_sets,
        with the partially-filled Category carried in request.meta).
        """
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            # Count is rendered as "(N)"; pull the number out of the parens.
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            request = scrapy.Request(response.urljoin(category['url']), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request

        # Catalog-level pagination was intentionally disabled in the
        # original answer; re-enable by uncommenting.
        #for next_page in response.css('li.pager-next > a'):
        #    yield response.follow(next_page, self.parse)

    def parse_data_sets(self, response):
        """Parse one page of a catalog's dataset list.

        Appends this page's dataset nodes to the Category carried in
        response.meta, follows the dataset-level pagination, and yields
        the finished Category only when there is no next page.

        FIXES over the original answer:
        - dropped the per-dataset ``yield None`` (useless output);
        - the Category is no longer yielded early (and possibly twice)
          when a page happens to contain no datasets — it is yielded
          exactly once, on the last page, with data_sets_actual_count set.
        """
        category = response.meta['category']

        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            # First CSS class of the download link encodes the node id.
            node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0]
            category['nodes'].append({
                'node': node,
                'title': dataset.css('.views-field-title .field-content .title-content::text').extract_first(),
                'url': 'https://data.gov.in/node/' + node + '/download',
            })

        next_page = response.css('li.pager-next a::attr(href)').extract_first()
        if next_page:
            request = scrapy.Request(response.urljoin(next_page), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request
        else:
            # Last page: record how many nodes were actually collected
            # (comparable against the advertised data_sets_count).
            category['data_sets_actual_count'] = len(category['nodes'])
            yield category

Eines meiner Probleme war eine falsch eingestellte Crawl-Tiefe (-d) in meinem Befehl, die ich später auf eine größere Zahl geändert habe – so etwas verursacht gelegentlich Probleme, wenn man sich in unbekanntem Terrain bewegt:

scrapy parse --spider=catalogspider -d 60 'https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'