How to configure the user agent in Scrapy 1.2.1

I am writing a program to crawl a Chinese website, but I have run into a problem: when I run the project, nothing happens, and I don't know why. Here is the code.

$ tree 
. 
├── crawlAll 
│   ├── __init__.py 
│   ├── items.py 
│   ├── pipelines.py 
│   ├── settings.py 
│   ├── spiders 
│   │   ├── __init__.py 
│   │   └── TouTiao.py 
│   └── useragent.py 
├── LICENSE 
├── README.md 
└── scrapy.cfg 

The file: useragent.py

# -*- coding: utf-8 -*-
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class MyUserAgentMiddleware(UserAgentMiddleware):
    """Sets a random User-Agent from user_agent_list on every request."""

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent='Scrapy'):
        super(MyUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # The original print had the argument outside the parentheses,
            # so ua was never interpolated; fixed with % formatting.
            print("******Current User Agent: %s***********" % ua)
            # setdefault only writes the header if no User-Agent is set yet.
            request.headers.setdefault('User-Agent', ua)
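A quick way to sanity-check this middleware outside a full crawl is to instantiate it directly and feed it a dummy request. This is only a sketch; it assumes it is run from the project root so that crawlAll.useragent is importable:

# Standalone check for MyUserAgentMiddleware (illustrative only).
from scrapy.http import Request
from crawlAll.useragent import MyUserAgentMiddleware

mw = MyUserAgentMiddleware()
req = Request('http://www.toutiao.com')
mw.process_request(req, spider=None)  # spider is unused by the method, so None is fine

# Scrapy stores header values as bytes.
print(req.headers['User-Agent'])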

The file: TouTiao.py

# -*- coding: utf-8 -*-
import time

import scrapy

from crawlAll.items import NewsSpiderItem


class TouTiaoSpider(scrapy.Spider):
    name = "toutiao"
    allowed_domains = ["toutiao.com"]
    start_urls = ['http://www.toutiao.com/articles_news_society/p1/']
    base_cat_url = 'http://www.toutiao.com/articles_news_society'
    base_url = 'http://www.toutiao.com'

    maxpage = 1
    category = [
        'articles_news_society',
    ]

    def parse(self, response):
        # Build one listing-page request per category.
        for ctg in self.category:
            for page in range(1, self.maxpage):
                url = self.base_url + '/' + ctg + '/p' + str(page)
            yield scrapy.Request(url, callback=self.parseNewsHref)

    def parseNewsHref(self, response):
        # Follow every article link found on a listing page.
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            new_url = self.base_url + url
            yield scrapy.Request(new_url, callback=self.parseNews)

    def parseNews(self, response):
        # Extract title, publication time and body text from one article.
        articles = response.xpath("//div[@id='article-main']")
        item = NewsSpiderItem()
        title = articles.xpath("//h1/text()").extract()[0]
        tm = articles.xpath("//span[@class='time']/text()").extract()[0]
        content = articles.xpath("//div[@class='article-content']//p/text()").extract()

        if len(title) != 0 and len(tm) != 0 and len(content) != 0:
            item['title'] = title
            item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M')))
            item['url'] = response.url
            # Join the paragraphs once and yield a single item per article.
            item['content'] = '\n'.join(content)
            yield item
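When a spider seems to "do nothing", the selectors can also be checked interactively before touching the code. A session with scrapy shell against the listing URL (assuming it is reachable) would look like this:

$ scrapy shell 'http://www.toutiao.com/articles_news_society/p1/'
>>> # Should print the article hrefs the spider follows; an empty list
>>> # means the XPath (or the page markup) is the problem, not the spider.
>>> response.xpath("//div[@class='info']//a/@href").extract()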

The file: settings.py

BOT_NAME = 'crawlAll'
SPIDER_MODULES = ['crawlAll.spiders']
NEWSPIDER_MODULE = 'crawlAll.spiders'
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'crawlAll.useragent.MyUserAgentMiddleware': 400,
    'crawlAll.middlewares.MyCustomDownloaderMiddleware': None,
}
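One detail worth knowing about this setup: Scrapy's built-in UserAgentMiddleware runs at order 500 and also uses setdefault on the header, so the custom middleware at order 400 wins here. Still, a common variant of the settings above is to disable the built-in one explicitly so the two cannot interact:

DOWNLOADER_MIDDLEWARES = {
    'crawlAll.useragent.MyUserAgentMiddleware': 400,
    # Disable Scrapy's default user-agent middleware so only the random
    # one above ever touches the User-Agent header.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}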

But it does not work. Can anyone solve this problem? Thanks a lot!

Answer


In your parse function no request URL is ever built, because the inner for loop is never entered: with maxpage = 1, range(1, self.maxpage) is range(1, 1), which is empty, so url is undefined when the yield runs. Remove the for loop or increase maxpage.

def parse(self, response):
    for ctg in self.category:
        # Only the first listing page; the page loop is gone.
        url = self.base_url + '/' + ctg + '/p1'
        yield scrapy.Request(url, callback=self.parseNewsHref)
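The other option, keeping the page loop and raising maxpage, would look roughly like this; note the str(page) conversion and the yield moved inside the loop so that every page is actually requested:

maxpage = 5  # hypothetical value; requests pages p1 through p5

def parse(self, response):
    for ctg in self.category:
        for page in range(1, self.maxpage + 1):
            url = self.base_url + '/' + ctg + '/p' + str(page)
            yield scrapy.Request(url, callback=self.parseNewsHref)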

Thank you, it was my mistake; the problem is solved with your help. Good luck to you!