Ich schreibe ein Programm, um eine chinesische Website zu crawlen, aber es sind einige Probleme aufgetreten: Ich führe das Projekt aus, und nichts passiert — ich weiß nicht, warum. Hier ist der Code. Außerdem: Wie konfiguriert man den User-Agent in Scrapy Version 1.21?
(3) [email protected]:~/L/crawlAll$ tree
.
├── crawlAll
│ ├── __init__.py
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── spiders
│ │ ├── __init__.py
│ │ └── TouTiao.py
│ └── useragent.py
├── LICENSE
├── README.md
└── scrapy.cfg
Die Datei: useragent.py
# -*-coding:utf-8-*-
#import logging
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that rotates the User-Agent header.

    Extends Scrapy's built-in ``UserAgentMiddleware``: for every outgoing
    request, a User-Agent string is chosen at random from
    ``user_agent_list`` and set on the request headers, so successive
    requests present different browser identities to the target site.
    """

    # Pool of desktop-browser User-Agent strings to rotate through.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    def __init__(self, user_agent='Scrapy'):
        """Keep the parent's fallback agent for interface compatibility."""
        super(MyUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent on *request*.

        ``setdefault`` only writes the header when none is present yet,
        so an explicitly supplied User-Agent on a request is respected.
        """
        ua = random.choice(self.user_agent_list)
        if ua:
            # BUG FIX: the original used the Python 2 idiom
            # print("...%s..."), ua — under Python 3 that prints the raw
            # format string and discards ``ua``. Interpolate properly.
            print("******Current User Agent :%s***********" % ua)
            request.headers.setdefault('User-Agent', ua)
Die Datei: TouTiao.py
# -*- coding: utf-8 -*-
import scrapy
import json
import time
from crawlAll.items import NewsSpiderItem
class TouTiaoSpider(scrapy.Spider):
    """Spider that crawls society-news articles from toutiao.com.

    Flow: ``parse`` enumerates listing pages per category,
    ``parseNewsHref`` follows each article link on a listing page, and
    ``parseNews`` extracts title, timestamp, URL and body text into a
    ``NewsSpiderItem``.
    """

    name = "toutiao"
    allowed_domains = ["toutiao.com"]
    start_urls = ['http://www.toutiao.com/articles_news_society/p1/']
    base_cat_url = 'http://www.toutiao.com/articles_news_society'
    base_url = 'http://www.toutiao.com'
    # Highest listing-page number to fetch (pages 1..maxpage inclusive).
    maxpage = 1
    # Category path segments to crawl.
    category = [
        'articles_news_society',
    ]

    def parse(self, response):
        """Yield one request per (category, page) listing URL."""
        for ctg in self.category:
            # BUG FIX: range(1, maxpage) is EMPTY when maxpage == 1, so the
            # spider scheduled no requests at all ("nothing happened").
            # Include maxpage itself.
            for page in range(1, self.maxpage + 1):
                # BUG FIX: ``page`` is an int; concatenating it to a str
                # raised TypeError. Convert explicitly.
                url = self.base_url + '/' + ctg + '/p' + str(page)
                yield scrapy.Request(url, callback=self.parseNewsHref)

    def parseNewsHref(self, response):
        """Follow every article link found on a listing page."""
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            new_url = self.base_url + url
            yield scrapy.Request(new_url, callback=self.parseNews)

    def parseNews(self, response):
        """Extract one NewsSpiderItem from an article page.

        Yields nothing when title, time or body paragraphs are missing.
        """
        articles = response.xpath("//div[@id='article-main']")
        item = NewsSpiderItem()
        # ROBUSTNESS FIX: the original indexed ``.extract()[0]`` before its
        # emptiness check, raising IndexError on pages missing these nodes.
        # Keep the full lists and guard first.
        title = articles.xpath("//h1/text()").extract()
        tm = articles.xpath("//span[@class='time']/text()").extract()
        content = articles.xpath("//div[@class='article-content']//p/text()").extract()
        if title and tm and content:
            item['title'] = title[0]
            # Site timestamps look like '2017-01-31 12:34'; store epoch secs.
            item['time'] = int(time.mktime(time.strptime(tm[0], '%Y-%m-%d %H:%M')))
            item['url'] = response.url
            # Each paragraph followed by a newline, matching the original
            # accumulation ``cc = cc + c + '\n'``.
            item['content'] = '\n'.join(content) + '\n'
            yield item
settings.py
# Scrapy project settings for the crawlAll project.
BOT_NAME = 'crawlAll'
SPIDER_MODULES = ['crawlAll.spiders']
NEWSPIDER_MODULE = 'crawlAll.spiders'
# Do not fetch/honour robots.txt before crawling.
ROBOTSTXT_OBEY = False
# Seconds to wait between requests to the same site (politeness throttle).
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
# Enable the custom random User-Agent middleware (priority 400).
'crawlAll.useragent.MyUserAgentMiddleware':400,
# NOTE(review): crawlAll/middlewares.py does not appear in the project
# tree shown above; the None value disables this entry, so Scrapy never
# imports it — the line is dead and could be removed. Verify before doing so.
'crawlAll.middlewares.MyCustomDownloaderMiddleware': None
}
Aber das funktioniert nicht. Kann jemand dieses Problem lösen? Vielen Dank!
Vielen Dank, es war mein Fehler; dieses Problem wurde mit Ihrer Hilfe bereits gelöst. Viel Glück für Sie! –