„Fehler beim Empfang einer Kontrollmeldung (SocketClosed): leerer Socket-Inhalt" in Tors Stem-Controller

Ich arbeite an einem Scraper, der Tor verwendet; eine vereinfachte Version davon ist in diesem Beispiel zu finden: https://github.com/khpeek/scraper-compose. Das Projekt hat die folgende (vereinfachte) Struktur:
.
├── docker-compose.yml
├── privoxy
│ ├── config
│ └── Dockerfile
├── scraper
│ ├── Dockerfile
│ ├── requirements.txt
│ ├── tutorial
│ │ ├── scrapy.cfg
│ │ └── tutorial
│ │ ├── extensions.py
│ │ ├── __init__.py
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ ├── spiders
│ │ │ ├── __init__.py
│ │ │ └── quotes_spider.py
│ │ └── tor_controller.py
│ └── wait-for
│ └── wait-for
└── tor
├── Dockerfile
└── torrc
Der Spider, der in quotes_spider.py
definiert ist, ist sehr einfach gehalten und basiert auf dem Scrapy-Tutorial:
import scrapy
from tutorial.items import QuoteItem
class QuotesSpider(scrapy.Spider):
    """Minimal spider (based on the Scrapy tutorial) that collects quotes
    from the first two pages of quotes.toscrape.com."""

    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/page/{n}/'.format(n=n) for n in range(1, 3)]
    custom_settings = {
        'TOR_RENEW_IDENTITY_ENABLED': True,
        'TOR_ITEMS_TO_SCRAPE_PER_IDENTITY': 5
    }
    # Scrapy randomizes the actual delay between 0.5x and 1.5x this value.
    download_delay = 2

    def parse(self, response):
        """Yield one QuoteItem per quote block found on the page."""
        for quote_sel in response.css('div.quote'):
            entry = QuoteItem()
            entry['text'] = quote_sel.css('span.text::text').extract_first()
            entry['author'] = quote_sel.css('small.author::text').extract_first()
            entry['tags'] = quote_sel.css('div.tags a.tag::text').extract()
            yield entry
In settings.py
habe ich eine Scrapy-Extension mit den folgenden Zeilen aktiviert:
# Register the identity-renewal extension. The value (1) is only the
# extension's load-order priority, not a weight or a setting.
EXTENSIONS = {
    'tutorial.extensions.TorRenewIdentity': 1,
}
wobei extensions.py
wie folgt aussieht:
import logging
import random
from scrapy import signals
from scrapy.exceptions import NotConfigured
import tutorial.tor_controller as tor_controller
logger = logging.getLogger(__name__)


class TorRenewIdentity(object):
    """Scrapy extension that pauses the engine and requests a fresh Tor
    identity (new exit IP) after a randomized number of scraped items.

    Enabled via the TOR_RENEW_IDENTITY_ENABLED setting; the base item count
    comes from TOR_ITEMS_TO_SCRAPE_PER_IDENTITY (default 50).
    """

    def __init__(self, crawler, item_count):
        self.crawler = crawler
        # Randomize the item threshold to confound traffic analysis.
        self.item_count = self.randomize(item_count)
        self._item_count = item_count  # Base count kept for later re-randomizations.
        self.items_scraped = 0
        # Count every scraped item via Scrapy's item_scraped signal.
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @staticmethod
    def randomize(item_count, min_factor=0.5, max_factor=1.5):
        '''Return a randomized item threshold in
        [min_factor*item_count, max_factor*item_count], clamped to at least 1.
        (A similar technique is applied to Scrapy's DOWNLOAD_DELAY setting.)

        The clamp matters: for small item counts (e.g. 1) randint can yield 0,
        and a zero threshold would mean the identity is never renewed again,
        since the counter is checked only after being incremented past 0.
        '''
        randomized_item_count = max(1, random.randint(int(min_factor * item_count), int(max_factor * item_count)))
        logger.info("The crawler will scrape the following (randomized) number of items before changing identity (again): {}".format(randomized_item_count))
        return randomized_item_count

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy extension factory; raises NotConfigured when disabled."""
        if not crawler.settings.getbool('TOR_RENEW_IDENTITY_ENABLED'):
            raise NotConfigured
        item_count = crawler.settings.getint('TOR_ITEMS_TO_SCRAPE_PER_IDENTITY', 50)
        return cls(crawler=crawler, item_count=item_count)

    def item_scraped(self, item, spider):
        '''When at least item_count items are scraped, pause the engine and change IP address.'''
        self.items_scraped += 1
        # Use >= rather than ==: if the threshold ever ends up at or below the
        # current counter value, an exact-equality check would never fire and
        # the identity would never be renewed again.
        if self.items_scraped >= self.item_count:
            logger.info("Scraped {item_count} items. Pausing engine while changing identity...".format(item_count=self.item_count))
            self.crawler.engine.pause()
            # Change IP address (cf. https://stem.torproject.org/faq.html#how-do-i-request-a-new-identity-from-tor)
            tor_controller.change_identity()
            self.items_scraped = 0  # Reset the counter
            # Pick a new randomized threshold for the next identity cycle.
            self.item_count = self.randomize(self._item_count)
            self.crawler.engine.unpause()
und tor_controller.py
wie folgt aussieht:
import logging
import sys
import socket
import time
import requests
import stem
import stem.control
# Tor settings
# NOTE: this DNS lookup runs at import time; importing this module fails if
# the "tor" hostname is not resolvable (i.e. outside the Compose network).
TOR_ADDRESS = socket.gethostbyname("tor")  # The Docker-Compose service in which this code is running should be linked to the "tor" service.
TOR_CONTROL_PORT = 9051  # This is configured in /etc/tor/torrc by the line "ControlPort 9051" (or by launching Tor with "tor -controlport 9051")
TOR_PASSWORD = "foo"  # The Tor password is written in the docker-compose.yml file. (It is passed as a build argument to the 'tor' service).

# Privoxy settings
PRIVOXY_ADDRESS = "privoxy"  # This assumes this code is running in a Docker-Compose service linked to the "privoxy" service
PRIVOXY_PORT = 8118  # This is determined by the "listen-address" in Privoxy's "config" file
HTTP_PROXY = 'http://{address}:{port}'.format(address=PRIVOXY_ADDRESS, port=PRIVOXY_PORT)

logger = logging.getLogger(__name__)
class TorController(object):
    """Context manager bundling a stem control-port connection with a
    requests session that is proxied through Privoxy.

    On exit, the control-port connection is closed; stem then logs
    INFO-level "SocketClosed" messages from its reader thread, which are
    expected and harmless.
    """

    def __init__(self):
        # Connect and authenticate against Tor's control port.
        self.controller = stem.control.Controller.from_port(address=TOR_ADDRESS, port=TOR_CONTROL_PORT)
        self.controller.authenticate(password=TOR_PASSWORD)
        # HTTP requests go through Privoxy so they exit via Tor.
        self.session = requests.Session()
        self.session.proxies = {'http': HTTP_PROXY}

    def request_ip_change(self):
        """Ask Tor for a new circuit (and hence, usually, a new exit IP)."""
        self.controller.signal(stem.Signal.NEWNYM)

    def get_ip(self):
        '''Check what the current IP address is (as seen by IPEcho).'''
        return self.session.get('http://ipecho.net/plain').text

    def change_ip(self):
        '''Signal a change of IP address and wait for confirmation from IPEcho.net'''
        current_ip = self.get_ip()
        logger.debug("Initializing change of identity from the current IP address, {current_ip}".format(current_ip=current_ip))
        self.request_ip_change()
        # Poll IPEcho once per second until the reported address differs.
        new_ip = self.get_ip()
        while new_ip == current_ip:
            logger.debug("The IP address is still the same. Waiting for 1 second before checking again...")
            time.sleep(1)
            new_ip = self.get_ip()
        logger.debug("The IP address has been changed from {old_ip} to {new_ip}".format(old_ip=current_ip, new_ip=new_ip))
        return new_ip

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.controller.close()
def change_identity():
    """Open a short-lived TorController, rotate the exit IP, then close the
    control connection (closing it triggers stem's harmless INFO messages)."""
    with TorController() as controller:
        controller.change_ip()
Wenn ich den Crawl mit docker-compose build
gefolgt von docker-compose up
starte, funktioniert die Erweiterung im Großen und Ganzen: Laut den Protokollen ändert sie die IP-Adresse erfolgreich und setzt das Scraping fort.
Was mich jedoch stört, ist, dass ich während der Zeit, in der die Engine pausiert ist, Fehlermeldungen sehe wie
scraper_1 | 2017-05-12 16:35:06 [stem] INFO: Error while receiving a control message (SocketClosed): empty socket content
gefolgt von
scraper_1 | 2017-05-12 16:35:06 [stem] INFO: Error while receiving a control message (SocketClosed): received exception "peek of closed file"
Was verursacht diese Fehler? Da sie nur die INFO-
Ebene haben, kann ich sie vielleicht einfach ignorieren? (Ich habe mir den Quellcode von Stem unter https://gitweb.torproject.org/stem.git/ angesehen, konnte aber bisher nicht nachvollziehen, was passiert.)