2016-07-20 16 views
0

Ich versuche, mehr als 200 Links zu analysieren, aber BS4 blieb einfach mit der Verarbeitung. Ich sah das Beautifulsoup findall get stuck without processing, aber das ist anders. Stuck in den zufälligen Orten.Beautifulsoup stecken

import os 
import urllib.request 
from bs4 import BeautifulSoup 
def get_html(url): 
    response = urllib.request.urlopen(url) 
    return response.read() 

def parse(html, url): 
    soup = BeautifulSoup(html, "html.parser") 
    table = soup.find_all('tr', title = "Допущено до конкурсу") 
    if os.path.exists('base/%s.txt' % url[27:]): 
     pass 
    else: 
     abitbase = open('base/%s.txt' % (url[27:]), 'w') 
     for unit in table: 
      collection = unit.find_all('td') 
      position = collection[0].text 
      name = collection[1].text 
      priority = collection[2].text 
      score = collection[3].text 
      abitbase.write('%s %s %s %s \n' % (position, name, priority, score)) 
     abitbase.close() 

def main(): 
    global applicants 
    url_list = open('clist.txt', 'r') 
    for count in range(1, 241): 
     url_s = url_list.readline() 
     if url_s[-1] == '\n': 
      url = url_s[:-1] 
     else: 
      url = url_s 
     parse(get_html(url), url) 
     print('base [%s] saved | %s%s' %(url[27:], (round((count/2.41), 2)), '%')) 

if __name__ == '__main__': 
    applicants = {} 
    main() 

Und Timeouterror:

Traceback (most recent call last): 
    File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open 
    h.request(req.get_method(), req.selector, req.data, headers) 
    File "/usr/lib/python3.4/http/client.py", line 1088, in request 
    self._send_request(method, url, body, headers) 
    File "/usr/lib/python3.4/http/client.py", line 1126, in _send_request 
    self.endheaders(body) 
    File "/usr/lib/python3.4/http/client.py", line 1084, in endheaders 
    self._send_output(message_body) 
    File "/usr/lib/python3.4/http/client.py", line 922, in _send_output 
    self.send(msg) 
    File "/usr/lib/python3.4/http/client.py", line 857, in send 
    self.connect() 
    File "/usr/lib/python3.4/http/client.py", line 834, in connect 
    self.timeout, self.source_address) 
    File "/usr/lib/python3.4/socket.py", line 512, in create_connection 
    raise err 
    File "/usr/lib/python3.4/socket.py", line 503, in create_connection 
    sock.connect(sa) 
TimeoutError: [Errno 110] Connection timed out 

During handling of the above exception, another exception occurred: 

Traceback (most recent call last): 
    File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 58, in <module> 
    main() 
    File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 53, in main 
    parse(get_html(url), url) 
    File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 22, in get_html 
    response = urllib.request.urlopen(url) 
    File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen 
    return opener.open(url, data, timeout) 
    File "/usr/lib/python3.4/urllib/request.py", line 463, in open 
    response = self._open(req, data) 
    File "/usr/lib/python3.4/urllib/request.py", line 481, in _open 
    '_open', req) 
    File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain 
    result = func(*args) 
    File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open 
    return self.do_open(http.client.HTTPConnection, req) 
    File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open 
    raise URLError(err) 
urllib.error.URLError: <urlopen error [Errno 110] Connection timed out> 

Antwort

1

BS4 funktioniert gut, dass meine Schuld.

Ich habe einfach os.path.exists vor parse(get_html(url), url) und es funktioniert gut.

Entschuldigung.