0
Ich versuche, mehr als 200 Links zu analysieren, aber BS4 blieb einfach mit der Verarbeitung. Ich sah das Beautifulsoup findall get stuck without processing, aber das ist anders. Stuck in den zufälligen Orten.Beautifulsoup stecken
import os
import urllib.request
from bs4 import BeautifulSoup
def get_html(url):
response = urllib.request.urlopen(url)
return response.read()
def parse(html, url):
soup = BeautifulSoup(html, "html.parser")
table = soup.find_all('tr', title = "Допущено до конкурсу")
if os.path.exists('base/%s.txt' % url[27:]):
pass
else:
abitbase = open('base/%s.txt' % (url[27:]), 'w')
for unit in table:
collection = unit.find_all('td')
position = collection[0].text
name = collection[1].text
priority = collection[2].text
score = collection[3].text
abitbase.write('%s %s %s %s \n' % (position, name, priority, score))
abitbase.close()
def main():
global applicants
url_list = open('clist.txt', 'r')
for count in range(1, 241):
url_s = url_list.readline()
if url_s[-1] == '\n':
url = url_s[:-1]
else:
url = url_s
parse(get_html(url), url)
print('base [%s] saved | %s%s' %(url[27:], (round((count/2.41), 2)), '%'))
if __name__ == '__main__':
applicants = {}
main()
Und Timeouterror:
Traceback (most recent call last):
File "/usr/lib/python3.4/urllib/request.py", line 1182, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1088, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1126, in _send_request
self.endheaders(body)
File "/usr/lib/python3.4/http/client.py", line 1084, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.4/http/client.py", line 922, in _send_output
self.send(msg)
File "/usr/lib/python3.4/http/client.py", line 857, in send
self.connect()
File "/usr/lib/python3.4/http/client.py", line 834, in connect
self.timeout, self.source_address)
File "/usr/lib/python3.4/socket.py", line 512, in create_connection
raise err
File "/usr/lib/python3.4/socket.py", line 503, in create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 58, in <module>
main()
File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 53, in main
parse(get_html(url), url)
File "/home/maxlagerz/PycharmProjects/AbitLogger/main.py", line 22, in get_html
response = urllib.request.urlopen(url)
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 463, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 481, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 441, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1210, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1184, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 110] Connection timed out>