Mit Ihrer Beispiel-URL können wir alle URLs aus dem HeadRowMenu
abrufen und in einer Schleife alle Überschriften (h1–h6) von jeder Seite extrahieren.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = "http://dsv.su.se/en"
base = "http://dsv.su.se"
def crawl(start, base):
r = requests.get(start)
soup = BeautifulSoup(r.content, "lxml")
hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
menu_links = [urljoin(base, a["href"]) for a in soup.select("#HeadRowMenu a")][1:]
for h in hs:
yield soup.find_all(h)
for lnk in menu_links:
soup = BeautifulSoup(requests.get(lnk).content)
for h in hs:
yield soup.find_all(h)
Wenn wir es ausführen:
In [17]: print(list(chain.from_iterable(crawl(url, base))))
[<h1 class="visuallyhidden">Department of Computer and Systems Sciences</h1>, <h1>
<a href="/en/about/news/improve-your-digital-competences-on-line-with-eskills-match-1.278510">Improve your digital competences on-line with eSkills Match</a>
</h1>, <h1>
<a href="/en/about/news/envisioning-equality-in-computer-science-tomorrow-today-1.272045">Envisioning Equality in Computer Science - Tomorrow Today</a>
</h1>, <h1>
<a href="/en/about/news/egovlab-develops-online-democracy-1.271839">eGovlab develops online democracy</a>
</h1>, <h1>
<a href="/en/about/events/vinnova-and-dsv-invite-you-to-a-seminar-about-horizon-2020-1.266104">Vinnova and DSV invite you to a seminar about Horizon 2020</a>
</h1>, <h1>
<a href="/en/about/news/significant-increase-of-applicants-for-international-programmes-1.265744">Significant increase of applicants for international programmes</a>
</h1>, <h1>News</h1>, <h2>Semester start information</h2>, <h2>Meet our students</h2>, <h1 class="visuallyhidden">Education</h1>, <h1>Welcome to the education web at DSV!</h1>, <h1>Master's Programmes at DSV</h1>, <h2>
Master's Programmes in English:</h2>, <h1 class="visuallyhidden">Research</h1>, <h1>Research highlights</h1>, <h2>Research news</h2>, <h1 class="visuallyhidden">About us</h1>, <h1>About DSV</h1>, <h2>Sweden's oldest IT department</h2>, <h2>Interdisciplinary education and research</h2>, <h2>Right in the middle of one of the world's leading ICT clusters</h2>, <h1 class="visuallyhidden">Internal</h1>, <h1>Internal</h1>, <h2>Semester start information</h2>, <h2>Meet our students</h2>]
Wenn Sie buchstäblich jeden Link auf der Website crawlen wollen, sollten Sie sich Scrapy ansehen. Trivial ist das nicht, denn Sie können nicht einfach blind jeden gefundenen Link besuchen — das könnte Sie irgendwohin führen und in Endlosschleifen enden. Sie müssen sicherstellen, dass Sie nur die gewünschte Domain besuchen, was mit Scrapy ganz einfach geht. Werfen Sie einen Blick auf den CrawlSpider.
So implementieren Sie Ihre eigene Variante:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
class Crawl:
def __init__(self, start_url, allowed, base, select):
self.start_url = start_url
self.base = base
self.allowed_domain = allowed
self.crawled = set()
self.select = select
def start(self):
r = requests.get(self.start_url)
soup = BeautifulSoup(r.content, "lxml")
menu_links = [urljoin(self.base, a["href"]) for a in soup.select(self.select)]
for lnk in menu_links:
yield from self.crawl(lnk)
def crawl(self, lnk):
r = requests.get(lnk)
soup = BeautifulSoup(r.content, "lxml")
hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
page_links = (a["href"] for a in soup.select("a[href]"))
joined = (urljoin(base, lnk) if lnk.startswith("/en/") else lnk for lnk in page_links)
for lnk in filter(lambda link: link.startswith("http"), joined):
if lnk not in self.crawled:
soup = BeautifulSoup(requests.get(lnk).content,"lxml")
for h in hs:
yield soup.find_all(h)
self.crawled.add(lnk)
Ein Probelauf:
In [2]: from itertools import chain
In [3]: url = "http://dsv.su.se/en"
In [4]: base = "http://dsv.su.se"
In [5]: crawler = Crawl(url, "dsv.su.se", base, "#HeadRowMenu a")
In [6]: for h in chain.from_iterable(crawler.start()):
...: print(h)
...:
<h1 class="visuallyhidden">Institutionen för data- och systemvetenskap</h1>
<h1>
<a href="/omdsv/evenemang/dsv-50-%C3%A5r-digitala-aff%C3%A4rer-%C3%B6ppet-jubileumsseminarium-1.274298">*DSV 50 år* - Digitala affärer - öppet jubileumsseminarium </a>
</h1>
<h1>
<a href="/omdsv/nyheter/premi%C3%A4r-f%C3%B6r-vandringsdramat-exil-fria-poeter-p%C3%A5-flykt-1.278502">Premiär för vandringsdramat Exil - fria poeter på flykt</a>
</h1>
<h1>
<a href="/omdsv/nyheter/nu-b%C3%B6r-det-st%C3%A5-klart-att-n%C3%A5got-m%C3%A5ste-g%C3%B6ras-1.277680">Nu bör det stå klart att något måste göras </a>
</h1>
<h1>
<a href="/omdsv/nyheter/hur-enkelt-%C3%A4r-det-f%C3%B6r-fbi-att-kn%C3%A4cka-en-iphone-utan-apples-hj%C3%A4lp-1.277546">Hur enkelt är det för FBI att knäcka en Iphone utan Apples hjälp?</a>
</h1>
<h1>
<a href="/omdsv/nyheter/1-av-2-vill-l%C3%A5ta-staten-hacka-sig-in-i-datorer-1.277367">Svårt att backa tillbaka från ökad övervakning</a>
</h1>
<h1>Senaste nyheterna</h1>
<h2 class="category">Kommande evenemang</h2>
<h2>Information inför terminsstart</h2>
<h1 class="visuallyhidden">Other languages</h1>
<h1>Other languages</h1>
<h2>
Information in Chinese and Russian</h2>
<h2>Contact The Administration of Studies</h2>
<h1 class="visuallyhidden">Department of Computer and Systems Sciences</h1>
<h1>
<a href="/en/about/news/improve-your-digital-competences-on-line-with-eskills-match-1.278510">Improve your digital competences on-line with eSkills Match</a>
</h1>
<h1>
<a href="/en/about/news/envisioning-equality-in-computer-science-tomorrow-today-1.272045">Envisioning Equality in Computer Science - Tomorrow Today</a>
</h1>
<h1>
<a href="/en/about/news/egovlab-develops-online-democracy-1.271839">eGovlab develops online democracy</a>
</h1>
<h1>
<a href="/en/about/events/vinnova-and-dsv-invite-you-to-a-seminar-about-horizon-2020-1.266104">Vinnova and DSV invite you to a seminar about Horizon 2020</a>
</h1>
<h1>
<a href="/en/about/news/significant-increase-of-applicants-for-international-programmes-1.265744">Significant increase of applicants for international programmes</a>
</h1>
<h1>News</h1>
<h2>Semester start information</h2>
<h2>Meet our students</h2>
...................................
Wenn Sie tiefer gehen wollen, müssen Sie natürlich mehr Logik hinzufügen: alle gefundenen Links in einer Datenstruktur speichern und so lange weiterarbeiten, bis sie leer ist. Etwa so:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from time import sleep
class Crawl:
def __init__(self, start_url, allowed, base, select):
self.start_url = start_url
self.base = base
self.allowed_domain = allowed
self.crawled = set()
self.select = select
self.urls = set()
def start(self):
r = requests.get(self.start_url)
soup = BeautifulSoup(r.content, "lxml")
menu_links = [urljoin(self.base, a["href"]) for a in soup.select(self.select)]
print(menu_links)
for lnk in menu_links:
yield from self.crawl(lnk)
def filter_urls(self, soup):
page_links = [a["href"] for a in soup.select("a[href]")]
joined = (urljoin(base, lnk) if lnk.startswith("/en/") else lnk for lnk in page_links)
return set(filter(lambda lnk: self.allowed_domain in lnk, joined))
def crawl(self, lnk):
r = requests.get(lnk)
soup = BeautifulSoup(r.content, "lxml")
hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
self.urls.update(self.filter_urls(soup))
while self.urls:
nxt = self.urls.pop()
if nxt not in self.crawled:
try:
soup = BeautifulSoup(requests.get(nxt).content, "lxml")
except requests.exceptions.RequestException as e:
print(e.strerror)
self.crawled.add(nxt)
continue
self.urls.update((self.filter_urls(soup) - self.crawled))
for h in hs:
yield soup.find_all(h)
self.crawled.add(nxt)
sleep(.1)
Das besucht jeden Link auf der Website, der dsv.su.se in der URL enthält.
Aber seien Sie gewarnt: Es gibt sehr viele Links, stellen Sie sich also darauf ein, eine ganze Weile zu warten.