Ich versuche, ein Element mit der Abfrage soup.find_all("li", {"class" : "first_result"}) auszuwählen.
Warum findet Beautiful Soup dieses Element mit mehreren Klassen nicht?
Das Element, das wie <li class="result first_result"> aussieht, ist auf jeden Fall auf der Seite vorhanden,
aber es wird nicht gefunden, wenn ich mein Skript ausführe. Der Vollständigkeit halber habe ich auch soup.find_all("li", {"class" : "result first_result"})
versucht, aber immer noch ohne Ergebnis.
Was mache ich falsch?
edit: Auf Anfrage von alecxe habe ich den Code angefügt, den ich bisher habe. Ich arbeite unter 64-Bit Windows 7 mit Python 3.4, was – da bin ich mir sicher – der Schuldige ist. Der spezifische Teil, für den ich diese Frage gestellt habe, befindet sich ganz unten unter ###METACRITIC STUFF###
from bs4 import BeautifulSoup
from urllib3 import poolmanager
import csv
import requests
import sys
import os
import codecs
import re
import html5lib
import math
import time
from random import randint
# urllib3 connection pool; nothing in the visible part of this script uses it.
connectBuilder = poolmanager.PoolManager()
# The search phrase must arrive as ONE command-line argument.
inputstring = sys.argv[1] #argv string MUST use double quotes
# Raw string: '\s' inside a normal literal is an invalid escape sequence.
inputarray = re.split(r'\s+', inputstring)
##########################KAT STUFF########################
# Search words joined with '+'; only the "ALL kat" URL variant below needs it.
katstring = "+".join(inputarray)
#kataddress = "https://kat.cr/usearch/?q=" + katstring #ALL kat
kataddress = "https://kat.cr/usearch/" + inputstring + " category:tv/?field=seeders&sorder=desc" #JUST TV kat
# Running tallies that getdata() appends to, one entry per table row.
numSeedsArray = []
numLeechArray = []
r = requests.get(kataddress)
soup = BeautifulSoup(r.content, "html5lib")
# The <span> of the first <h2> holds a string that looks like
# 'house of cards results 1-25 from 178'.
totalpages = soup.findAll('h2')[0].find('span').text
# Take the last number in that text (the total result count). A fixed
# four-character slice fails for totals below 100 ('om 5', 'm 78') and
# truncates totals with five or more digits.
totalpages = int(re.findall(r'\d+', totalpages.replace(',', ''))[-1])
# 25 results per page; round up so a final partial page is still counted.
totalpages = math.ceil(totalpages/25)
# Paging state shared with getnextpage().
iteration=0
savedpage = ""
def getdata(url):
    """Download ``url`` and add its seed/leech counts to the global tallies.

    Every ``<td class="green center">`` cell is parsed as an integer and
    appended to ``numSeedsArray``; every ``<td class="red lasttd center">``
    cell goes to ``numLeechArray`` the same way. Returns nothing.
    """
    page = BeautifulSoup(requests.get(url).content, "html5lib")

    # Build each list completely before extending, so a cell that fails
    # int() leaves the corresponding tally untouched.
    seed_cells = page.findAll("td", {"class": "green center"})
    numSeedsArray.extend([int(cell.text) for cell in seed_cells])

    leech_cells = page.findAll("td", {"class": "red lasttd center"})
    numLeechArray.extend([int(cell.text) for cell in leech_cells])
def getnextpage(url):
    """Return the absolute URL of the results page that follows ``url``.

    Downloads ``url``, reads the active pager button to learn the current
    page number, and builds the address of page ``current + 1``.

    Reads the module-level ``iteration`` counter and reads/updates the
    module-level ``savedpage`` (the href prefix in front of the page number).

    Raises IndexError when the page has no pager links or no active pager
    button, and ValueError when the active button's text is not an integer.
    """
    global savedpage
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    pagerlinks = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton" })
    nextpagelinks = [link.get('href') for link in pagerlinks]
    activepage = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton active" })
    currentpagenum = activepage[0].text

    # Presumably every pager href ends with the 27-character query string
    # "/?field=seeders&sorder=desc" that the search URL was built with.
    querysuffix = str(nextpagelinks[0][-27:])

    # Only on the first hop (single-digit page, iteration <= 1) is the first
    # pager link known to carry a one-digit page number, so only then can the
    # prefix be cut off at a fixed offset. It is remembered for later hops.
    if len(currentpagenum)==1 and iteration<=1:
        savedpage = str(nextpagelinks[0][:-28])

    # One code path for every page-number length. Previously a page number
    # with three or more digits matched no branch and left the result unbound.
    nextpage = savedpage+str(int(currentpagenum)+1)+querysuffix
    # Undo the two percent-escapes that appear in the href.
    nextpage = nextpage.replace('%20', ' ').replace('%3A', ':')
    return "https://kat.cr"+nextpage
# Scrape the first result pages and print the combined seeder+leecher count.
# At most two pages are fetched, and always at least the first one.
# Previously a search with totalpages < 2 looped on
# `iteration < totalpages-1`, which is never true, so nothing was scraped
# and the script printed 0.
pagestofetch = min(max(totalpages, 1), 2)
while iteration < pagestofetch:
    getdata(kataddress)
    iteration+=1
    # Only resolve the next page's address when it will actually be fetched;
    # this avoids one wasted request after the last page.
    if iteration < pagestofetch:
        kataddress = getnextpage(kataddress)
print(str(sum(numLeechArray)+sum(numSeedsArray)))
def getgoogdata(title):
    """Print the integer tokens found in Google's result-stats line for ``title``.

    Spaces in ``title`` become '+', the search page is downloaded, and the
    text of ``<div id="resultStats">`` is read. Its last 14 characters and
    all commas are removed, then every whole-number token in what remains
    is printed as a list of strings. Returns nothing.
    """
    query = title.replace(' ', '+')
    url = 'https://www.google.com/search?q=' + query + '&ie=utf-8&oe=utf-8'
    page = BeautifulSoup(requests.get(url).content, "html5lib")
    stats_text = page.find("div", {"id": "resultStats"}).text
    # Trim the fixed-length tail first, then drop thousands separators.
    without_commas = stats_text[:-14].replace(',', '')
    print(re.findall(r'\b\d+\b', without_commas))


getgoogdata(inputstring)
####################METACRITIC STUFF#########################
# Re-join the split search words with single spaces.
metainputstring = " ".join(inputarray)
metacriticaddress = "http://www.metacritic.com/search/tv/" + metainputstring + "/results"
print (metacriticaddress)
# NOTE(review): presumably Metacritic answers the default python-requests
# User-Agent with an error page instead of the search results, which would
# explain why the <li> is visible in a browser but missing here. Confirm.
# A browser-like User-Agent is sent for that reason.
r = requests.get(
    metacriticaddress,
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"},
)
# Fail loudly on an HTTP error instead of parsing an error page into [].
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
# Searching for a single class name also matches elements that carry
# additional classes, e.g. <li class="result first_result">.
# The CSS-selector form is soup.select("li.result.first_result").
first_result = soup.find_all("li", attrs={"class" : "first_result"})
print(first_result)
Könnten Sie den kompletten Code posten, den Sie bisher haben? – alecxe
Sicher, kein Problem –