2017-01-23 5 views
1

Ich habe versucht, die Listendaten aus jedem Dropdown-Menü auf dieser Seite zu sammeln. Ich kann auf die li-Tags zugreifen und die "href"-Daten mit Selenium und Python 3.6 auslesen. Das Problem ist aber, dass ich den Text der einzelnen Listeneinträge nicht bekomme.

Probleme beim Extrahieren von "Text" mit Selenium Python

Mein Code ist unten:

from selenium import webdriver 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.common.exceptions import StaleElementReferenceException 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.support.ui import Select 
from selenium.common.exceptions import NoSuchElementException 

from bs4 import BeautifulSoup 
from time import sleep 


# Open the listing page in a PhantomJS browser session.
link = 'http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I'
driver = webdriver.PhantomJS()
driver.set_window_size(1920, 1080)
driver.get(link)
sleep(.75)  # fixed pause after navigation before the page is read

# Parsed copy of the page source; nothing in this snippet reads it afterwards.
soup = BeautifulSoup(driver.page_source, "html.parser", from_encoding='utf-8')

# Build one (text, href) pair per anchor in the manufacturer dropdown,
# leaving out any anchor whose text equals '전체'.
# NOTE(review): in the reported run `.text` was '' for every anchor, so the
# filter never matched -- presumably because the dropdown list is not
# displayed; verify against the Selenium WebElement.text documentation.
manufacturers = []
for anchor in driver.find_elements_by_css_selector("#layer_maker ul.list li a"):
    if anchor.text == '전체':
        continue
    manufacturers.append(
        ('%s' % anchor.text, '%s' % anchor.get_attribute('href'))
    )

# Print each collected pair on its own line.
for manufacturer in manufacturers:
    print(manufacturer)

Mein Ergebnis ist unten:

('', "javascript:selChange('maker', '0', '%EC%A0%84%EC%B2%B4');") 
('', "javascript:selChange('maker', '1', 'BMW');") 
('', "javascript:selChange('maker', '21', '%EB%B2%A4%EC%B8%A0');") 
('', "javascript:selChange('maker', '32', '%EC%95%84%EC%9A%B0%EB%94%94');") 
('', "javascript:selChange('maker', '44', '%ED%8F%AD%EC%8A%A4%EB%B0%94%EA%B2%90');") 
('', "javascript:selChange('maker', '13', '%EB%A0%89%EC%84%9C%EC%8A%A4');") 
('', "javascript:selChange('maker', '97', '%EB%AF%B8%EB%8B%88');") 
('', "javascript:selChange('maker', '2', 'GM');") 
('', "javascript:selChange('maker', '77', 'GMC');") 
('', "javascript:selChange('maker', '5', '%EB%8B%9B%EC%82%B0');") 
('', "javascript:selChange('maker', '6', '%EB%8B%A4%EC%9D%B4%ED%95%98%EC%93%B0');") 
('', "javascript:selChange('maker', '7', '%EB%8B%B7%EC%A7%80');") 
('', "javascript:selChange('maker', '9', '%EB%8F%84%EC%9A%94%ED%83%80');") 
('', "javascript:selChange('maker', '10', '%EB%9E%80%EC%B9%98%EC%95%84');") 
('', "javascript:selChange('maker', '11', '%EB%9E%8C%EB%B3%B4%EB%A5%B4%EA%B8%B0%EB%8B%88');") 
('', "javascript:selChange('maker', '12', '%EB%9E%9C%EB%93%9C%EB%A1%9C%EB%B2%84');") 
('', "javascript:selChange('maker', '14', '%EB%A1%9C%EB%B2%84');") 
('', "javascript:selChange('maker', '15', '%EB%A1%9C%ED%84%B0%EC%8A%A4');") 
('', "javascript:selChange('maker', '16', '%EB%A1%A4%EC%8A%A4%EB%A1%9C%EC%9D%B4%EC%8A%A4');") 
('', "javascript:selChange('maker', '61', '%EB%A5%B4%EB%85%B8');") 
('', "javascript:selChange('maker', '17', '%EB%A7%81%EC%BB%A8');") 
('', "javascript:selChange('maker', '18', '%EB%A7%88%EC%84%B8%EB%9D%BC%ED%8B%B0');") 
('', "javascript:selChange('maker', '19', '%EB%A7%88%EC%AF%94%EB%8B%A4');") 
('', "javascript:selChange('maker', '1003', '%EB%A7%A5%EB%9D%BC%EB%A0%8C');") 
('', "javascript:selChange('maker', '60', '%EB%A8%B8%ED%81%90%EB%A6%AC');") 
('', "javascript:selChange('maker', '20', '%EB%AF%B8%EC%93%B0%EB%B9%84%EC%8B%9C');") 
('', "javascript:selChange('maker', '82', '%EB%AF%B8%EC%AF%94%EC%98%A4%EC%B9%B4');") 
('', "javascript:selChange('maker', '22', '%EB%B2%A4%ED%8B%80%EB%A6%AC');") 
('', "javascript:selChange('maker', '23', '%EB%B3%BC%EB%B3%B4');") 
('', "javascript:selChange('maker', '1009', '%EB%B6%81%EA%B8%B0%EC%9D%80%EC%83%81');") 
('', "javascript:selChange('maker', '88', '%EB%B6%80%EA%B0%80%ED%8B%B0');") 
('', "javascript:selChange('maker', '24', '%EB%B7%B0%EC%9D%B5');") 
('', "javascript:selChange('maker', '99', '%EB%B9%84%EC%9D%B4%EC%8A%A4%EB%A7%8C');") 
('', "javascript:selChange('maker', '25', '%EC%82%AC%EB%B8%8C');") 
('', "javascript:selChange('maker', '94', '%EC%83%88%ED%84%B4');") 
('', "javascript:selChange('maker', '29', '%EC%89%90%EB%B3%B4%EB%A0%88');") 
('', "javascript:selChange('maker', '27', '%EC%8A%A4%EB%B0%94%EB%A3%A8');") 
('', "javascript:selChange('maker', '28', '%EC%8A%A4%EC%A6%88%ED%82%A4');") 
('', "javascript:selChange('maker', '103', '%EC%8A%A4%EC%B9%B4%EB%8B%88%EC%95%84');") 
('', "javascript:selChange('maker', '93', '%EC%8A%A4%ED%8C%8C%EC%9D%B4%EC%BB%A4');") 
('', "javascript:selChange('maker', '30', '%EC%8B%9C%ED%8A%B8%EB%A1%9C%EC%97%A5');") 
('', "javascript:selChange('maker', '33', '%EC%95%8C%ED%8C%8C%EB%A1%9C%EB%A9%94%EC%98%A4');") 
('', "javascript:selChange('maker', '62', '%EC%95%A0%EC%8A%A4%ED%84%B4%EB%A7%88%ED%8B%B4');") 
('', "javascript:selChange('maker', '95', '%EC%96%B4%ED%81%90%EB%9D%BC');") 
('', "javascript:selChange('maker', '34', '%EC%98%A4%ED%8E%A0');") 
('', "javascript:selChange('maker', '1011', '%EC%98%A4%EC%8A%A4%ED%8B%B4');") 
('', "javascript:selChange('maker', '35', '%EC%98%AC%EC%A6%88%EB%AA%A8%EB%B9%8C');") 
('', "javascript:selChange('maker', '83', '%EC%9B%A8%EC%8A%A4%ED%8A%B8%ED%95%84%EB%93%9C');") 
('', "javascript:selChange('maker', '36', '%EC%9D%B4%EC%8A%A4%EC%A6%88');") 
('', "javascript:selChange('maker', '81', '%EC%9D%B8%ED%94%BC%EB%8B%88%ED%8B%B0');") 
('', "javascript:selChange('maker', '37', '%EC%9E%AC%EA%B7%9C%EC%96%B4');") 
('', "javascript:selChange('maker', '96', '%EC%A7%80%ED%94%84');") 
('', "javascript:selChange('maker', '1006', '%ED%85%8C%EC%8A%AC%EB%9D%BC');") 
('', "javascript:selChange('maker', '38', '%EC%BA%90%EB%94%9C%EB%9D%BD');") 
('', "javascript:selChange('maker', '89', '%EC%BD%94%EB%8B%89%EC%84%B8%ED%81%AC');") 
('', "javascript:selChange('maker', '39', '%ED%81%AC%EB%9D%BC%EC%9D%B4%EC%8A%AC%EB%9F%AC');") 
('', "javascript:selChange('maker', '84', '%ED%8C%8C%EA%B0%80%EB%8B%88');") 
('', "javascript:selChange('maker', '41', '%ED%8E%98%EB%9D%BC%EB%A6%AC');") 
('', "javascript:selChange('maker', '42', '%ED%8F%AC%EB%93%9C');") 
('', "javascript:selChange('maker', '43', '%ED%8F%AC%EB%A5%B4%EC%89%90');") 
('', "javascript:selChange('maker', '1008', '%ED%8F%AC%ED%86%A4');") 
('', "javascript:selChange('maker', '45', '%ED%8F%B0%ED%8B%B0%EC%95%85');") 
('', "javascript:selChange('maker', '46', '%ED%91%B8%EC%A1%B0');") 
('', "javascript:selChange('maker', '91', '%ED%94%BC%EC%8A%A4%EC%BB%A4');") 
('', "javascript:selChange('maker', '47', '%ED%94%BC%EC%95%84%ED%8A%B8');") 
('', "javascript:selChange('maker', '48', '%ED%97%88%EB%A8%B8');") 
('', "javascript:selChange('maker', '50', '%ED%98%BC%EB%8B%A4');") 
('', "javascript:selChange('maker', '76', '%ED%99%80%EB%8D%B4');") 
('', "javascript:selChange('maker', '4', '%EA%B8%B0%ED%83%80 %EC%88%98%EC%9E%85%EC%B0%A8');") 

Dies ist ein Screenshot des HTML-Quelltextes: enter image description here

Ich verstehe nicht, warum der Textteil leer ist und warum die koreanischen Zeichen nicht korrekt dargestellt werden (der koreanische Text ist das dritte Argument in javascript:selChange). Ich möchte erreichen, dass der Textteil gefüllt ist und die koreanischen Zeichen korrekt angezeigt werden.

Bitte helfen.

Antwort

1

Versuche, den folgenden Code zu verwenden:

from urllib import parse 

... 
# Build (label, href) pairs for the manufacturer dropdown.
# The label is read with get_attribute('text') instead of `.text`, and the
# href is passed through urllib.parse.unquote so that percent-escapes such as
# %EB%B2%A4%EC%B8%A0 are turned back into the Korean characters they encode.
# The '전체' ("all") entry is skipped.
manufacturers = [
    (o.get_attribute('text'), parse.unquote(o.get_attribute('href')))
    for o
    in driver.find_elements_by_css_selector("#layer_maker ul.list li a")
    if o.get_attribute('text') != '전체']

# Print each pair on its own line.
for manufacturer in manufacturers:
    print(manufacturer)

Ausgabe:

('BMW', "javascript:selChange('maker', '1', 'BMW');") 
('벤츠', "javascript:selChange('maker', '21', '벤츠');") 
('아우디', "javascript:selChange('maker', '32', '아우디');") 
('폭스바겐', "javascript:selChange('maker', '44', '폭스바겐');") 
... 
+0

Danke, Andersson. Ich habe es endlich herausgefunden!! Ich weiß deine Hilfe sehr zu schätzen. Vielen Dank!! –

Verwandte Themen