Ich crawle Bilder mit Pythons Selenium-Webtreiber (Chrome) und möchte dabei Multiprocessing einsetzen.
Kann ich mehrere Treiber verwenden, sodass jeder Treiber eigene Bilder crawlt?
Ich möchte das Folgende mit Multiprocessing umsetzen.
Quellcode:
def crawl(searchText):
    """Open a Chrome driver, search Google Images for *searchText*, and
    collect the source URLs of the result thumbnails.

    Parameters:
        searchText: query string inserted into the Google Images URL.

    Returns:
        list[str]: image URLs that start with 'http' and end with 'jpg'.
    """
    driver = webdriver.Chrome('C:\\Users\\HYOWON\\Desktop\\Desktop\\Graduation\\Code\\Crawling\\chromedriver.exe')
    searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
    imgs_urls = []  # collected image URLs
    try:
        driver.get(searchUrl)
        for j in range(20):
            # Click the j-th thumbnail so the full-size preview loads.
            # (The original added an always-zero `cnt` to j; removed.)
            element = driver.find_element_by_css_selector(
                "div[data-ri = '" + str(j) + "'] img")
            element.click()
            sleep(1)  # crude wait for the preview to render
            soup = create_soup()
            for img in soup.find_all('img'):
                # Keep only direct http(s) links to .jpg files; tags
                # without a 'src' attribute are simply skipped (the
                # original hid this behind a bare `except: pass`).
                src = img.get('src')
                if src and src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
    finally:
        # Always release the browser, even when a selector or click fails;
        # the original leaked the driver on any exception.
        driver.close()
    return imgs_urls
Geänderte Version:
def crawl():
    """Walk three already-open Chrome drivers (globals driver1..driver3)
    over the Google Images result grid and collect .jpg URLs.

    Returns:
        list[str]: image URLs that start with 'http' and end with 'jpg'.

    NOTE(review): the three drivers still run strictly one after another
    here — each click/wait blocks the whole function.  Real parallelism
    needs the multiprocessing module (one process per driver).
    """
    def collect_from(driver, index):
        """Click thumbnail *index* in *driver*, return its jpg URLs."""
        element = driver.find_element_by_css_selector(
            "div[data-ri = '" + str(index) + "'] img")
        element.click()
        # Bug fix: `WebDriverWait(driver, 1)` alone is a no-op — it only
        # constructs the waiter and never calls .until(), so nothing was
        # being waited for.  Use an explicit pause like the first version.
        sleep(1)
        urls = []
        for img in create_soup(driver).find_all('img'):
            src = img.get('src')
            # keep only direct http(s) links ending in jpg; tags without
            # a 'src' attribute are skipped (was a bare `except: pass`)
            if src and src.startswith('http') and src.endswith('jpg'):
                urls.append(src)
        return urls

    imgs_urls = []
    # Bug fix: `cnt` was never initialized; together with `cnt += 3`
    # below that made it a local read before assignment
    # (UnboundLocalError on the very first iteration).
    cnt = 0
    for _ in range(50):
        # All three drivers click the same data-ri index per round, as in
        # the original; the index then advances by 3.
        for driver in (driver1, driver2, driver3):
            imgs_urls.extend(collect_from(driver, cnt))
        cnt += 3
    return imgs_urls
def download_img(url, filename, dest_dir='C:/Python/'):
    """Download *url* and save it as '<filename>.jpg' under *dest_dir*.

    Parameters:
        url: direct link to the image.
        filename: base name (without extension) for the saved file;
            converted to str, so an integer index works too.
        dest_dir: target directory (must end with a path separator);
            defaults to the previously hard-coded 'C:/Python/' so
            existing callers are unaffected.
    """
    full_name = str(filename) + ".jpg"
    urllib.request.urlretrieve(url, dest_dir + full_name)
# Number the saved files 0.jpg, 1.jpg, ... — the original passed an
# undefined, constant `filename`, so at best every download would have
# overwritten the same file (at worst: NameError).
for index, url in enumerate(crawl()):
    download_img(url, index)
Sie müssen eine echte Multiprocessing-Warteschlange implementieren. Selenium blockiert, das heißt, es hindert Ihren Python-Prozess daran, währenddessen etwas anderes zu tun: Während Treiber 1 eine Seite anfordert, kann Treiber 2 nichts tun, bis Treiber 1 fertig ist. Das lässt sich mit der Multiprocessing-Bibliothek lösen. – eusid