2016-04-15 3 views
1

Die HTML-Daten, die ich scrapen möchte, sehen wie folgt aus. Sie bestehen aus vielen div-Tags. Wie kann ich mit bs4 die Werte mehrerer verschachtelter div-Elemente auslesen und im JSON-Format ausgeben?

<div id="hour3"> 
    <div id="day0" class="hour3"> 
    <div class="row first"> 
     <div class="label">Time</div> 
     <div style="font-size: 12px;">14:00</div> 
     <div style="font-size: 12px;">17:00</div> 
    </div> 
    <div class="row wd"> 
     <div class="label h3_wd">Temperature</div> 
     <div>27.5℃ </div> 
     <div>27.8℃ </div> 
    </div> 
    <div id="day1" class="hour3"> 
    <div class="row first"> 
     <div class="label">Time</div> 
     <div style="font-size: 12px;">8:00</div> 
     <div style="font-size: 12px;">11:00</div> 
    </div> 
    <div class="row wd"> 
     <div class="label h3_wd">Temperature</div> 
     <div>27.5℃ </div> 
     <div>27.8℃ </div> 
    </div> 

Die Daten sollen so ausgegeben werden:

{day0: [{'Time' : 14:00,'Temperature' : 27.5℃ }], 
     [{'Time' : 17:00,'Temperature' : 27.8℃ }]}, 

{day1: [{'Time' : 8:00,'Temperature' : 27.5℃ }], 
     [{'Time' : 11:00,'Temperature' : 27.8℃ }]} 

Auf der Website sieht es wie eine Tabelle aus:

   day0    day1 

Time   14:00 17:00  08:00 11:00 

Temperature 27.5℃ 27.8℃  27.5℃ 27.8℃ 

Der Code, den ich bisher habe:

import time, re 
import urllib2 
from bs4 import BeautifulSoup 

# Python 2 script (urllib2, print statement): downloads a page and tries to
# collect the time/temperature rows found under the <div id="hour3"> element.

# Wall-clock start time; it is not read again anywhere in this snippet.
start_time = time.time() 

url = 'some url' 
html = urllib2.urlopen(url).read() 
soup = BeautifulSoup(html,'html.parser') 

# Calling a tag (datas(...)) is shorthand for datas.find_all(...).
# The two inner loops visit every "row first" row combined with every
# "row wd" row, and each pass rebinds `result` from scratch, so only the
# last combination survives the loops.
for datas in soup.findAll('div', attrs = {'id':'hour3'}): 
    for dates in datas('div',{'class':'row first'}): 
     for temp in datas('div',{'class':'row wd'}): 

      # .text is the concatenated text of the whole row (label plus every
      # value cell), so each field ends up holding all cells at once rather
      # than a single value. The 'day0' key is hard-coded.
      result = { 
       'day0':[ 
        { 
         'date' : dates.text.strip(), 
         'temperature' : temp.text.strip() 
        } 
       ] 
      } 

# NOTE: `result` is only bound inside the loops; if nothing matched, this
# line raises NameError.
print result 

Und ich erhalte:

{'day0': [{'date': u'Description 1\n  \n\n  14:00\n  \n\n  17:00\n  \n\n  08:00\n  \n\n  11:00\n, 'temperature': 27.5\u2103 \n  \n\n  27.8\u2103 \n  \n\n  27.5\u2103 \n  \n\n  27.8\u2103 \n  \n\n}]} 

Wie kann ich die Daten im Wunschformat bekommen?

Antwort

2
# Sample markup used as parser input. It mirrors the snippet from the
# question, except that the temperature cells carry no unit suffix.
# NOTE: the outer "hour3" div and the "day0"/"day1" container divs are never
# closed in this literal, so "day1" is nested inside "day0" when parsed.
html_doc='''<div id="hour3"> 
    <div id="day0" class="hour3"> 
    <div class="row first"> 
     <div class="label">Time</div> 
     <div style="font-size: 12px;">14:00</div> 
     <div style="font-size: 12px;">17:00</div> 
    </div> 
    <div class="row wd"> 
     <div class="label h3_wd">Temperature</div> 
     <div>27.5 </div> 
     <div>27.8 </div> 
    </div> 
    <div id="day1" class="hour3"> 
    <div class="row first"> 
     <div class="label">Time</div> 
     <div style="font-size: 12px;">8:00</div> 
     <div style="font-size: 12px;">11:00</div> 
    </div> 
    <div class="row wd"> 
     <div class="label h3_wd">Temperature</div> 
     <div>27.5 </div> 
     <div>27.8 </div> 
    </div>''' 
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')

# Maps "day0", "day1", ... to a list of {'time': ..., 'temperature': ...}
# dicts, one per column of that day's table.
result = {}
day_count = 0
for datas in soup.findAll('div', attrs={'id': 'hour3'}):
    # Calling a tag is shorthand for find_all(); rows come back in document
    # order, so the n-th time row and the n-th temperature row belong to the
    # same day.
    time_rows = datas('div', {'class': 'row first'})
    temp_rows = datas('div', {'class': 'row wd'})

    # Pair the rows positionally. Nesting one loop inside the other would
    # combine every time row with every temperature row and leave each day
    # holding the values of the last temperature row in the document.
    for dates, temp in zip(time_rows, temp_rows):
        # split() tokenises the row text on whitespace; [1:] drops the
        # leading label token ("Time" / "Temperature"). This relies on the
        # label being a single word.
        lst_of_time = dates.text.split()[1:]
        lst_of_temp = temp.text.split()[1:]
        result['day' + str(day_count)] = [
            {'time': time_value, 'temperature': temp_value}
            for time_value, temp_value in zip(lst_of_time, lst_of_temp)
        ]
        day_count += 1


# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(result)
1
... 
soup = BeautifulSoup(html, 'html.parser')

# Maps each day container's id attribute (e.g. "day0") to a list of
# {'Time': ..., 'Temperature': ...} dicts, one per column.
result = {}
for day in soup.find_all('div', attrs={'class': 'hour3'}):
    time_row = day.find('div', {'class': 'row first'})
    temp_row = day.find('div', {'class': 'row wd'})
    if time_row is None or temp_row is None:
        # find() returns None when a row is missing; there is nothing to
        # pair for such a container, so skip it instead of crashing.
        continue

    # [1:] - to skip header column (the label cell)
    times = time_row.find_all('div')[1:]
    temps = temp_row.find_all('div')[1:]

    # strip() removes the whitespace the markup leaves around cell values
    # (for example the trailing space after the temperature).
    result[day.get('id')] = [
        {'Time': t.text.strip(), 'Temperature': temp.text.strip()}
        for t, temp in zip(times, temps)
    ]

# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(result)
Verwandte Themen