"""
Created on Apr 19, 2016

@author: harshitha
"""
import re 
import urllib 
from urllib import urlopen 
from bs4 import BeautifulSoup 
import urllib2 
import csv 
import sys 
print(sys.version_info) 
print(sys.version) 
f = open('/home/harshitha/Documents/house_database.csv', 'w') 

zz=0 

f.write('Zipcode') 
f.write(',') 
f.write('Bedrooms')   
f.write(',') 
f.write('Bathrooms') 
f.write(',') 
f.write('Square_Footage') 
f.write(',') 
f.write('Price_Per_SqFt') 
f.write(',') 
f.write('Lot_Size') 
f.write(',') 
f.write('Stories') 
f.write(',') 
f.write('Property_Type') 
f.write(',') 
f.write('Year_Built') 
f.write(',') 
f.write('MLS') 
f.write(',') 
f.write('Neighborhood') 
f.write(',') 
f.write('County') 
f.write(',') 
f.write('Monthly_Est_Motgage ') 
f.write(',') 
f.write('Monthly_Est_Insurance') 
f.write(',') 
f.write('Last_Updated') 
f.write(',') 
f.write('Last_Sold_Date') 
f.write(',') 
f.write('Last_Sold_Price') 
f.write(',') 
f.write('\n') 

# Get all the zipcodes 
with open('/home/harshitha/Documents/zip.csv','rU') as csvfile: 
reader = csv.DictReader(csvfile) 
for row in reader: 
    zipcode = row['ZipCode'] 
    zz = zz + 1 

    if zz <3789: 
     continue 

#from the link 
    link = 'http://www.homes.com/property/1416-church-st-san-francisco-ca-94131/id-100013343668/'
    html = urlopen(link).read() 
    parsed_html = BeautifulSoup(html) 
    #print parsed_html 

Bedrooms = '--' 
Bathrooms = '--' 
Square_Footage = '--' 
Price_Per_SqFt = '--' 
Lot_Size = '--' 
Stories = '--' 
Property_Type = '--' 
Year_Built = '--' 
MLS = '--' 
Neighborhood = '--' 
County = '--' 
Monthly_Est_Motgage = '--' 
Monthly_Est_Insurance = '--' 
Last_Updated = '--' 
Last_Sold_Date = '--' 
Last_Sold_price = '--' 

try: 
    data = parsed_html.find("dt", text=re.compile("Bedroom(s)")) 
    if data is not None: 
      Bedrooms = data.parent.findNextSibling("dd").text 


    data = parsed_html.find("dt", text=re.compile("Bathroom(s)")) 
    if data is not None: 
      Bathrooms = data.parent.findNextSibling("dd").text 

    data = parsed_html.find("dt", text=re.compile("Square Footage")) 
    if data is not None: 
      Square_Footage = data.parent.findNextSibling("dd").text 

    data = parsed_html.find("dt", text=re.compile("Price Per SqFt ")) 
    if data is not None: 
      Price_Per_SqFt = data.parent.findNextSibling("dd").text 

    data = parsed_html.find("dt", text=re.compile("Lot Size")) 
    if data is not None: 
      Lot_Size = data.parent.findNextSibling("dd").text 

    data = parsed_html.find("dt", text=re.compile("Stories")) 
    if data is not None: 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Stories = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Property Type")) 
    if data is not None: 
      data_field = data.parent.findNext("dt") 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Apt = BeautifulSoup(''.join(data_field[0])).text 
       condo = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("YearBuilt")) 
    if data is not None: 
      Year_Built = data.parent.findNextSibling("dd").text 

    data = parsed_html.find("dt", text=re.compile("MLS")) 
    if data is not None: 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       MLS = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Neighborhood")) 
    if data is not None: 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Neighborhood = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("County")) 
    if data is not None: 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       county = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Monthly Est Mortgage")) 
    if data is not None: 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Monthly_Est_Mortgage =  BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Monthly Est Insurance")) 
    if data is not None: 
      data_field = data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Monthly_Est_Insurance = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Last Updated")) 
    if data is not None: 
      data_field= data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Last_Updated = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Last Sold Date")) 
    if data is not None: 
      data_field= data.parent.findNextSibling("dd") 
      if data_field is not None: 
       Last_Sold_Date = BeautifulSoup(''.join(data_field[0])).text 

    data = parsed_html.find("dt", text=re.compile("Last Sold Price")) 
    if data is not None: 
      data_field1 = data.parent.findNextSibling("dd") 
      if data_field1 is not None: 
       Last_Sold_Price = BeautifulSoup(''.join(data_field1[0])).text 

except: 
print ('') 
f.write(zipcode + ',' + Bedrooms.replace(',','')+ ',' +  Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','')+ ',' + Price_Per_SqFt.replace(',','')+ ',' + Lot_Size(',','') + ',' + Stories.replace(',','')) 

'''f.write(zipcode + ',' + Bedrooms.replace(',','')+ ',' + Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','') \ 
      + ',' + Price_Per_SqFt.replace(',','')+ ',' + Lot_Size(',','') + ',' + Stories.replace(',','') \ 
      + ',' + Property_Type.replace(',','') + ',' +  

Could someone help me? – harshita


Is that last line a commented-out part of the code, or some kind of message? – MarcinWolny


First, I think the indentation is not right. You should fix it by hand or with an editor like Sublime. Second, I think you are trying to collect the data for **every** zipcode in the 'zip.csv' file. Am I right? If that is the case, you are not doing it correctly. The for loop in the with statement keeps updating the 'zipcode' variable, which means that once the for loop has finished, 'zipcode' will only contain the **last** zipcode in 'zip.csv'. – DboyLiao
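
To illustrate the point DboyLiao raises, here is a minimal sketch (reusing the question's 'zip.csv' path and 'ZipCode' column): the loop variable is overwritten on every pass, so any per-zipcode fetching and writing has to happen inside the loop body.

import csv

# Per-zipcode work belongs inside the loop; 'zipcode' is overwritten each pass.
with open('/home/harshitha/Documents/zip.csv', 'rU') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        zipcode = row['ZipCode']
        print(zipcode)               # fetch/parse/write for this zipcode here

# Once the loop finishes, 'zipcode' only holds the last row's value.
print('last zipcode seen: ' + zipcode)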

Answer


Try the following:

""" 
Created on Apr 19, 2016 

@author: harshitha 
""" 

import re 
import urllib 
from urllib import urlopen 
from bs4 import BeautifulSoup 
import urllib2 
import csv 
import sys 
print(sys.version_info) 
print(sys.version) 
f = open('/home/harshitha/Documents/house_database.csv', 'w') 

zz=0 
header = ",".join([ 
'Zipcode', 'Bedrooms', 'Bathrooms', 
'Square_Footage','Price_Per_SqFt', 'Lot_Size', 
'Stories', 'Property_Type', 'Year_Built', 
'MLS', 'Neighborhood', 'County', 
'Monthly_Est_Motgage','Monthly_Est_Insurance', 'Last_Updated', 
'Last_Sold_Date', 'Last_Sold_Price' 
]) 
f.write(header + "\n") 

# Get all the zipcodes 
with open('/home/harshitha/Documents/zip.csv','rU') as csvfile: 
    reader = csv.DictReader(csvfile) 
    for row in reader: 
     zipcode = row['ZipCode'] 
     zz = zz + 1 

     if zz < 3789: 
      continue 

     #from the link 
     link = 'http://www.homes.com/property/1416-church-st-san-francisco-ca-94131/id-100013343668/' 
     html = urlopen(link).read() 
     parsed_html = BeautifulSoup(html) 
     #print parsed_html 

     Bedrooms = '--' 
     Bathrooms = '--' 
     Square_Footage = '--' 
     Price_Per_SqFt = '--' 
     Lot_Size = '--' 
     Stories = '--' 
     Property_Type = '--' 
     Year_Built = '--' 
     MLS = '--' 
     Neighborhood = '--' 
     County = '--' 
     Monthly_Est_Motgage = '--' 
     Monthly_Est_Insurance = '--' 
     Last_Updated = '--' 
     Last_Sold_Date = '--' 
     Last_Sold_price = '--' 

     try: 
      data = parsed_html.find("dt", text=re.compile("Bedroom(s)")) 
      if data is not None: 
       Bedrooms = data.parent.findNextSibling("dd").text 


      data = parsed_html.find("dt", text=re.compile("Bathroom(s)")) 
      if data is not None: 
       Bathrooms = data.parent.findNextSibling("dd").text 

      data = parsed_html.find("dt", text=re.compile("Square Footage")) 
      if data is not None: 
       Square_Footage = data.parent.findNextSibling("dd").text 

      data = parsed_html.find("dt", text=re.compile("Price Per SqFt ")) 
      if data is not None: 
       Price_Per_SqFt = data.parent.findNextSibling("dd").text 

      data = parsed_html.find("dt", text=re.compile("Lot Size")) 
      if data is not None: 
       Lot_Size = data.parent.findNextSibling("dd").text 

      data = parsed_html.find("dt", text=re.compile("Stories")) 
      if data is not None: 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
         Stories = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Property Type")) 
      if data is not None: 
       data_field = data.parent.findNext("dt") 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
        Apt = BeautifulSoup(''.join(data_field[0])).text 
        condo = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("YearBuilt")) 
      if data is not None: 
       Year_Built = data.parent.findNextSibling("dd").text 

      data = parsed_html.find("dt", text=re.compile("MLS")) 
      if data is not None: 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
        MLS = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Neighborhood")) 
      if data is not None: 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
        Neighborhood = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("County")) 
      if data is not None: 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
        county = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Monthly Est Mortgage")) 
      if data is not None: 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
        Monthly_Est_Mortgage =  BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Monthly Est Insurance")) 
      if data is not None: 
       data_field = data.parent.findNextSibling("dd") 
       if data_field is not None: 
        Monthly_Est_Insurance = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Last Updated")) 
      if data is not None: 
       data_field= data.parent.findNextSibling("dd") 
       if data_field is not None: 
        Last_Updated = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Last Sold Date")) 
      if data is not None: 
       data_field= data.parent.findNextSibling("dd") 
       if data_field is not None: 
        Last_Sold_Date = BeautifulSoup(''.join(data_field[0])).text 

      data = parsed_html.find("dt", text=re.compile("Last Sold Price")) 
      if data is not None: 
       data_field1 = data.parent.findNextSibling("dd") 
       if data_field1 is not None: 
        Last_Sold_Price = BeautifulSoup(''.join(data_field1[0])).text 

     except: 
      print ('') 

     f.write(zipcode + ',' + Bedrooms.replace(',','') + ',' + Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','') + ',' + Price_Per_SqFt.replace(',','') + ',' + Lot_Size.replace(',','') + ',' + Stories.replace(',','') + '\n')
f.close() 
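
A possible simplification, not part of the answer above: Python's csv module has a writer that handles delimiters and quoting itself, so the manual ',' joins and .replace(',','') calls would not be needed. A rough sketch under the Python 2 setup used here, reusing the question's output path:

import csv

# csv.writer quotes any field that contains a comma, so values keep their commas.
out = open('/home/harshitha/Documents/house_database.csv', 'wb')   # 'wb' for Python 2's csv module
writer = csv.writer(out)
writer.writerow(['Zipcode', 'Bedrooms', 'Bathrooms', 'Lot_Size'])
writer.writerow(['94131', '3', '2', '5,227 sqft'])   # the comma is preserved inside quotes
out.close()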