0
Created on Apr 19, 2016
@author: harshitha
import re
import urllib
from urllib import urlopen
from bs4 import BeautifulSoup
import urllib2
import csv
import sys
print(sys.version_info)
print(sys.version)
f = open('/home/harshitha/Documents/house_database.csv', 'w')
zz=0
f.write('Zipcode')
f.write(',')
f.write('Bedrooms')
f.write(',')
f.write('Bathrooms')
f.write(',')
f.write('Square_Footage')
f.write(',')
f.write('Price_Per_SqFt')
f.write(',')
f.write('Lot_Size')
f.write(',')
f.write('Stories')
f.write(',')
f.write('Property_Type')
f.write(',')
f.write('Year_Built')
f.write(',')
f.write('MLS')
f.write(',')
f.write('Neighborhood')
f.write(',')
f.write('County')
f.write(',')
f.write('Monthly_Est_Motgage ')
f.write(',')
f.write('Monthly_Est_Insurance')
f.write(',')
f.write('Last_Updated')
f.write(',')
f.write('Last_Sold_Date')
f.write(',')
f.write('Last_Sold_Price')
f.write(',')
f.write('\n')
# Get all the zipcodes
with open('/home/harshitha/Documents/zip.csv','rU') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
zipcode = row['ZipCode']
zz = zz + 1
if zz <3789:
continue
#from the link
link = 'http://www.homes.com/property/1416-church-st-san-francisco- ca-94131/id-100013343668/'
html = urlopen(link).read()
parsed_html = BeautifulSoup(html)
#print parsed_html
Bedrooms = '--'
Bathrooms = '--'
Square_Footage = '--'
Price_Per_SqFt = '--'
Lot_Size = '--'
Stories = '--'
Property_Type = '--'
Year_Built = '--'
MLS = '--'
Neighborhood = '--'
County = '--'
Monthly_Est_Motgage = '--'
Monthly_Est_Insurance = '--'
Last_Updated = '--'
Last_Sold_Date = '--'
Last_Sold_price = '--'
try:
data = parsed_html.find("dt", text=re.compile("Bedroom(s)"))
if data is not None:
Bedrooms = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Bathroom(s)"))
if data is not None:
Bathrooms = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Square Footage"))
if data is not None:
Square_Footage = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Price Per SqFt "))
if data is not None:
Price_Per_SqFt = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Lot Size"))
if data is not None:
Lot_Size = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("Stories"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Stories = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Property Type"))
if data is not None:
data_field = data.parent.findNext("dt")
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Apt = BeautifulSoup(''.join(data_field[0])).text
condo = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("YearBuilt"))
if data is not None:
Year_Built = data.parent.findNextSibling("dd").text
data = parsed_html.find("dt", text=re.compile("MLS"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
MLS = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Neighborhood"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Neighborhood = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("County"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
county = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Monthly Est Mortgage"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Monthly_Est_Mortgage = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Monthly Est Insurance"))
if data is not None:
data_field = data.parent.findNextSibling("dd")
if data_field is not None:
Monthly_Est_Insurance = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Last Updated"))
if data is not None:
data_field= data.parent.findNextSibling("dd")
if data_field is not None:
Last_Updated = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Last Sold Date"))
if data is not None:
data_field= data.parent.findNextSibling("dd")
if data_field is not None:
Last_Sold_Date = BeautifulSoup(''.join(data_field[0])).text
data = parsed_html.find("dt", text=re.compile("Last Sold Price"))
if data is not None:
data_field1 = data.parent.findNextSibling("dd")
if data_field1 is not None:
Last_Sold_Price = BeautifulSoup(''.join(data_field1[0])).text
except:
print ('')
f.write(zipcode + ',' + Bedrooms.replace(',','')+ ',' + Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','')+ ',' + Price_Per_SqFt.replace(',','')+ ',' + Lot_Size(',','') + ',' + Stories.replace(',',''))
'''f.write(zipcode + ',' + Bedrooms.replace(',','')+ ',' + Bathrooms.replace(',','') + ',' + Square_Footage.replace(',','') \
+ ',' + Price_Per_SqFt.replace(',','')+ ',' + Lot_Size(',','') + ',' + Stories.replace(',','') \
+ ',' + Property_Type.replace(',','') + ',' +
könnte jemand mir – harshita
In einer letzten Zeile helfen ist, dass ein einen Teil eines Codes oder irgendein Botenstück auskommentiert? – MarcinWolny
Erstens, ich denke, die Identität ist nicht richtig angeordnet. Sie sollten es von Hand oder mit einem Editor wie Sublim beheben. Zweitens, ich denke, Sie versuchen, alle Daten für ** jede Postleitzahl in der Datei 'zip.csv' zu finden. Habe ich recht? Wenn das der Fall ist, machst du nicht das Richtige. Die for-Schleife in der with-Anweisung aktualisiert die 'zipcode'-Variable. Das heißt, sobald die for-Schleife beendet ist, wird die 'Postleitzahl'-Variable nur die ** letzte ** Postleitzahl in der' zip.csv' enthalten. – DboyLiao