"""Scrape recently sold New York listings from Zillow into a CSV file.

The script walks the paginated "sold" search results, opens every listing
that is not already present in ``nyc_housing_sales.csv`` and appends one row
per sold (non-auction) property.
"""
import random
import re
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

CHROMEDRIVER_PATH = r'C:\Users\ymc\Downloads\chromedriver_win32 (1)\chromedriver.exe'
CSV_PATH = "nyc_housing_sales.csv"
SEARCH_URL = 'https://www.zillow.com/new-york-ny/sold/{}_p'

# NOTE: 'year_built ' carries a trailing space. It is kept byte-for-byte so
# that rows stay aligned with CSV files written by earlier runs.
COLUMNS = ['id', 'address', 'zip', 'city', 'price', 'date_sold',
           'building_type', 'year_built ', 'bedroom', 'bathroom', 'parking',
           'hoa', 'sqft']

CAPTCHA_XPATH = '/html/body/main/div/div/h5'
CAPTCHA_TEXT = "Please verify you're a human to continue."

# Home-fact labels as they appear in the listing text -> local field name.
FACT_FIELDS = {
    'BuildingType': 'building_type',
    'CalendarYear built': 'year_built',
    'ParkingParking': 'parking',
}

_AMOUNT_RE = re.compile(r'\d[\d,]*(?:\.\d+)?')
_ZPID_RE = re.compile(r'/(\d+)_zpid')


def _new_driver():
    """Start a Chrome session using the configured chromedriver binary."""
    try:
        return webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    except TypeError:
        # Newer Selenium 4 releases no longer accept ``executable_path``;
        # they expect the driver location wrapped in a Service object.
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))


def captcha(driver=None):
    """Pause for manual solving if *driver* is showing the captcha page.

    The check is best-effort: when the heading element is absent (the normal
    case) or cannot be read, the function returns silently. Called without a
    driver it does nothing.
    """
    if driver is None:
        return
    try:
        heading = driver.find_element(By.XPATH, CAPTCHA_XPATH).text
    except WebDriverException:
        return
    if heading == CAPTCHA_TEXT:
        print('Solving Captcha...')
        input('press any key to continue')


def _listing_id(link):
    """Return the numeric id embedded in a listing URL (``.../<id>_zpid/``).

    Raises:
        ValueError: if the URL contains no ``<digits>_zpid`` path segment.
    """
    match = _ZPID_RE.search(link)
    if match is None:
        raise ValueError('no zpid in link: {}'.format(link))
    return int(match.group(1))


def _parse_hoa(text):
    """Convert an HOA fact such as ``' $6,000 annually'`` to a monthly float.

    Amounts not marked ``monthly`` are treated as yearly and divided by 12.
    Returns NaN when the text holds no number.
    """
    match = _AMOUNT_RE.search(text)
    if match is None:
        return np.nan
    amount = float(match.group().replace(',', ''))
    if 'monthly' in text:
        return amount
    return amount / 12


def _parse_listing(soup, link):
    """Build the result row for one listing page, or ``None`` for auctions.

    A page without the "recently sold" status icon is treated as an auction
    listing and skipped.
    """
    sold_icon = soup.find('span', {'class': 'ds-status-icon zsg-icon-recently-sold'})
    if sold_icon is None:
        return None

    # str.strip() removes a *set of characters* from both ends, not a prefix;
    # that is sufficient here because the remaining text starts and ends with
    # characters outside the set.
    price = sold_icon.parent.text.strip('Sold:').replace('$', '').replace(' ', '')
    date_sold = sold_icon.parent.parent.findAll('span')[3].text.strip('Sold on ')

    header = soup.find('div', {'class': 'ds-bed-bath-living-area-header'})
    summary = header.findAll('span')[0].text
    bed_bath_sqft = (summary.replace('bd', '').replace('ba', '')
                     .replace('sqft', '').split(' '))
    bedroom = bed_bath_sqft[0]
    bathroom = bed_bath_sqft[1]
    sqft = bed_bath_sqft[2]
    if sqft == '--':
        sqft = np.nan

    address_parts = soup.find('h1', {'id': 'ds-chip-property-address'}).text.split(',')
    address = address_parts[0]
    if address_parts[2] != ' NY':
        city = address_parts[1].replace(' ', '')
    else:
        city = address_parts[1] + ' ' + address_parts[2].replace(' ', '')
    zip_code = address_parts[2].replace('NY', '').replace(' ', '')

    # Every optional fact defaults to NaN so a listing that omits one still
    # yields a complete row.
    facts = {'building_type': np.nan, 'year_built': np.nan,
             'parking': np.nan, 'hoa': np.nan}
    fact_list = soup.find('ul', {'class': 'ds-home-fact-list'})
    items = fact_list.findAll('li') if fact_list is not None else []
    for item in items:
        print(item.text)
        parts = item.text.split(':')
        if len(parts) < 2:
            continue
        label, value = parts[0], parts[1]
        if label in FACT_FIELDS:
            facts[FACT_FIELDS[label]] = value
        elif label == 'HOAHOA':
            facts['hoa'] = _parse_hoa(value)

    return pd.Series(
        data=[_listing_id(link), address, zip_code, city, str(price),
              date_sold, facts['building_type'], facts['year_built'],
              bedroom, bathroom, facts['parking'], facts['hoa'], sqft],
        index=COLUMNS,
    )


def home_detail(link):
    """Open *link* in a fresh browser and return its details as a Series.

    Returns:
        A ``pandas.Series`` indexed by ``COLUMNS``, or ``None`` when the page
        is not a sold listing (treated as an auction).

    Raises:
        AttributeError, IndexError, ValueError: if the page does not have the
        expected structure.
    """
    driver2 = _new_driver()
    try:
        driver2.get(link)
        captcha(driver2)
        html = driver2.page_source
    finally:
        # Always release the browser, even if loading the page failed.
        driver2.quit()
    return _parse_listing(BeautifulSoup(html, 'html.parser'), link)


def _known_ids(df):
    """Return the set of listing ids already stored in *df*."""
    if 'id' not in df.columns:
        return set()
    ids = pd.to_numeric(df['id'], errors='coerce').dropna()
    return {int(value) for value in ids}


def _listing_links(driver):
    """Collect the unique ``homedetails`` URLs on the current page, in order."""
    links = []
    seen = set()
    for elem in driver.find_elements(By.XPATH, "//a[@href]"):
        try:
            href = elem.get_attribute("href")
        except WebDriverException:
            # e.g. the element went stale while the page kept loading.
            continue
        segments = href.split('/') if href else []
        if len(segments) > 3 and segments[3] == 'homedetails' and href not in seen:
            seen.add(href)
            links.append(href)
    return links


def _append_row(df, row):
    """Return *df* with the Series *row* appended as a new last row."""
    frame = row.to_frame().T
    if df.empty:
        return frame
    return pd.concat([df, frame], ignore_index=True)


def main():
    """Scrape result pages 1-20 and append unseen sold listings to the CSV."""
    # Reuse an existing CSV if present; otherwise start from an empty frame.
    try:
        df = pd.read_csv(CSV_PATH)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df = pd.DataFrame(columns=COLUMNS)
    else:
        print('Dataframe imported.')
        print(df)

    known_ids = _known_ids(df)
    driver = _new_driver()
    try:
        for page in range(1, 21):
            driver.get(SEARCH_URL.format(page))
            captcha(driver)

            # Scroll down in steps so lazily loaded result cards get rendered.
            for step in range(10):
                driver.execute_script(
                    "window.scrollTo(0, {})".format(500 + 1000 * step))
                time.sleep(1)

            for link in _listing_links(driver):
                try:
                    listing_id = _listing_id(link)
                except ValueError:
                    continue
                if listing_id in known_ids:
                    continue

                try:
                    details = home_detail(link)
                except (AttributeError, IndexError, ValueError,
                        WebDriverException) as err:
                    # One malformed listing must not abort the whole run.
                    print('Skipping {}: {}'.format(link, err))
                    details = None

                if details is not None:
                    df = _append_row(df, details)
                    known_ids.add(listing_id)

                delay = random.randint(1, 5)
                print('waiting {} seconds.'.format(delay))
                time.sleep(delay)

            # Persist after every page so an interrupted run keeps its data.
            df.to_csv(CSV_PATH, index=False, encoding='utf-8-sig')
    finally:
        driver.quit()


if __name__ == "__main__":
    main()