# Zillow-Web-Scrape / main.py
import random
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Path to the local ChromeDriver binary; adjust for your environment.
CHROMEDRIVER_PATH = r'C:\Users\ymc\Downloads\chromedriver_win32 (1)\chromedriver.exe'


# Pause for manual intervention when Zillow serves its "verify you're a human"
# interstitial instead of the requested page.
def captcha(driver):
    try:
        banner = driver.find_element(By.XPATH, '/html/body/main/div/div/h5')
        if banner.text == "Please verify you're a human to continue.":
            print('Solving Captcha...')
            input('Solve the captcha in the browser, then press Enter to continue.')
    except NoSuchElementException:
        # No captcha banner; the page loaded normally.
        pass
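

# A minimal non-blocking alternative sketch (assumption: the captcha banner
# keeps the same XPath as above): wait up to `timeout` seconds for the banner
# to disappear instead of blocking on stdin. Not called by default.
def wait_out_captcha(driver, timeout=60):
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    WebDriverWait(driver, timeout).until(
        EC.invisibility_of_element_located((By.XPATH, '/html/body/main/div/div/h5')))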

# Open one property detail page, parse its facts, and return them as a pandas
# Series; returns None for auction listings that have no sold price.
def home_detail(link):
    driver2 = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
    driver2.get(link)

    captcha(driver2)

    soup = BeautifulSoup(driver2.page_source, 'html.parser')

    # Auction listings have no "Sold:" price badge; flag and skip them below.
    try:
        price = (soup.find('span', {'class': 'ds-status-icon zsg-icon-recently-sold'})
                 .parent.text.replace('Sold:', '').replace('$', '').replace(' ', ''))
        auction = False
    except AttributeError:
        auction = True

    if not auction:
        date_sold = (soup.find('span', {'class': 'ds-status-icon zsg-icon-recently-sold'})
                     .parent.parent.find_all('span')[3].text.replace('Sold on ', ''))

        # Header reads "X bd Y ba Z sqft"; strip the labels and split.
        bedbathsqft = (soup.find('div', {'class': 'ds-bed-bath-living-area-header'})
                       .find_all('span')[0].text
                       .replace('bd', '').replace('ba', '').replace('sqft', '').split(' '))

        # Address renders as "street, city, NY zip"; split once and reuse.
        address_parts = soup.find('h1', {'id': 'ds-chip-property-address'}).text.split(',')
        address = address_parts[0]

        if address_parts[2] != ' NY':
            city = address_parts[1].replace(' ', '')
        else:
            # Some addresses put the state in its own segment; rejoin it.
            city = address_parts[1] + ' ' + address_parts[2].replace(' ', '')

        zip_code = address_parts[2].replace('NY', '').replace(' ', '')

        bedroom = bedbathsqft[0]
        bathroom = bedbathsqft[1]
        sqft = bedbathsqft[2]

        # Zillow shows '--' when the square footage is unknown.
        if sqft == '--':
            sqft = np.nan

        # Default every fact to NaN so a listing that omits one cannot raise
        # NameError when the Series is built below.
        building_type = year_built = parking = hoa = np.nan

        # Fact labels arrive doubled from the page markup (e.g. 'HOAHOA',
        # 'ParkingParking'); match them exactly as scraped.
        home_facts = soup.find('ul', {'class': 'ds-home-fact-list'}).find_all('li')

        for fact in home_facts:
            details = fact.text.split(':')
            print(fact.text)
            if details[0] == 'BuildingType':
                building_type = details[1]
            elif details[0] == 'CalendarYear built':
                year_built = details[1]
            elif details[0] == 'ParkingParking':
                parking = details[1]
            elif details[0] == 'HOAHOA':
                if 'monthly' in details[1]:
                    hoa = details[1].replace('monthly', '').replace('$', '').replace(' ', '')
                else:
                    # Annual fee; convert to a monthly figure.
                    hoa = details[1].replace('annually', '').replace('$', '').replace(' ', '')
                    hoa = int(hoa) / 12


        # The zpid is the numeric listing id embedded in the detail URL.
        zpid = int(link.split('/')[-2].replace('_zpid', ''))

        series = pd.Series(
            data=[zpid, address, zip_code, city, str(price), date_sold, building_type,
                  year_built, bedroom, bathroom, parking, hoa, sqft],
            index=['id', 'address', 'zip', 'city', 'price', 'date_sold', 'building_type',
                   'year_built', 'bedroom', 'bathroom', 'parking', 'hoa', 'sqft'])

        driver2.quit()
        return series
    else:
        driver2.quit()
        return None
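

# Example call, for reference (hypothetical URL; real detail links are
# collected in main() below):
#   row = home_detail('https://www.zillow.com/homedetails/123-Main-St-New-York-NY-10001/12345678_zpid/')
#   if row is not None:
#       print(row['price'], row['date_sold'])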



def main():
    # Resume from an existing CSV if present; otherwise start an empty dataframe.
    try:
        df = pd.read_csv("nyc_housing_sales.csv")
        print('Dataframe imported.')
        print(df)
    except FileNotFoundError:
        df = pd.DataFrame(
            columns=['id', 'address', 'zip', 'city', 'price', 'date_sold', 'building_type',
                     'year_built', 'bedroom', 'bathroom', 'parking', 'hoa', 'sqft'])

    base_url = 'https://www.zillow.com/new-york-ny/sold/'

    driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
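
    # Optional sketch: run headless so no browser window opens. Off by default
    # here, since manually solving a captcha then becomes impossible.
    # options = webdriver.ChromeOptions()
    # options.add_argument('--headless=new')
    # driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=options)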


    # Walk the first 20 pages of sold-listing search results.
    for page in range(1, 21):
        driver.get('{}{}_p'.format(base_url, page))

        captcha(driver)

        # Scroll down in steps so the page lazy-loads every listing card.
        y = 500
        for _ in range(10):
            driver.execute_script("window.scrollTo(0, {})".format(y))
            y += 1000
            time.sleep(1)
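
        # Alternative sketch: jump straight to the bottom in one step, assuming
        # the page height is already final (Zillow's lazy loader may not have
        # run yet, so the stepped scroll above is the safer default).
        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")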

        # Collect the hrefs of every property-detail link on the results page.
        elems = driver.find_elements(By.XPATH, "//a[@href]")
        links = []

        for elem in elems:
            try:
                if elem.get_attribute("href").split('/')[3] == 'homedetails':
                    links.append(elem.get_attribute("href"))
            except (AttributeError, IndexError):
                # href was empty or too short to split into a path segment
                pass
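
        # Equivalent filter as a single XPath query, for reference (assumes the
        # '/homedetails/' path segment is a reliable marker):
        # links = [e.get_attribute('href') for e in
        #          driver.find_elements(By.XPATH, "//a[contains(@href, '/homedetails/')]")]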

        for listing_link in links:
            listing_id = int(listing_link.split('/')[-2].replace('_zpid', ''))

            # Skip listings already captured in the dataframe.
            if listing_id in df.values:
                continue

            details = home_detail(listing_link)
            if details is not None:
                df = pd.concat([df, details.to_frame().T], ignore_index=True)

            # Randomized pause between requests to look less bot-like.
            delay = random.randint(1, 5)
            print('waiting {} seconds.'.format(delay))
            time.sleep(delay)

            # Persist after every listing so progress survives a crash.
            df.to_csv("nyc_housing_sales.csv", index=False, encoding='utf-8-sig')
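
            # Optional hardening sketch: dedupe on the listing id before the
            # next write, in case one listing appears on two result pages.
            # df = df.drop_duplicates(subset='id', ignore_index=True)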


    driver.quit()


if __name__ == '__main__':
    main()