# Web-Crawler / 3700crawler (page-header residue kept as a comment so the file parses)
#!/usr/bin/env python3

import argparse
import socket
import ssl
import sys
import urllib.parse

from html.parser import HTMLParser

# default Fakebook host and HTTPS port (overridable via -s / -p)
DEFAULT_SERVER = "proj5.3700.network"
DEFAULT_PORT = 443
# carriage return, line feed
CRLF ='\r\n'

# Fakebook HTML Parser: fed HTML data
# and performs parsing of data
class FakebookParser(HTMLParser):
    """Scrapes Fakebook HTML for the CSRF middleware token, crawlable
    links, and secret flags."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.csrfmiddleware = None  # value of the csrfmiddlewaretoken <input>
        self.hrefs = []             # candidate links found on fed pages
        self.flags = []             # secret flag strings found on fed pages

    # handles HTML data found within tags and stores a flag if found
    def handle_data(self, data):
        if 'FLAG: ' in data:
            self.flags.append(data.replace('FLAG: ', ''))

    # searches <input> tags for the CSRF middleware token and <a> tags
    # for crawlable urls (skipping the logout link and the site root)
    def handle_starttag(self, tag, attrbs):
        if tag == "input" and not self.csrfmiddleware:
            attrs = dict(attrbs)
            # BUG FIX: only accept the value of the csrfmiddlewaretoken
            # input; the old code took the first <input> value it saw,
            # which breaks if any other input precedes the token
            if attrs.get("name") == "csrfmiddlewaretoken":
                self.csrfmiddleware = attrs.get("value")
        elif tag == 'a':
            for key, val in attrbs:
                if key == 'href' and val != '/accounts/logout/' and val != '/':
                    self.hrefs.append(val)

    # returns the scraped data (a copy of the links, and the live flag list)
    def return_data(self):
        return self.hrefs.copy(), self.flags

# Crawler class used to crawl Fakebook
class Crawler:
    """Logs into Fakebook over a single keep-alive TLS connection and
    performs a BFS crawl of the site, printing the five secret flags
    once all are found."""

    # carriage return + line feed; a class constant so every method can
    # reach it without depending on a module-level name
    CRLF = '\r\n'

    def __init__(self, args):
        self.server = args.server
        self.port = args.port
        self.username = args.username
        self.password = args.password
        self.csrf_token = None
        self.cookies = {'csrftoken': None, 'sessionid': None}
        self.visit_queue = []  # BFS frontier of URLs still to fetch
        self.seen = set()      # URLs ever enqueued, to avoid re-crawling

        # use provided server and port (default is proj5.3700.network and 443 respectively)
        self.server_port = "https://" + str(self.server) + ":" + str(self.port)
        self.root_url = self.server_port + "/fakebook/"
        self.login_url = self.server_port + "/accounts/login/?next=/fakebook/"

        # keep-alive requires a single socket used for all sending and receiving
        mysocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        mysocket.connect((self.server, self.port))
        # BUG FIX: ssl.wrap_socket() was deprecated and removed in Python
        # 3.12; use an SSLContext.  The legacy call performed no certificate
        # verification, so disable it here to preserve behavior against the
        # course server.
        context = ssl.create_default_context()
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
        self.socket = context.wrap_socket(mysocket, server_hostname=self.server)

    # attempt to login to Fakebook via an initial GET request (obtaining
    # cookies and the CSRF token), then POST the credentials
    def try_login(self):
        self.handle_GET_request(self.login_url)
        response = self.receive()

        parser = FakebookParser()
        parser.feed(response.split(self.CRLF + self.CRLF)[1])
        # retrieve CSRF token from the login form, submitted with the POST
        self.csrf_token = parser.csrfmiddleware
        self.handle_POST_request()
        self.receive()

    # sends an HTTP GET request for the given path over the shared socket
    def handle_GET_request(self, path):
        # standard GET headers - request line and Host header
        request_line = "GET " + str(path) + " HTTP/1.1" + self.CRLF
        host_hdr = "Host: " + self.server + ":" + str(self.port) + self.CRLF

        # if there are cookies stored, they are always sent with a GET request
        if self.cookies['csrftoken'] and self.cookies['sessionid']:
            cookies = ("Cookie: csrftoken=" + self.cookies['csrftoken'] +
                       "; sessionid=" + self.cookies['sessionid'] + self.CRLF)
            request = request_line + host_hdr + cookies + self.CRLF
        else:
            request = request_line + host_hdr + self.CRLF
        self.send_request(request)

    # sends the HTTP POST login request to the server
    def handle_POST_request(self):
        # standard POST headers - request line and Host header
        request_line = "POST " + self.login_url + " HTTP/1.1" + self.CRLF
        host_hdr = "Host: " + self.server + ":" + str(self.port) + self.CRLF

        # POST request body formulation (form-encoded credentials + CSRF)
        params = {'username': self.username, 'password': self.password,
                  'csrfmiddlewaretoken': self.csrf_token}
        encoded_data = urllib.parse.urlencode(params)

        # specifying content type and length
        content_type = "Content-Type: application/x-www-form-urlencoded" + self.CRLF
        content_length = ("Content-Length: " + str(len(encoded_data)) +
                          self.CRLF + self.CRLF)

        # append stored cookies to the headers
        cookies = ("Cookie: csrftoken=" + self.cookies['csrftoken'] +
                   "; sessionid=" + self.cookies['sessionid'] + self.CRLF)
        # BUG FIX: send the body exactly as urlencoded; the old
        # .replace('%', '=') corrupted any percent-escaped character in the
        # username, password, or token
        request = (request_line + host_hdr + cookies + content_type +
                   content_length + encoded_data)

        self.send_request(request)

    # parses a response header block for the status code and Location
    # header, and returns whether the status code represents success
    def formulate_response(self, response, path):
        parsed = {'status code': None, 'location': None}
        for line in response.split(self.CRLF):
            if "HTTP/1.1" in line:
                # read the status code from the status line
                parsed['status code'] = int(line.split()[1])
            value = line.split(": ")
            if value[0] == "Location":
                parsed['location'] = value[1]
        return self.handle_status_code(parsed, path)

    # takes the action appropriate to the HTTP status code, then returns
    # True on a 200 OK code or False otherwise
    def handle_status_code(self, response, path):
        status = response['status code']
        if status == 200:
            # 200 OK - proceed with parsing
            return True
        elif status == 503:
            # 503 error - re-queue the same URL to retry the GET later
            self.visit_queue.append(path)
        elif status in (403, 404):
            # BUG FIX: the old `status == 403 or 404` was always truthy;
            # abandon the URL (do not re-add it to the frontier)
            pass
        elif status == 302:
            # 302 Found - retry with the new URL from the Location header
            if response['location']:
                self.visit_queue.append(response['location'])
            else:
                # BUG FIX: sys.stderr is not callable; write to it instead
                sys.stderr.write("Error: Received HTTP redirect, but not given new URL in header\n")
        else:
            sys.stderr.write("Unrecognized status code.\n")
        return False

    # sends a fully-formed request string to the http server
    def send_request(self, request):
        self.socket.send(request.encode('ascii'))

    # receives one complete HTTP response (headers + body) from the server
    def receive(self):
        data = ""
        # read until the blank line that terminates the header block
        while (self.CRLF + self.CRLF) not in data:
            data += self.socket.recv(4096).decode('ascii')
        header, _, body = data.partition(self.CRLF + self.CRLF)

        # update the stored csrf and session id with the new cookies
        if 'Set-Cookie: csrftoken=' in header and 'Set-Cookie: sessionid=' in header:
            self.cookies['csrftoken'] = header.split('Set-Cookie: csrftoken=')[1].split(';')[0]
            self.cookies['sessionid'] = header.split('Set-Cookie: sessionid=')[1].split(';')[0]

        # BUG FIX: part of the body may already have arrived along with the
        # headers, and recv() may return short - read exactly the remaining
        # bytes instead of `length` more.  (Content-Length is now delimited
        # by CRLF rather than assuming a following Connection header.)
        length = int(header.split('Content-Length: ')[1].split(self.CRLF)[0])
        while len(body) < length:
            body += self.socket.recv(length - len(body)).decode('ascii')
        return header + self.CRLF + self.CRLF + body

    # essentially 'runs' our Crawler: logs into Fakebook, instantiates the
    # parser, and starts searching for flags
    def run(self):
        # BUG FIX: call try_login on self rather than the module-level
        # `sender`, so Crawler does not depend on the script's global
        self.try_login()
        # GET welcome page
        self.handle_GET_request(self.root_url)
        response = self.receive()
        parser = FakebookParser()
        # feed response to parser to extract the initial frontier
        parser.feed(response.split(self.CRLF + self.CRLF)[1])
        self.visit_queue.extend(parser.return_data()[0])
        parser.reset()

        # iterate over the frontier while unexplored urls remain
        while self.visit_queue:
            url = self.visit_queue.pop(0)
            # always crawl within the root domain
            path = self.root_url + url.replace('/fakebook/', '')
            self.handle_GET_request(path)
            response = self.receive()
            # begin parsing the data if a 200 - OK response was received
            if self.formulate_response(response.split(self.CRLF + self.CRLF)[0], path):
                parser.feed(response.split(self.CRLF + self.CRLF)[1])
                hrefs, flags = parser.return_data()
                # scan the links found on a page for frontier eligibility
                for h in hrefs:
                    if h not in self.seen:
                        self.visit_queue.append(h)
                        self.seen.add(h)
                # return once all five flags are found
                if len(flags) == 5:
                    for f in flags:
                        print(f)
                    return
                parser.reset()

# main method
# script entry point: parse the command line and launch the crawler
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description='crawl Fakebook')
    arg_parser.add_argument('-s', dest="server", type=str, default=DEFAULT_SERVER, help="The server to crawl")
    arg_parser.add_argument('-p', dest="port", type=int, default=DEFAULT_PORT, help="The port to use")
    arg_parser.add_argument('username', type=str, help="The username to use")
    arg_parser.add_argument('password', type=str, help="The password to use")
    cli_args = arg_parser.parse_args()
    # `sender` stays a module-level name for compatibility with any code
    # that looks it up globally
    sender = Crawler(cli_args)
    sender.run()