#!/usr/bin/env python3
import argparse
import socket
import ssl
import sys
import urllib.parse
from html.parser import HTMLParser
# connection defaults for the course Fakebook server
DEFAULT_SERVER = "proj5.3700.network"
DEFAULT_PORT = 443
# HTTP line terminator: carriage return + line feed
CRLF = '\r\n'
# Fakebook HTML Parser: fed HTML data
# and performs parsing of data
# Fakebook HTML Parser: fed HTML data
# and performs parsing of data
class FakebookParser(HTMLParser):
    """Extracts the Django CSRF token, crawlable links, and secret flags
    from Fakebook HTML pages."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.csrfmiddleware = None  # value of the csrfmiddlewaretoken <input>
        self.hrefs = []             # links discovered on the page
        self.flags = []             # secret flags discovered on the page

    # handles HTML data found within tags and stores a flag if found
    def handle_data(self, data):
        if 'FLAG: ' in data:
            self.flags.append(data.replace('FLAG: ', ''))

    # custom handle_starttag method that searches for middlewaretoken
    # and urls found on the website
    def handle_starttag(self, tag, attrbs):
        if tag == "input" and not self.csrfmiddleware:
            attrs = dict(attrbs)
            # bug fix: only accept the value of the actual Django CSRF field;
            # the old code took the first <input> value it saw, so any earlier
            # input with a value attribute would poison the token
            if attrs.get("name") == "csrfmiddlewaretoken":
                self.csrfmiddleware = attrs.get("value")
        elif tag == 'a':
            for key, val in attrbs:
                # skip the logout link and the site root
                if key == 'href' and val not in ('/accounts/logout/', '/'):
                    self.hrefs.append(val)

    # returns the scraped data (hrefs as a defensive copy)
    def return_data(self):
        return self.hrefs.copy(), self.flags
# Crawler class used to crawl Fakebook
class Crawler:
def __init__(self, args):
self.server = args.server
self.port = args.port
self.username = args.username
self.password = args.password
self.csrf_token = None
self.cookies = {'csrftoken' : None, 'sessionid' : None}
self.visit_queue = []
self.seen = set()
# use provided server and port (default is proj5.3700.network and 443 respectively)
self.server_port = "https://" + str(self.server) + ":" + str(self.port)
self.root_url = self.server_port + "/fakebook/"
self.login_url = self.server_port + "/accounts/login/?next=/fakebook/"
# keep-alive requires a socket used for all sending and receiving
mysocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysocket.connect((self.server, self.port))
# the TCP socket gets wrapped in TLS immediately after connection
self.socket = ssl.wrap_socket(mysocket)
# attempt to login to Fakebook site via an initial GET request,
# using the provided cookies and CSRF to POST a login request to the website
def try_login(self):
self.handle_GET_request(self.login_url)
response = self.receive()
parser = FakebookParser()
parser.feed(response.split(CRLF + CRLF)[1])
# retrieve CSRF from response, to be submitted with POST request
self.csrf_token = parser.csrfmiddleware
self.handle_POST_request()
self.receive()
# sends our HTTP GET request to the server
def handle_GET_request(self, path):
# standard GET headers - request type and Host header
request_line = "GET " + str(path) + " HTTP/1.1" + CRLF
host_hdr = "Host: " + self.server + ":" + str(self.port) + CRLF
# if there are cookies stored, they are always sent with a GET request
if self.cookies['csrftoken'] and self.cookies['sessionid']:
cookies = "Cookie: csrftoken=" + self.cookies['csrftoken']+ "; sessionid=" + self.cookies['sessionid'] + CRLF
request = request_line + host_hdr + cookies + CRLF
else:
request = request_line + host_hdr + CRLF
self.send_request(request)
# sends our HTTP POST request to the server
def handle_POST_request(self):
# standard POST headers - request type and Host header
request_line = "POST " + self.login_url + " HTTP/1.1" + CRLF
host_hdr = "Host: " + self.server + ":" + str(self.port) + CRLF
# POST request body formulation
params = { 'username' : self.username, 'password' : self.password, 'csrfmiddlewaretoken' : self.csrf_token}
encoded_data = urllib.parse.urlencode(params)
# specifiying content type and length
content_type = "Content-Type: " + "application/x-www-form-urlencoded" + CRLF
content_length = "Content-Length: " + str(len(encoded_data)) + (CRLF + CRLF)
# append stored cookies to the headers
cookies = "Cookie: csrftoken=" + self.cookies['csrftoken']+ "; sessionid=" + self.cookies['sessionid']+ CRLF
request = request_line + host_hdr + cookies + content_type + content_length + encoded_data.replace('%','=')
self.send_request(request)
# parses a response to learn the necessary information for frontier
# tracking, and returns whether the HTTP status code represents success
def formulate_response(self, response, path):
parsed = {}
lines = response.split(CRLF)
parsed['status code'] = []
parsed['location'] = []
# now parse header
for key in lines:
if "HTTP/1.1" in key:
value = key.split()
# read the status code
parsed['status code'] = int(value[1])
value = key.split(": ")
if (value[0] == "Location"):
parsed['location'] = value[1]
return self.handle_status_code(parsed, path)
# identifies HTTP status code and takes the appropriate
# action, then returns True on success (200- OK) code or False otherwise
def handle_status_code(self, response, path):
status = response['status code']
if status == 200:
# 200 OK - proceed with parsing
return True
elif status == 503:
# 503 error - retry GET request
self.visit_queue.append(path)
elif status == 403 or 404:
#TRIPPED: Abandoning URL (not adding to frontier)
pass
elif status == 302:
# 302 Found - try the request again with the new URL in Location header
if (not response['location']):
sys.stderr("Error: Received HTTP redirect, but not given new URL in header")
self.visit_queue.append(response['location'])
else:
sys.stderr("Unrecognized status code.")
return False
# sends a given request to the http server
def send_request(self, request):
self.socket.send(request.encode('ascii'))
# receives a response from the server
def receive(self):
data = ""
# read the header
while (CRLF + CRLF) not in data:
chunk = self.socket.recv(100)
data += chunk.decode('ascii')
# update the stored csrf and session id with the new cookies
if 'Set-Cookie: csrftoken=' in data and 'Set-Cookie: sessionid=' in data:
self.cookies['csrftoken'] = data.split('Set-Cookie: csrftoken=')[1].split(';')[0]
self.cookies['sessionid'] = data.split('Set-Cookie: sessionid=')[1].split(';')[0]
# read the precise number of bytes specified in the header
length = int(data.split('Content-Length: ')[1].split('Connection')[0])
data += self.socket.recv(length).decode('ascii')
return data
# essentially 'runs' our Crawler:
# logs into Fakebook, instantiates parser
# and starts searching for flags
def run(self):
sender.try_login()
# GET welcome page
self.handle_GET_request(self.root_url)
response = self.receive()
parser = FakebookParser()
# feed response to parser to extract info
parser.feed(response.split(CRLF + CRLF)[1])
# parse the login welcome page to get an initial frontier
self.visit_queue.extend(parser.return_data()[0])
parser.reset()
# iterate over the frontier while unexplored urls remain
while self.visit_queue:
url = self.visit_queue.pop(0)
# always crawl the root domain
path = self.root_url + url.replace('/fakebook/','')
self.handle_GET_request(path)
response = self.receive()
# begin parsing the data if a 200 - OK response was received
if self.formulate_response(response.split(CRLF + CRLF)[0], path) == True:
parser.feed(response.split(CRLF + CRLF)[1])
hrefs, flags = parser.return_data()
# scan the links found on a page for frontier eligibility
for h in hrefs:
if h not in self.seen:
self.visit_queue.append(h)
self.seen.add(h)
# return once all five flags are found
if flags and len(flags) == 5:
for f in flags:
print(f)
return
parser.reset()
# main method
# script entry point: parse command line arguments and launch the crawler
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description='crawl Fakebook')
    arg_parser.add_argument('-s', dest="server", type=str,
                            default=DEFAULT_SERVER, help="The server to crawl")
    arg_parser.add_argument('-p', dest="port", type=int,
                            default=DEFAULT_PORT, help="The port to use")
    arg_parser.add_argument('username', type=str, help="The username to use")
    arg_parser.add_argument('password', type=str, help="The password to use")
    args = arg_parser.parse_args()
    sender = Crawler(args)
    sender.run()