# py_wikiracer/wikiracer.py
from py_wikiracer.internet import Internet
from typing import List
from queue import PriorityQueue

class Parser:

    @staticmethod
    def get_links_in_page(html: str) -> List[str]:
        """
        This method returns all URLs, given an HTML string, in a Wikipedia page as a list of strings
        of format '/wiki/<something>'. This ensures no duplicate URLs and disallowed characters (defined
        by internet.DISALLOWED) witin any of the URLs
        """
        links = []
        disallowed = Internet.DISALLOWED
        from_index = 0
        target_substring = "href=\""

        while True:
            start_index = html.find(target_substring, from_index)
            if start_index != -1:
                start_index += len(target_substring)
                end_index = html.find("\"", start_index)
                link = html[start_index : end_index]
                if link not in links and link.startswith("/wiki/") and not any(char in link[6:] for char in disallowed):
                    links.append(link)
                from_index = end_index + 1
            else:
                break
        return links
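
# Hedged usage sketch for Parser.get_links_in_page. The HTML snippet and the helper
# name are illustrative assumptions, and the expected result assumes 'Dog' contains
# no character from Internet.DISALLOWED.
def _parser_example() -> List[str]:
    html = '<a href="/wiki/Dog">Dog</a> <a href="/wiki/Dog">dup</a> <a href="/w/index.php">skip</a>'
    return Parser.get_links_in_page(html)  # expected: ['/wiki/Dog'] (duplicate and non-/wiki/ hrefs dropped)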

class BFSProblem:
    def __init__(self):
        self.internet = Internet()
    
    def bfs(self, source = "/wiki/Calvin_Li", goal = "/wiki/Wikipedia"):
        # Initialize the bookkeeping structures; the main data structure is a FIFO queue
        path = [source]
        visited_pages = []
        visited_parents = []
        queue = []
        sub_path = []
        target_found = False
        visited_pages.append(source)
        visited_parents.append(None)
        queue.append(source)

        # Keep looping until either the queue is empty or the goal
        # URL is found on one of the traversed pages
        while queue and not target_found:
            page = queue.pop(0) # Pop items in a FIFO order
            html = self.internet.get_page(page)
            links = Parser.get_links_in_page(html)
            for link in links:
                if link not in visited_pages or link == goal: # Enqueue links not yet visited (the goal is always accepted so it ends up last in visited_parents)
                    visited_pages.append(link)
                    visited_parents.append(page)
                    queue.append(link)
                    if link == goal: # Mark the target_found flag True & break the loop if the goal URL is found 
                        target_found = True
                        break
        if not target_found:
            return None # Return None if no path was found
        else:
            # Traverse back to trace the path visited
            url_pointer = visited_parents[-1]
            while url_pointer != source: 
                sub_path.append(url_pointer)
                url_pointer = visited_parents[visited_pages.index(url_pointer)]

            if sub_path:
                sub_path.reverse()
                path.extend(sub_path)
            path.append(goal)
            return path # Return the path as a list of URL strings
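
# Hedged usage sketch (the page pair is just the default example; each call downloads
# pages over the network through Internet). Because BFS expands pages in FIFO order,
# the returned path follows the fewest links, at the cost of many downloads.
def _bfs_example():
    return BFSProblem().bfs(source="/wiki/Calvin_Li", goal="/wiki/Wikipedia")
    # -> ['/wiki/Calvin_Li', ..., '/wiki/Wikipedia'], or None if unreachable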

class DFSProblem:
    def __init__(self):
        self.internet = Internet()
        self.visited_pages = list()
        self.sub_path = list() # Define subpath to be used between source and goal nodes to create the full path

    def dfs(self, source = "/wiki/Calvin_Li", goal = "/wiki/Wikipedia"):
        """Recursive DFS Algorithm"""
        def dfs_traversal(source, goal):
            target_found = False
            self.visited_pages.append(source)
            html = self.internet.get_page(source)
            links = Parser.get_links_in_page(html)
            if goal in links: # Return True if goal was found
                self.sub_path.append(goal)
                target_found = True
                return target_found
            links.reverse() # Reverse the link list so the recursion explores each page's last link first
            for link in links:
                if link not in self.visited_pages:
                    downstream = dfs_traversal(link, goal)
                    if downstream:
                        self.sub_path.append(link)
                        return True

        path = [source]
        dfs_traversal(source, goal)

        # Append the URL sequence in correct order before returning the path
        if not self.sub_path:
            return None
        else:
            self.sub_path.reverse()
            path.extend(self.sub_path)
        if goal not in path:
            path.append(goal)
        return path 
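
# Hedged usage sketch. Note that DFSProblem keeps visited_pages and sub_path on the
# instance, so each search should use a fresh DFSProblem object (an assumption about
# intended usage, inferred from the code above).
def _dfs_example():
    return DFSProblem().dfs(source="/wiki/Calvin_Li", goal="/wiki/Wikipedia")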

class DijkstrasProblem:
    def __init__(self):
        self.internet = Internet()
    
    def dijkstras(self, source = "/wiki/Calvin_Li", goal = "/wiki/Wikipedia", costFn = lambda x, y: len(y)):
        """Dijkstra's algorithm set with a priority queue"""
        priority_queue = PriorityQueue()
        visited_pages = list()
        priority_queue.put((costFn(source, source), [source]))
        target_found = False
        path = [source]
        
        while not priority_queue.empty():
            next_element = priority_queue.get()
            sub_path = next_element[1]
            sub_destination = sub_path[-1]
            if sub_destination not in visited_pages:
                visited_pages.append(sub_destination)
                html = self.internet.get_page(sub_destination)
                links = Parser.get_links_in_page(html)
                if goal in links:
                    path = sub_path
                    target_found = True
                    break
                for link in links:
                    if link not in visited_pages:
                        new_path = sub_path + [link]
                        priority_queue.put((costFn(new_path[0], new_path[-1]) + next_element[0], new_path))

        if not target_found:
            return None # if no path exists, return None

        path.append(goal)
        return path 
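
# Hedged usage sketch of the costFn hook. In the calls above, x is always the path's
# source and y the newly reached page. A constant cost function (an illustrative
# choice, not part of the assignment) makes dijkstras() minimize the number of links
# followed, much like BFS.
def _dijkstras_example():
    return DijkstrasProblem().dijkstras(source="/wiki/Calvin_Li",
                                        goal="/wiki/Wikipedia",
                                        costFn=lambda x, y: 1)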

class WikiracerProblem:
    def __init__(self):
        self.internet = Internet()
        self.possible_backlinks = list() # Links found on the goal page; any candidate URL in this list gets a reduced cost during Dijkstra prioritization
        # Common links within random Wikipedia pages to be excluded from every search to reduce unnecessary internet requests
        self.blacklist = set([
            '/wiki/Wikidata', '/wiki/Trove_(identifier)', '/wiki/VIAF_(identifier)', '/wiki/Wikispecies',
            '/wiki/Animal', '/wiki/Interim_Register_of_Marine_and_Nonmarine_Genera', '/wiki/ISNI_(identifier)',
            '/wiki/Wayback_Machine', '/wiki/Geographic_coordinate_system', '/wiki/Main_Page',
            '/wiki/Global_Biodiversity_Information_Facility', '/wiki/Time_zone', '/wiki/Daylight_saving_time',
            '/wiki/Doi_(identifier)', '/wiki/Integrated_Taxonomic_Information_System', '/wiki/World_War_II',
            '/wiki/SUDOC_(identifier)', '/wiki/ISBN_(identifier)', '/wiki/ISSN_(identifier)',
            '/wiki/OCLC_(identifier)', '/wiki/RISM_(identifier)', '/wiki/PMID_(identifier)',
            '/wiki/RERO_(identifier)', '/wiki/Public_domain',
        ])

    def wikiracer(self, source = "/wiki/Calvin_Li", goal = "/wiki/Wikipedia"):
        """Initialize the goal page's backlinks (removing common links)"""
        def get_possible_backlinks(goal):
            html = self.internet.get_page(goal)
            return remove_common_links(Parser.get_links_in_page(html))

        """Define Dijkstra's cost by rewarding nodes in goal page's links & as well nodes with length close to the goal (within 5 characters)"""
        def costFn(node, goal):
            return 2 - int(node in self.possible_backlinks) - int(pow(len(goal)-len(node), 2) <= 25) 
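        # Worked example of the formula above: a candidate that both appears in the
        # goal page's links and whose length is within 5 characters of the goal's
        # costs 2 - 1 - 1 = 0; a candidate with only one of the two bonuses costs 1;
        # all other candidates cost 2.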
        
        """Remove common URLs from the given list to reduce the internet requests of the algorithm"""
        def remove_common_links(links):
            links_set = set(links)
            links_set = links_set.difference(self.blacklist)
            return list(links_set)
        
        """Setup Dijsktra's algorithm (while removing common Wikipedia links with every internet request)"""
        priority_queue = PriorityQueue()
        visited_pages = list()
        priority_queue.put((0, [source]))
        target_found = False
        path = [source]
        self.possible_backlinks = get_possible_backlinks(goal)
        
        while not priority_queue.empty():
            next_element = priority_queue.get()
            sub_path = next_element[1]
            sub_destination = sub_path[-1]
            if sub_destination not in visited_pages:
                visited_pages.append(sub_destination)
                html = self.internet.get_page(sub_destination)
                links = Parser.get_links_in_page(html)
                links = remove_common_links(links)
                if goal in links:
                    path = sub_path
                    target_found = True
                    break
                for link in links:
                    if link not in visited_pages:
                        new_path = sub_path + [link]
                        priority_queue.put((costFn(new_path[-1], goal) + next_element[0], new_path))

        if not target_found:
            return None

        path.append(goal)
        return path # Return the path as a list of URL strings
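
# Hedged usage sketch. The wikiracer above trades path length for fewer downloads by
# biasing the search toward links that also appear on the goal page and skipping
# blacklisted common links.
def _wikiracer_example():
    return WikiracerProblem().wikiracer(source="/wiki/Calvin_Li", goal="/wiki/Wikipedia")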

# KARMA
class FindInPageProblem:
    def __init__(self):
        self.internet = Internet()
    # This Karma problem is a little different. In this, we give you a source page, and then ask you to make up some heuristics that will allow you to efficiently
    #  find a page containing all of the words in `query`. Again, optimize for the fewest number of internet downloads, not for the shortest path.

    def find_in_page(self, source = "/wiki/Calvin_Li", query = ["ham", "cheese"]):

        raise NotImplementedError("Karma method find_in_page")

        path = [source]

        # find a path to a page that contains ALL of the words in query in any place within the page
        # path[-1] should be the page that fulfills the query.
        # YOUR CODE HERE

        return path # if no path exists, return None
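
# Hedged sketch of one possible heuristic for the karma problem above (not the
# assignment's required solution): greedily expand links whose parent page already
# mentions the most query words, on the assumption that such pages sit closer to a
# page containing all of them. Only Internet.get_page and Parser.get_links_in_page
# from this file are used.
def _find_in_page_sketch(source="/wiki/Calvin_Li", query=["ham", "cheese"]):
    internet = Internet()
    seen = {source}
    frontier = PriorityQueue()
    frontier.put((0, [source]))
    while not frontier.empty():
        _, path = frontier.get()
        html = internet.get_page(path[-1])
        if all(word in html for word in query):
            return path  # path[-1] contains every query word (naive substring check)
        # Score each outgoing link by how many query words its parent page is missing;
        # fewer missing words means a smaller key and therefore higher priority.
        missing = sum(word not in html for word in query)
        for link in Parser.get_links_in_page(html):
            if link not in seen:
                seen.add(link)
                frontier.put((missing, path + [link]))
    return None  # no reachable page contains every query word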