import random
import math
from py_wikiracer.internet import Internet
from typing import List
from urllib.request import urlopen
from queue import PriorityQueue


class Parser:

    @staticmethod
    def get_links_in_page(html: str) -> List[str]:
        """
        Given the HTML string of a Wikipedia page, return all URLs of the
        format '/wiki/...' as a list of strings. Duplicate URLs are skipped,
        as are URLs containing any disallowed character (defined by
        Internet.DISALLOWED) after the '/wiki/' prefix.
        """
        links = []
        disallowed = Internet.DISALLOWED
        from_index = 0
        target_substring = "href=\""
        while True:
            start_index = html.find(target_substring, from_index)
            if start_index != -1:
                start_index += len(target_substring)
                end_index = html.find("\"", start_index)
                link = html[start_index:end_index]
                if link not in links and link.find("/wiki/") == 0 and not any(char in link[6:] for char in disallowed):
                    links.append(link)
                from_index = end_index + 1
            else:
                break
        return links


class BFSProblem:
    def __init__(self):
        self.internet = Internet()

    def bfs(self, source="/wiki/Calvin_Li", goal="/wiki/Wikipedia"):
        # Set up the memory used by the algorithm (the main data structure
        # being a FIFO queue)
        path = [source]
        visited_pages = []
        visited_parents = []
        queue = []
        sub_path = []
        target_found = False
        visited_pages.append(source)
        visited_parents.append(None)
        queue.append(source)
        # Keep looping until either the queue is empty or the goal URL is
        # found in one of the traversed pages
        while queue and not target_found:
            page = queue.pop(0)  # Pop items in FIFO order
            html = self.internet.get_page(page)
            links = Parser.get_links_in_page(html)
            for link in links:
                # Only add a node to visited & queue if it hasn't been visited before
                if link not in visited_pages or link == goal:
                    visited_pages.append(link)
                    visited_parents.append(page)
                    queue.append(link)
                if link == goal:  # Set the target_found flag and break the loop once the goal URL is found
                    target_found = True
                    break
        if not target_found:
            return None  # Return None if no path was found
        else:
            # Walk back through the recorded parents to recover the visited path
            url_pointer = visited_parents[-1]
            while url_pointer != source:
                sub_path.append(url_pointer)
                url_pointer = visited_parents[visited_pages.index(url_pointer)]
            if sub_path:
                if len(sub_path) == 1:
                    path.append(sub_path[0])
                else:
                    sub_path.reverse()
                    path.extend(sub_path)
            path.append(goal)
            return path  # Return the path as a list of URL strings


class DFSProblem:
    def __init__(self):
        self.internet = Internet()
        self.visited_pages = list()
        self.sub_path = list()  # Sub-path between the source and goal nodes, used to build the full path

    def dfs(self, source="/wiki/Calvin_Li", goal="/wiki/Wikipedia"):
        """Recursive DFS algorithm."""
        def dfs_traversal(source, goal):
            target_found = False
            self.visited_pages.append(source)
            html = self.internet.get_page(source)
            links = Parser.get_links_in_page(html)
            if goal in links:  # Return True if the goal was found
                self.sub_path.append(goal)
                target_found = True
                return target_found
            links.reverse()  # Reverse the links so the recursion follows each page's last link first
            for link in links:
                if link not in self.visited_pages:
                    downstream = dfs_traversal(link, goal)
                    if downstream:
                        self.sub_path.append(link)
                        return True

        path = [source]
        dfs_traversal(source, goal)
        # Append the URL sequence in the correct order before returning the path
        if not self.sub_path:
            return None
        else:
            if len(self.sub_path) == 1:
                path.append(self.sub_path[0])
            else:
                self.sub_path.reverse()
                path.extend(self.sub_path)
            if goal not in path:
                path.append(goal)
            return path
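# Illustrative sketch (not part of the assignment skeleton): expected behavior of
# Parser.get_links_in_page on a tiny hand-written HTML snippet. Only hrefs that
# start with "/wiki/" are kept and duplicates are dropped; links containing any
# character from Internet.DISALLOWED after the prefix are skipped as well (the
# exact DISALLOWED set comes from py_wikiracer.internet).
#
#     >>> Parser.get_links_in_page(
#     ...     '<a href="/wiki/Dog">a</a><a href="/wiki/Dog">b</a>'
#     ...     '<a href="https://example.com">c</a>')
#     ['/wiki/Dog']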
class DijkstrasProblem:
    def __init__(self):
        self.internet = Internet()

    def dijkstras(self, source="/wiki/Calvin_Li", goal="/wiki/Wikipedia", costFn=lambda x, y: len(y)):
        """Dijkstra's algorithm built on a priority queue."""
        priority_queue = PriorityQueue()
        visited_pages = list()
        priority_queue.put((costFn(source, source), [source]))
        target_found = False
        path = [source]
        while not priority_queue.empty():
            next_element = priority_queue.get()
            sub_path = next_element[1]
            sub_destination = sub_path[-1]
            if sub_destination not in visited_pages:
                visited_pages.append(sub_destination)
                html = self.internet.get_page(sub_destination)
                links = Parser.get_links_in_page(html)
                if goal in links:
                    path = sub_path
                    target_found = True
                    break
                for link in links:
                    if link not in visited_pages:
                        new_path = sub_path + [link]
                        priority_queue.put((costFn(new_path[0], new_path[-1]) + next_element[0], new_path))
        if not target_found:
            return None  # If no path exists, return None
        path.append(goal)
        return path
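# Usage sketch (illustrative; the grading harness may invoke this class differently).
# costFn takes two URLs and returns the cost of expanding the search to the second
# one; the default, lambda x, y: len(y), charges the length of the destination URL,
# so pages with shorter names are explored first. A caller-supplied cost function
# follows the same shape:
#
#     path = DijkstrasProblem().dijkstras(
#         source="/wiki/Calvin_Li",
#         goal="/wiki/Wikipedia",
#         costFn=lambda x, y: len(y),  # hypothetical custom cost function
#     )
#     # `path` is a list of URLs from source to goal, or None if no path exists.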
class WikiracerProblem:
    def __init__(self):
        self.internet = Internet()
        # Links found in the goal page; any URL in this list gets a reduced
        # cost so that Dijkstra's prioritizes it
        self.possible_backlinks = list()
        # Common links found in random Wikipedia pages, excluded from every
        # search to reduce unnecessary internet requests
        self.blacklist = set(['/wiki/Wikidata', '/wiki/Trove_(identifier)', '/wiki/VIAF_(identifier)',
                              '/wiki/Wikispecies', '/wiki/Animal',
                              '/wiki/Interim_Register_of_Marine_and_Nonmarine_Genera',
                              '/wiki/ISNI_(identifier)', '/wiki/Wayback_Machine',
                              '/wiki/Geographic_coordinate_system', '/wiki/Main_Page',
                              '/wiki/Global_Biodiversity_Information_Facility', '/wiki/Time_zone',
                              '/wiki/Daylight_saving_time', '/wiki/Doi_(identifier)',
                              '/wiki/Integrated_Taxonomic_Information_System', '/wiki/World_War_II',
                              '/wiki/SUDOC_(identifier)', '/wiki/ISBN_(identifier)', '/wiki/ISSN_(identifier)',
                              '/wiki/OCLC_(identifier)', '/wiki/RISM_(identifier)', '/wiki/PMID_(identifier)',
                              '/wiki/RERO_(identifier)', '/wiki/Public_domain'])

    def wikiracer(self, source="/wiki/Calvin_Li", goal="/wiki/Wikipedia"):
        def get_possible_backlinks(goal):
            """Collect the goal page's links (with common links removed)."""
            html = self.internet.get_page(goal)
            return remove_common_links(Parser.get_links_in_page(html))

        def costFn(node, goal):
            """Dijkstra's cost: reward nodes that appear in the goal page's links,
            as well as nodes whose length is within 5 characters of the goal's."""
            return 2 - int(node in self.possible_backlinks) - int(pow(len(goal) - len(node), 2) <= 25)

        def remove_common_links(links):
            """Remove common URLs from the given list to reduce the algorithm's internet requests."""
            links_set = set(links)
            links_set = links_set.difference(self.blacklist)
            return list(links_set)

        # Set up Dijkstra's algorithm (removing common Wikipedia links on every internet request)
        priority_queue = PriorityQueue()
        visited_pages = list()
        priority_queue.put((0, [source]))
        target_found = False
        path = [source]
        self.possible_backlinks = get_possible_backlinks(goal)
        while not priority_queue.empty():
            next_element = priority_queue.get()
            sub_path = next_element[1]
            sub_destination = sub_path[-1]
            if sub_destination not in visited_pages:
                visited_pages.append(sub_destination)
                html = self.internet.get_page(sub_destination)
                links = Parser.get_links_in_page(html)
                links = remove_common_links(links)
                if goal in links:
                    path = sub_path
                    target_found = True
                    break
                for link in links:
                    if link not in visited_pages:
                        new_path = sub_path + [link]
                        priority_queue.put((costFn(new_path[-1], goal) + next_element[0], new_path))
        if not target_found:
            return None  # If no path exists, return None
        path.append(goal)
        return path


# KARMA
class FindInPageProblem:
    def __init__(self):
        self.internet = Internet()

    # This Karma problem is a little different. Here we give you a source page and ask you to make up some
    # heuristics that will allow you to efficiently find a page containing all of the words in `query`.
    # Again, optimize for the fewest number of internet downloads, not for the shortest path.
    def find_in_page(self, source="/wiki/Calvin_Li", query=["ham", "cheese"]):
        raise NotImplementedError("Karma method find_in_page")

        path = [source]
        # Find a path to a page that contains ALL of the words in query anywhere within the page.
        # path[-1] should be the page that fulfills the query.
        # YOUR CODE HERE
        return path  # If no path exists, return None
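# Minimal manual smoke test (a sketch, not part of the graded skeleton). It assumes
# that Internet() in py_wikiracer.internet can actually download the pages below; if
# the course supplies a mocked or rate-limited Internet, the output will differ.
if __name__ == "__main__":
    racer = WikiracerProblem()
    result = racer.wikiracer(source="/wiki/Calvin_Li", goal="/wiki/Wikipedia")
    print("wikiracer path:", result)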