#Brian Wang 3/22/2020 #based on https://people.cs.uct.ac.za/~jkenwood/JasonBrownbridge.pdf import random, time, re, math random.seed(time.time()) #PROBABLY DONT NEED TO CHANGE THESE ngrams = {} alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] text = "9. W CTZV VYQXDVD MCWJ IVJJTHV, TYD VYQXDVD WM BVAA, FXK WM QXYMTWYJ MCV JVQKVM XF MCV PYWZVKJV! YX KVTAAS, WM DXVJ! SXP DXY'M NVAWVZV IV? BCS BXPAD SXP YXM NVAWVZV MCTM MCWJ RVKFVQMAS QKXIPAVYM JVQKVM MVGM QXYMTWYJ MCV NV TAA, VYD TAA, HKTYDVJM JVQKVM XF TAA MCV QXJIXJ? YXB W FVVA DWJKVJRVQMVD! CTZV SXP DWJQXZVKVD SXPK XBY NVMMVK PAMWITMV MKPMC XF VZVKSMCWYH? W DWDY'M MCWYL JX. JX BCS TKV SXP HVMMWYH TAA PRRWMS TM IV? CXYVJMAS. YX XYV CTJ TYS ITYYVKJ MCVJV DTSJ. ...BCTM'J MCTM? SXP BTYM IV MX MVAA SXP MCV JVQKVM? YXM TFMVK MCWJ LWYD XF DWJKVJRVQM! HXXDYVJJ HKTQWXPJ IV. NTQL BCVY W BTJ T SXPMC W BTJ YXM JX QTAAXPJ. BCVY JXIVXYV BVAA KVJRVQMVD TYD WIRXKMTYM MXAD IV MCTM MCVS CTD JXIVMCWYH BXKMC MVAAWYH IV, W OPJM AWJMVYVD! W DWDY'M DXPNM MCVI! JX KPDV, CXYVJMAS. OPJM PYTQQVRMTNAV." # text = re.sub(r'\s+', '', text) #remove whitespace #SETTINGS ngrams_file = 'ngrams1.tsv' #file to read ngrams from num_individuals = 1000 #population space max_rounds = 500 #maximum number of rounds mutation_chance = 0.2 #chance of mutation (1.0 is 100%) mutation_tries = 1 #number of attempts at mutating convergence_threshold = 60 #if no improvement for this number, assume convergence gram_length = 3 #length of ngram to use timeout = num_individuals * 100 #will probably never trigger (if it does, mutation is probably too low). the number of times to try to create unique keys when making a new generation preserve_top_percent = 0.15 #copy over top this percent of population crossover_chance = 0.65 #chance a crossover will occur crossover_points = 5 #number of points that will crossover debug = True #print debug statements? #GLOBAL VARIABLES (not all of these need to be global but whatever) set_individuals = {} #set of keys and corresponding fitness keys_in_order = [] #keys to set_individuals, sorted from best to worst def main(): start_time = time.time() read_ngrams() best_fitness = 0.0 #higher is better best_key = () stuck_count = 0 while(len(set_individuals.keys()) < num_individuals): #make num_individuals unique keys try_key = random_key() set_individuals[try_key] = evaluate_fitness(try_key) #intitialize keys if set_individuals[try_key] > best_fitness: #update best fitness best_fitness = set_individuals[try_key] best_key = try_key keys_in_order = sorted(list(set_individuals.keys()), key = lambda i:set_individuals[i], reverse = True) for i in range(0, max_rounds): if i % 5 == 0: #print fitness values every 5 generations (debug) if debug: print(best_fitness) new_best_key = new_generation() #changes the set_individuals dictionary, returns the best key from prev generation if best_fitness >= set_individuals[new_best_key]: stuck_count += 1 #generations without improvement if stuck_count > convergence_threshold: print("converged") break else: stuck_count = 0 best_key = new_best_key best_fitness = set_individuals[new_best_key] print("Results:") print("Best key was: " + str(best_key) + " with a fitness of " + str(best_fitness)) print("The decoded message:") print(decode(best_key, text)) print('Runtime was ', time.time() - start_time, ' seconds') def new_generation(): global set_individuals new_gen = {} #can't overwrite yet, downside of using global variable, change if performance improvements are needed best_fitness = 0.0 best_key = () num_tries = 0 old_keys = list(set_individuals.keys()) #copy over top few percent old_keys.sort(key = lambda i: set_individuals[i], reverse = True) for i in range(0, int(num_individuals * preserve_top_percent)): new_gen[old_keys[i]] = set_individuals[old_keys[i]] while(len(new_gen.keys()) < num_individuals): #make num_individuals unique keys num_tries += 1 if num_tries > timeout: #if too many breeding attempts have failed while(len(new_gen.keys()) < num_individuals): #make num_individuals unique keys try_key = random_key() new_gen[try_key] = evaluate_fitness(try_key) if new_gen[try_key] > best_fitness: #update best fitness best_fitness = new_gen[try_key] best_key = try_key break else: #select two parents, create up to two children parent1 = choose_parent(set_individuals) parent2 = choose_parent(set_individuals) try_key = crossover(parent1, parent2) #mutate for i in range(0, mutation_tries): r = random.random() if r < mutation_chance: try_key = mutate(try_key) new_gen[try_key] = evaluate_fitness(try_key) #add to new generation if new_gen[try_key] > best_fitness: #update best fitness best_fitness = new_gen[try_key] best_key = try_key if(len(new_gen.keys()) >= num_individuals): break try_key = crossover(parent2, parent1) #same but swap parents #mutate for i in range(0, mutation_tries): r = random.random() if r < mutation_chance: try_key = mutate(try_key) new_gen[try_key] = evaluate_fitness(try_key) #add to new generation if new_gen[try_key] > best_fitness: #update best fitness best_fitness = new_gen[try_key] best_key = try_key set_individuals = new_gen #overwrite old generation keys_in_order = sorted(list(set_individuals.keys()), key = lambda i:set_individuals[i], reverse = True) #update order return best_key def crossover(parent1, parent2): if random.random() > crossover_chance: return parent1 new_key = [None] * 26 #so it isnt empty p1_influence = (26 - crossover_points) #random.randint(0, 26) #the amount to take from the first parent indices = random.sample(range(0, 26), p1_influence) taken_letters = set([]) for index in indices: #takes letters from selected (random) indices and puts them in the same spot in the child new_key[index] = parent1[index] taken_letters.add(parent1[index]) for i in range(0, 26): #fill in with parent 2 when possible if new_key[i] == None: if not parent2[i] in taken_letters: new_key[i] = parent2[i] taken_letters.add(parent2[i]) leftover_letters = list(set(alphabet).difference(taken_letters)) #what letters still need to be filled? random.shuffle(leftover_letters) count = 0 for i in range(0, 26): if new_key[i] == None: new_key[i] = leftover_letters[count] count += 1 return tuple(new_key) def choose_parent(pop): #picks a parent from population, weighted based on the fitness ordered_keys = list(pop.keys()) total = 0.0 #total value of fitness for key in ordered_keys: total += pop[key] r = random.uniform(0, total) for key in ordered_keys: r = r - pop[key] #keep subtracting fitnesses from random fitness until value is less than or equal to 0, then return if r <= 0: return key print("choose parent random failed... resorting to full random") return random.sample(pop.keys(), 1) def read_ngrams(): #reads all grams and frequency, then puts it into a dictionary. relies on a line telling it when grams change number (ie a header saying 3-gram/4-gram) f = open(ngrams_file, 'r') lines = f.readlines() activated = False for line in lines: if not activated: if (str(gram_length) + '-gram') in line: activated = True continue else: if (str(gram_length + 1) + '-gram') in line: break words = re.split(r'\t+', line) #split on tabs (can be multiple) ngrams[words[0]] = math.log(int(words[1]), 2) return def random_key(): #returns a shuffled alphabet in a tuple to_ret = alphabet[:] random.shuffle(to_ret) return tuple(to_ret) def encode(key, text): #returns string, could probably be folded into decoded, since the only difference is key, alphabet vs alphabet, key encoded = '' key_dict = key_to_dict(alphabet, key) for i in range(0, len(text)): if text[i:i + 1] in key: encoded = encoded + key_dict[text[i:i + 1]] else: encoded = encoded + text[i:i + 1] return encoded def decode(key, encoded): #returns string decoded = '' key_dict = key_to_dict(key, alphabet) for i in range(0, len(encoded)): if encoded[i:i + 1] in key: decoded = decoded + key_dict[encoded[i:i + 1]] else: decoded = decoded + encoded[i:i + 1] return decoded #returns string def key_to_dict(key, master): #returns a dictionary for easy lookup during encoding and decoding. key is original, master is what the key maps to. dict = {' ': ' '} #space is always space for i in range(0, len(key)): dict[key[i]] = master[i] return dict def mutate(key): #randomly swaps two letters in a key rand1 = random.randint(0, 25) rand2 = random.randint(0, 25) key_list = list(key) temp = key_list[rand1] key_list[rand1] = key_list[rand2] key_list[rand2] = temp return tuple(key_list) def evaluate_fitness(key): #evaluates the fitness of a key based on ngrams after using the key to decode the text log_freq_sum = 0.0 decoded = decode(key, text) decoded_words = re.split(r' +', decoded) for word in decoded_words: if len(word) < gram_length: continue #1 letter words could be super helpful but the research paper say only 3 ngrams... grams = get_grams(word) for gram in grams: if gram in ngrams.keys(): log_freq_sum += ngrams[gram] return log_freq_sum def get_grams(word): #generates grams based on the variable gram_length grams = [] for i in range(0, len(word) - (gram_length - 1)): grams.append(word[i:i + gram_length]) return grams if __name__ == '__main__': main()