#Brian Wang 3/22/2020 #based on https://people.cs.uct.ac.za/~jkenwood/JasonBrownbridge.pdf import random, time, re, math random.seed(time.time()) #PROBABLY DONT NEED TO CHANGE THESE ngrams = {} alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] default_texts = ["PF HACYHTTRQ VF N PBYYRPGVBA BS SERR YRNEAVAT NPGVIVGVRF GUNG GRNPU PBZCHGRE FPVRAPR GUEBHTU RATNTVAT TNZRF NAQ CHMMYRF GUNG HFR PNEQF, FGEVAT, PENLBAF NAQ YBGF BS EHAAVAT NEBHAQ. JR BEVTVANYYL QRIRYBCRQ GUVF FB GUNG LBHAT FGHQRAGF PBHYQ QVIR URNQ-SVEFG VAGB PBZCHGRE FPVRAPR, RKCREVRAPVAT GUR XVAQF BS DHRFGVBAF NAQ PUNYYRATRF GUNG PBZCHGRE FPVRAGVFGF RKCREVRAPR, OHG JVGUBHG UNIVAT GB YRNEA CEBTENZZVAT SVEFG. GUR PBYYRPGVBA JNF BEVTVANYYL VAGRAQRQ NF N ERFBHEPR SBE BHGERNPU NAQ RKGRAFVBA, OHG JVGU GUR NQBCGVBA BS PBZCHGVAT NAQ PBZCHGNGVBANY GUVAXVAT VAGB ZNAL PYNFFEBBZF NEBHAQ GUR JBEYQ, VG VF ABJ JVQRYL HFRQ SBE GRNPUVAT. GUR ZNGREVNY UNF ORRA HFRQ VA ZNAL PBAGRKGF BHGFVQR GUR PYNFFEBBZ NF JRYY, VAPYHQVAT FPVRAPR FUBJF, GNYXF SBE FRAVBE PVGVMRAF, NAQ FCRPVNY RIRAGF. GUNAXF GB TRAREBHF FCBAFBEFUVCF JR UNIR ORRA NOYR GB PERNGR NFFBPVNGRQ ERFBHEPRF FHPU NF GUR IVQRBF, JUVPU NER VAGRAQRQ GB URYC GRNPUREF FRR UBJ GUR NPGVIVGVRF JBEX (CYRNFR QBA'G FUBJ GURZ GB LBHE PYNFFRF - YRG GURZ RKCREVRAPR GUR NPGVIVGVRF GURZFRYIRF!). NYY BS GUR NPGVIVGVRF GUNG JR CEBIVQR NER BCRA FBHEPR - GURL NER ERYRNFRQ HAQRE N PERNGVIR PBZZBAF NGGEVOHGVBA-FUNERNYVXR YVPRAPR, FB LBH PNA PBCL, FUNER NAQ ZBQVSL GUR ZNGREVNY. SBE NA RKCYNANGVBA BA GUR PBAARPGVBAF ORGJRRA PF HACYHTTRQ NAQ PBZCHGNGVBANY GUVAXVAT FXVYYF, FRR BHE PBZCHGNGVBANY GUVAXVAT NAQ PF HACYHTTRQ CNTR. GB IVRJ GUR GRNZ BS PBAGEVOHGBEF JUB JBEX BA GUVF CEBWRPG, FRR BHE CRBCYR CNTR. SBE QRGNVYF BA UBJ GB PBAGNPG HF, FRR BHE PBAGNPG HF CNTR. SBE ZBER VASBEZNGVBA NOBHG GUR CEVAPVCYRF ORUVAQ PF HACYHTTRQ, FRR BHE CEVAPVCYRF CNTR.", "LTQCXT LRJJ HJRDECD, EZT CDJP SXTFRYTDE EC ZNKT LTTD RASTNHZTY VNF NDYXTV WCZDFCD. ZT VNF NHUBREETY LP N FRDGJT KCET VZTD N LXNKT FTDNECX QXCA ONDFNF XTQBFTY EC PRTJY QXCA SXTFFBXT EC HCDKRHE EZT SXTFRYTDE. ZNY WCZDFCD LTTD HCDKRHETY, EZT FSTNOTX CQ EZT ZCBFT VCBJY ZNKT LTHCAT SXTFRYTDE FRDHT WCZDFCD ZNY DC KRHTSXTFRYTDE. RDHXTYRLJP, RE VNF EZRF FNAT FSTNOTX VZC JTY EZT RASTNHZATDE RD EZT ZCBFT CQ XTSXTFTDENERKTF. EZBF, ZNY EZT FTDNET HCDKRHETY EZT SXTFRYTDE, EZRF VCBJY ZNKT NACBDETY EC N SCJRERHNJ HCBS.", "ZRTGO Y JPEYPGZA, RP'J IKPGO HIJJRMWG PI RSHEITG PUG JPEYPGZA MA SYDROZ EYOBIS XUYOZGJ, PGJPROZ PUG EGJLWP IK PUIJG XUYOZGJ, YOB DGGHROZ IOWA PUG MGPPGE ILPXISGJ. PURJ RJ XYWWGB URWW XWRSMROZ. PURJ RJ EGWYPRTGWA JRSHWG PI XIBG, MLP BIGJO'P CIED RO GTGEA JRPLYPRIO - RP XYO IKPGO ZGP XYLZUP RO Y WIXYW SYFRSLS, Y JPEYPGZA PUYP RJ OIP RBGYW MLP KEIS CURXU YOA JROZWG XUYOZG RJ OIP YO RSHEITGSGOP IO RPJ ICO. ZGOGPRX YWZIERPUSJ YEG Y HICGEKLW PIIW KIE RSHEITROZ IO PUG RBGY IK URWW XWRSMROZ PI IHPRSRVG Y JIWLPRIO RO JRPLYPRIOJ CUGEG YWW IK PUG KIWWICROZ YEG PELG: Y JPEYPGZA XYO MG HEGXRJGWA QLYOPRKRGB MA Y JHGXRKRX JGP IK TYERYMWGJ ZRTGO XGEPYRO OLSGERX TYWLGJ. PUG ILPXISG IK PUG JPEYPGZA XYO YWJI MG HEGXRJGWA QLYOPRKRGB. PUGEG YEG ROPGEYXPRIOJ MGPCGGO PUG TYERYMWGJ PUYP SYDG JRSHWG URWW XWRSMROZ ROGKKRXRGOP IE LOWRDGWA PI JLXXGGB.", "CWQ KHTTQKC TFAZJAB HS FGG HS CWQ ECFT YFTE PHRJQE TQGQFEQM EH SFT JE CWQ QPXJTQ ECTJZQE VFKZ, F AQY WHXQ, CWQ GFEC OQMJ, TQCLTA HS CWQ OQMJ, THBLQ HAQ, EHGH, TQRQABQ HS CWQ EJCW, CWQ SHTKQ FYFZQAE, TJEQ HS CWQ EZNYFGZQT, CWQ XWFACHP PQAFKQ, FCCFKZ HS CWQ KGHAQE. CWQ KHTTQKC TFAZJAB HS CWQ CWTQQ JAMJFAF OHAQE PHRJQE JE CWQ GFEC KTLEFMQ, TFJMQTE HS CWQ GHEC FTZ, CQPXGQ HS MHHP. CWQTQ JE AH SHLTCW JAMJFAF OHAQE PHRJQ, FAM FANHAQ YWH CQGGE NHL HCWQTYJEQ JE F GJFT. OLEC CQGG CWQP CH CLTA FTHLAM FAM YFGZ FYFN VQSHTQ CWQN KFA VGQEE NHL YJCW FAN HCWQT JAKHTTQKC HXJAJHAE. FANYFN, EH EFNQCW PN STJQAM VJGG, YWH WFXXQAQM CH VQ HAGJAQ YWJGQ J YFE PFZJAB CWJE FEEJBAPQAC, YWQA J FEZQM WJP 'YWFC YHLGM VQ F BHHM EQKTQC PQEEFBQ SHT PN ECLMQACE CH MQKHMQ?' XGQFEQ CFZQ LX FAN KHPXGFJACE YJCW WJP.", "XMTP CGPQR BWEKNJB GQ OTGRB EL BEQX BWEKNJB, G RFGLI. GR GQ BEQX ABSETQB RFGQ QBLRBLSB TQBQ EJJ RBL KMQR SMKKML VMPYQ GL BLDJGQF: 'G FEUB RM AB E DMMY QRTYBLR GL RFER SJEQQ GL RFB PMMK MC RFER RBESFBP.'", "XTV B CHDQCL BHF GCVIVDGDHWPN ABVF ZABPPLHWL, ZTHGDFLV MBJDHW B PTHW BHF XCPPN VLBFBYPL GLVDLG TX UTVFG HLRLV CGDHW B GDHWPL LEBMIPL TX TCV ULPP-PTRLF LHWPDGA WPNIA UADZA TZZCVG GLZTHF IPBZL DH TRLVBPP XVLQCLHZN. DX D BM WLHCDHL, D UDPP GBN MBHN, MBHN GLZTHFG ABRL IBGGLF UADPL D ABRL YLLH ALVL ITHFLVDHW MBJDHW GCZA B UTVJ. FDGZTRLVDHW NTC ZVBZJLF MN YVBDHZADPF, ALVL, DH B GMBPPLV HCMYLV TX GLZTHFG UTCPF WDRL ML HT GCVIVDGL.", "NU XTZEIMYTNEVZ INUHU YM, ZML SPYVI NXILNFFZ XNFF IVPU N API VNTD. NU PI ILTWU MLI, P XNW YM N FMWY JNZ JPIVMLI LUPWY NWZ MC IVNI YFZEV IVNI ITNDPIPMWNFFZ CMFFMJU 'D' NI NFF. PUW'I IVNI ULTETPUPWY? P CMLWD IVPU ULTETPUPWY, NWZJNZ! NW NLIVMT JVM NFUM CMLWD IVPU ULTETPUPWY, FMWY NYM, NXILNFFZ FMUI SNWZ SMWIVU JTPIPWY N AMMH - N CLFF CPXIPMWNF UIMTZ - JPIVMLI IVNI YFZEV NI NFF. NSNRPWY, TPYVI?", "RHNJJCBXVCXJYQJNEJNDYDCELTHNBFTVTHNJJREFCLBEECANOTREFDNEBXTHJTNXTXECPCBAPZNSSPXTNYTXFVZCNXTSXRKRJTGTYECJRKTRDFSNHTRANGRDTNKNFEFZTTECQSNSTXCDVZRHZFEXNRGZEJRDTFEXRNDGJTFFUBNXTFSTDENGCDFZTINGCDFNDYCEZTXQRGBXTFRDFETNYCQXTANRDRDGQRITYRDEZTRXSJNHTFACKTQXTTJPNLCBECDCXRDEZTFBXQNHTLBEVREZCBEEZTSCVTXCQXRFRDGNLCKTCXFRDORDGLTJCVREKTXPABHZJROTFZNYCVFCDJPZNXYVREZJBARDCBFTYGTFNDYPCBVRJJEZTDZNKTNSXTEEPHCXXTHEDCERCDCQAPHCBDEXPNDYHCBDEXPATDNJNFNQTVPTNXFNGCRFZCBJYZNKTFNRYAPBDRKTXFTLBEDCVAPARDYZNFLTTDCSTDTYECZRGZTXKRTVFCQEZRDGF", "W CTZV VYQXDVD MCWJ IVJJTHV, TYD VYQXDVD WM BVAA, FXK WM QXYMTWYJ MCV JVQKVM XF MCV PYWZVKJV! YX KVTAAS, WM DXVJ! SXP DXY'M NVAWVZV IV? BCS BXPAD SXP YXM NVAWVZV MCTM MCWJ RVKFVQMAS QKXIPAVYM JVQKVM MVGM QXYMTWYJ MCV NV TAA, VYD TAA, HKTYDVJM JVQKVM XF TAA MCV QXJIXJ? YXB W FVVA DWJKVJRVQMVD! CTZV SXP DWJQXZVKVD SXPK XBY NVMMVK PAMWITMV MKPMC XF VZVKSMCWYH? W DWDY'M MCWYL JX. JX BCS TKV SXP HVMMWYH TAA PRRWMS TM IV? CXYVJMAS. YX XYV CTJ TYS ITYYVKJ MCVJV DTSJ. ...BCTM'J MCTM? SXP BTYM IV MX MVAA SXP MCV JVQKVM? YXM TFMVK MCWJ LWYD XF DWJKVJRVQM! HXXDYVJJ HKTQWXPJ IV. NTQL BCVY W BTJ T SXPMC W BTJ YXM JX QTAAXPJ. BCVY JXIVXYV BVAA KVJRVQMVD TYD WIRXKMTYM MXAD IV MCTM MCVS CTD JXIVMCWYH BXKMC MVAAWYH IV, W OPJM AWJMVYVD! W DWDY'M DXPNM MCVI! JX KPDV, CXYVJMAS. OPJM PYTQQVRMTNAV.", "CIJUQCIZQFIZALSBPUFCRLIPBFIEIHYLXQIHYLOLILEIBFQXBZJSXDJXXDLSOLILELIPLNXASUBICJAXDJUMSBFYLISCFNDXDLSOLILXDLAJZXELBEALSBFQLKELNXXBRLHUYBAYLQHUJUSXDHUWZXIJUWLBICSZXLIHBFZRLNJFZLXDLSVFZXQHQUXDBAQOHXDZFNDUBUZLUZL", "ZFNNANWJWYBZLKEHBZTNSKDDGJWYLWSBFNSSJWYFNKBGLKOCNKSJEBDWZFNGKLJKJNQFJPFJBXHBZTNRDKNZFNPDEJWYDRPDEGCNZNWJYFZZFLZTCNBBNBZFNNLKZFSLKONWBLCCKJANKBPHGBZFNGNLOBLWSRDCSBZFNRJWLCBFDKNJWLWSWDTDSUWDTDSUOWDQBQFLZBYDJWYZDFLGGNWZDLWUTDSUTNBJSNBZFNRDKCDKWKLYBDRYKDQJWYDCSJZFJWODRSNLWEDKJLKZUJNANWZFJWODRDCSSNLWEDKJLKZUZFNRLZFNKQNWNANKRDHWSJZFJWODRSNLWEDKJLKZU", "FBYSNRBIYVNIJRJZSRSRJZNQCQNIJXCGTNEBSJNYKCUXCGTNONNIRNBUMZSIVSIJZNYBUAXCGURENBJRCBASIVJZUCGVZJZNKFCCUBIYOGUSNYSIXCGUOCINRJZNUNRBIBMZNJZBJXCGMBIJSVICUNJBASIVXCGUOUNBJZRJNBFSIVXCGUQSIYBIYBFFJZBJEBRUNBFSRFNKJONZSIYYCIJKSVZJSJSJRMCQSIVKCUXCGUGIISIVBJXCGSJRCIFXJZSRQCQNIJYCIJMBUNEZBJMCQNRBKJNUXCGUKNTNUYUNBQMBIJXCGRNNSJVNJJSIVMFCRNUPGRJRGUUNIYNUMBGRNXCGKNNFJZNKNNFSIVJBASIVCTNUSJRKSUNSJRKUNNYCQSJRKFCCYSIVCLNISJRJZNLUNBMZNUSIJZNLGFLSJBIYXCGUOFSIYYNTCJSCIJZNUNRRCQNJZSIVOUNBASIVBJJZNOUSMACKNTNUXEBFFSJRZCFYSIVBFFJZBJXCGAICERCJNFFQNYCXCGEBIIBVCEZNUNSJRMCTNUNYSIBFFJZNMCFCUNYFSVZJREZNUNJZNUGIBEBXRBUNUGIISIVJZNISVZJSQLCRRSOFNMCQNRJUGNSJRJBASIVCTNUXCGCZJZSRSRJZNVUNBJNRJRZCEENFSVZJSJGLENECIJMCQNYCEIBIYJZNRGIMBIJRJCLGRICEEBJMZSIVSJMCQNJUGNSJRJBASIVCTNUXCGCZJZSRSRJZNVUNBJNRJRZCE", "KDBULISCWSCTLISJLXTNJULIUJBTALISNYULDEIDJCKTCGLISKLAUKLIMBFUGTCGLIMCGUJUGLIJUTLUCSCWLIULIJUUGLIDMWILKDEBTLLIUQLIULIMWTALIDMWILIUTLJSNTAAXSLQTKDCAXLIULISJLUUCLIDMKTCGLISKLAUKTCGLIDJCKLIJDMWILIUMCGUJCUTLIDEISKLISWILITLLIULISJLXXUTJDAGLIMWLIDMWILDELITLBDJCSCW", "ANUYJKHNFL JLNBL, NBENJK YNK KHNKIONS: 'JNYNHYJ - SNJKONS, INHKONS, JNHDSNJ ONBRNJ!'", "MBLBJJBV, HUBBSKDBM SBIMBJK BO BUS JBIBIB KIDBBUK BO LBJBIIB, BXOBJS, IBUBLHB KBV CBCIBJA-IBUBJ LBJLBA HUBKKBL CBSK.", "STHTGHTCTITGHTUSKSTHHTOBTGUKTETIWSTBITSQTWKTESTKWSSTYTGUTGETHHTHHTCTETETWHSTESTUSBTUSKHTGSTHWBTGTWTWTGUTGHTIMOTYTGTHITUTKHTGHTIMTBBTWTKTFTHJFTUTIYTRTCTWGTW"] #0-3 easy, 4-7 medium, 8-11 hard, 12-15 "impossible" text = default_texts[0] # text = re.sub(r'\s+', '', text) #remove whitespace #SETTINGS ngrams_file = 'ngrams1.tsv' #file to read ngrams from num_individuals = 1000 #population space max_rounds = 500 #maximum number of rounds mutation_chance = 0.3 #chance of mutation (1.0 is 100%) mutation_tries = 1 #number of attempts at mutating convergence_threshold = 60 #if no improvement for this number, assume convergence gram_length = 3 #length of ngram to use timeout = num_individuals * 100 #will probably never trigger (if it does, mutation is probably too low). the number of times to try to create unique keys when making a new generation preserve_top_percent = 0.15 #copy over top this percent of population crossover_chance = 0.65 #chance a crossover will occur crossover_points = 5 #number of points that will crossover tournament_size = 25 #how many individuals to randomly select, before selecting the best #tournament_variation = int(0.2 * tournament_size) #randomly choose between top x of tournament tournament_variation_percentage = 0.2 #randomly choose between top 20% of tournament #GLOBAL VARIABLES (not all of these need to be global but whatever) set_individuals = {} #set of keys and corresponding fitness keys_in_order = [] #keys to set_individuals, sorted from best to worst def main(): start_time = time.time() read_ngrams() best_fitness = 0.0 #higher is better best_key = () stuck_count = 0 while(len(set_individuals.keys()) < num_individuals): #make num_individuals unique keys try_key = random_key() set_individuals[try_key] = evaluate_fitness(try_key) #intitialize keys if set_individuals[try_key] > best_fitness: #update best fitness best_fitness = set_individuals[try_key] best_key = try_key keys_in_order = sorted(list(set_individuals.keys()), key = lambda i:set_individuals[i], reverse = True) #print(best_fitness) #print(evaluate_fitness(alphabet)) for i in range(0, max_rounds): if i % 5 == 0: #print fitness values every 5 generations (debug) print(best_fitness) new_best_key = new_generation() #changes the set_individuals dictionary, returns the best key from prev generation if best_fitness >= set_individuals[new_best_key]: stuck_count += 1 if stuck_count > convergence_threshold: #if no improvement for 5 generations, assume convergence print("converged?") break else: stuck_count = 0 best_key = new_best_key best_fitness = set_individuals[new_best_key] print("Results:") print("Best key was: " + str(best_key) + " with a fitness of " + str(best_fitness)) print("The decoded message:") print(decode(best_key, text)) print('Runtime was ', time.time() - start_time, ' seconds') def new_generation(): # parents1 = {} #make two equally sized parent pools randomly (there could be a "stacked pool", where all the fit parents end up in one pool, but its random so...) # parents2 = {} # # for key in set_individuals.keys(): # if len(parents1) > len(parents2): # parents2[key] = set_individuals[key] # else if len(parents2) > len(parents1): # parents1[key] = set_individuals[key] # else: # if random.randint(0, 1) == 1: # parents1[key] = set_individuals[key] # else: # parents2[key] = set_individuals[key] global set_individuals new_gen = {} #can't overwrite yet, downside of using global variable, change if performance improvements are needed best_fitness = 0.0 best_key = () num_tries = 0 old_keys = list(set_individuals.keys()) #copy over top few percent old_keys.sort(key = lambda i: set_individuals[i], reverse = True) for i in range(0, int(num_individuals * preserve_top_percent)): # try_key = old_keys[i] # #mutate # for i in range(0, mutation_tries): # r = random.random() # if r < mutation_chance: # try_key = mutate(try_key) # # new_gen[try_key] = evaluate_fitness(try_key) #add to new generation # # if new_gen[try_key] > best_fitness: #update best fitness # best_fitness = new_gen[try_key] # best_key = try_key new_gen[old_keys[i]] = set_individuals[old_keys[i]] while(len(new_gen.keys()) < num_individuals): #make num_individuals unique keys num_tries += 1 if num_tries > timeout: #if too many breeding attempts have failed while(len(new_gen.keys()) < num_individuals): #make num_individuals unique keys try_key = random_key() new_gen[try_key] = evaluate_fitness(try_key) if new_gen[try_key] > best_fitness: #update best fitness best_fitness = new_gen[try_key] best_key = try_key break else: #select two parents, crossover parent1 = choose_parent(set_individuals) parent2 = choose_parent(set_individuals) try_key = crossover(parent1, parent2) #mutate for i in range(0, mutation_tries): r = random.random() if r < mutation_chance: try_key = mutate(try_key) new_gen[try_key] = evaluate_fitness(try_key) #add to new generation if new_gen[try_key] > best_fitness: #update best fitness best_fitness = new_gen[try_key] best_key = try_key if(len(new_gen.keys()) >= num_individuals): break try_key = crossover(parent2, parent1) #same but swap parents #mutate for i in range(0, mutation_tries): r = random.random() if r < mutation_chance: try_key = mutate(try_key) new_gen[try_key] = evaluate_fitness(try_key) #add to new generation if new_gen[try_key] > best_fitness: #update best fitness best_fitness = new_gen[try_key] best_key = try_key set_individuals = new_gen #overwrite old generation keys_in_order = sorted(list(set_individuals.keys()), key = lambda i:set_individuals[i], reverse = True) #update order return best_key def crossover(parent1, parent2): if random.random() > crossover_chance: return parent1 new_key = [None] * 26 #so it isnt empty p1_influence = (26 - crossover_points) #random.randint(0, 26) #the amount to take from the first parent indices = random.sample(range(0, 26), p1_influence) taken_letters = set([]) for index in indices: #takes letters from selected (random) indices and puts them in the same spot in the child new_key[index] = parent1[index] taken_letters.add(parent1[index]) for i in range(0, 26): #fill in with parent 2 when possible if new_key[i] == None: if not parent2[i] in taken_letters: new_key[i] = parent2[i] taken_letters.add(parent2[i]) leftover_letters = list(set(alphabet).difference(taken_letters)) #what letters still need to be filled? random.shuffle(leftover_letters) count = 0 for i in range(0, 26): if new_key[i] == None: new_key[i] = leftover_letters[count] count += 1 return tuple(new_key) def choose_parent(pop): #picks a parent from population, weighted based on the fitness # ordered_keys = list(pop.keys()) # total = 0.0 #total value of fitness # for key in ordered_keys: # total += pop[key] # # r = random.uniform(0, total) # for key in ordered_keys: # r = r - pop[key] #keep subtracting fitnesses from random fitness until value is less than or equal to 0, then return # if r <= 0: # return key # # print("choose parent random failed... resorting to full random") # return random.sample(pop.keys(), 1) #return sorted(list(random.sample(pop.keys(), tournament_size)), key = lambda i: pop[i], reverse = True)[random.randint(0, tournament_variation - 1)] pool = sorted(list(random.sample(pop.keys(), tournament_size)), key = lambda i: pop[i], reverse = True) min = pop[pool[0]] * 0.8 min_index = 0 for i in range(0, len(pool)): if pop[pool[i]] > min: min_index = i return pool[random.randint(0, min_index)] def read_ngrams(): #reads all grams and frequency, then puts it into a dictionary. relies on a line telling it when grams change number (ie a header saying 3-gram/4-gram) f = open(ngrams_file, 'r') lines = f.readlines() activated = False for line in lines: if not activated: if (str(gram_length) + '-gram') in line: activated = True continue else: if (str(gram_length + 1) + '-gram') in line: break words = re.split(r'\t+', line) #split on tabs (can be multiple) ngrams[words[0]] = math.log(int(words[1]), 2) return def random_key(): #returns a shuffled alphabet in a tuple to_ret = alphabet[:] random.shuffle(to_ret) return tuple(to_ret) def encode(key, text): #returns string, could probably be folded into decoded, since the only difference is key, alphabet vs alphabet, key encoded = '' key_dict = key_to_dict(alphabet, key) for i in range(0, len(text)): if text[i:i + 1] in key: encoded = encoded + key_dict[text[i:i + 1]] else: encoded = encoded + text[i:i + 1] return encoded def decode(key, encoded): #returns string decoded = '' key_dict = key_to_dict(key, alphabet) for i in range(0, len(encoded)): if encoded[i:i + 1] in key: decoded = decoded + key_dict[encoded[i:i + 1]] else: decoded = decoded + encoded[i:i + 1] return decoded #returns string def key_to_dict(key, master): #returns a dictionary for easy lookup during encoding and decoding. key is original, master is what the key maps to. dict = {' ': ' '} #space is always space for i in range(0, len(key)): dict[key[i]] = master[i] return dict def mutate(key): #randomly swaps two letters in a key rand1 = random.randint(0, 25) rand2 = random.randint(0, 25) key_list = list(key) temp = key_list[rand1] key_list[rand1] = key_list[rand2] key_list[rand2] = temp return tuple(key_list) def evaluate_fitness(key): #evaluates the fitness of a key based on ngrams after using the key to decode the text log_freq_sum = 0.0 decoded = decode(key, text) decoded_words = re.split(r' +', decoded) for word in decoded_words: if len(word) < gram_length: continue #1 letter words could be super helpful but the research paper say only 3 ngrams... grams = get_grams(word) for gram in grams: if gram in ngrams.keys(): log_freq_sum += ngrams[gram] return log_freq_sum def get_grams(word): #generates grams based on the variable gram_length grams = [] for i in range(0, len(word) - (gram_length - 1)): grams.append(word[i:i + gram_length]) return grams if __name__ == '__main__': main()