GeneticCipher / Genetic Cipher / bad_at_version_control / Genetic_Cipher.py
Genetic_Cipher.py
Raw
#Brian Wang 3/22/2020
#based on https://people.cs.uct.ac.za/~jkenwood/JasonBrownbridge.pdf

import random, time, re, math

random.seed(time.time())

#PROBABLY DONT NEED TO CHANGE THESE
ngrams = {}
alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
default_texts = ["PF HACYHTTRQ VF N PBYYRPGVBA BS SERR YRNEAVAT NPGVIVGVRF GUNG GRNPU PBZCHGRE FPVRAPR GUEBHTU RATNTVAT TNZRF NAQ CHMMYRF GUNG HFR PNEQF, FGEVAT, PENLBAF NAQ YBGF BS EHAAVAT NEBHAQ. JR BEVTVANYYL QRIRYBCRQ GUVF FB GUNG LBHAT FGHQRAGF PBHYQ QVIR URNQ-SVEFG VAGB PBZCHGRE FPVRAPR, RKCREVRAPVAT GUR XVAQF BS DHRFGVBAF NAQ PUNYYRATRF GUNG PBZCHGRE FPVRAGVFGF RKCREVRAPR, OHG JVGUBHG UNIVAT GB YRNEA CEBTENZZVAT SVEFG.  GUR PBYYRPGVBA JNF BEVTVANYYL VAGRAQRQ NF N ERFBHEPR SBE BHGERNPU NAQ RKGRAFVBA, OHG JVGU GUR NQBCGVBA BS PBZCHGVAT NAQ PBZCHGNGVBANY GUVAXVAT VAGB ZNAL PYNFFEBBZF NEBHAQ GUR JBEYQ, VG VF ABJ JVQRYL HFRQ SBE GRNPUVAT. GUR ZNGREVNY UNF ORRA HFRQ VA ZNAL PBAGRKGF BHGFVQR GUR PYNFFEBBZ NF JRYY, VAPYHQVAT FPVRAPR FUBJF, GNYXF SBE FRAVBE PVGVMRAF, NAQ FCRPVNY RIRAGF.  GUNAXF GB TRAREBHF FCBAFBEFUVCF JR UNIR ORRA NOYR GB PERNGR NFFBPVNGRQ ERFBHEPRF FHPU NF GUR IVQRBF, JUVPU NER VAGRAQRQ GB URYC GRNPUREF FRR UBJ GUR NPGVIVGVRF JBEX (CYRNFR QBA'G FUBJ GURZ GB LBHE PYNFFRF - YRG GURZ RKCREVRAPR GUR NPGVIVGVRF GURZFRYIRF!). NYY BS GUR NPGVIVGVRF GUNG JR CEBIVQR NER BCRA FBHEPR - GURL NER ERYRNFRQ HAQRE N PERNGVIR PBZZBAF NGGEVOHGVBA-FUNERNYVXR YVPRAPR, FB LBH PNA PBCL, FUNER NAQ ZBQVSL GUR ZNGREVNY.  SBE NA RKCYNANGVBA BA GUR PBAARPGVBAF ORGJRRA PF HACYHTTRQ NAQ PBZCHGNGVBANY GUVAXVAT FXVYYF, FRR BHE PBZCHGNGVBANY GUVAXVAT NAQ PF HACYHTTRQ CNTR.  GB IVRJ GUR GRNZ BS PBAGEVOHGBEF JUB JBEX BA GUVF CEBWRPG, FRR BHE CRBCYR CNTR.  SBE QRGNVYF BA UBJ GB PBAGNPG HF, FRR BHE PBAGNPG HF CNTR.  SBE ZBER VASBEZNGVBA NOBHG GUR CEVAPVCYRF ORUVAQ PF HACYHTTRQ, FRR BHE CEVAPVCYRF CNTR.", "LTQCXT LRJJ HJRDECD, EZT CDJP SXTFRYTDE EC ZNKT LTTD RASTNHZTY VNF NDYXTV WCZDFCD. ZT VNF NHUBREETY LP N FRDGJT KCET VZTD N LXNKT FTDNECX QXCA ONDFNF XTQBFTY EC PRTJY QXCA SXTFFBXT EC HCDKRHE EZT SXTFRYTDE. ZNY WCZDFCD LTTD HCDKRHETY, EZT FSTNOTX CQ EZT ZCBFT VCBJY ZNKT LTHCAT SXTFRYTDE FRDHT WCZDFCD ZNY DC KRHTSXTFRYTDE. RDHXTYRLJP, RE VNF EZRF FNAT FSTNOTX VZC JTY EZT RASTNHZATDE RD EZT ZCBFT CQ XTSXTFTDENERKTF. EZBF, ZNY EZT FTDNET HCDKRHETY EZT SXTFRYTDE, EZRF VCBJY ZNKT NACBDETY EC N SCJRERHNJ HCBS.", "ZRTGO Y JPEYPGZA, RP'J IKPGO HIJJRMWG PI RSHEITG PUG JPEYPGZA MA SYDROZ EYOBIS XUYOZGJ, PGJPROZ PUG EGJLWP IK PUIJG XUYOZGJ, YOB DGGHROZ IOWA PUG MGPPGE ILPXISGJ.  PURJ RJ XYWWGB URWW XWRSMROZ.  PURJ RJ EGWYPRTGWA JRSHWG PI XIBG, MLP BIGJO'P CIED RO GTGEA JRPLYPRIO - RP XYO IKPGO ZGP XYLZUP RO Y WIXYW SYFRSLS, Y JPEYPGZA PUYP RJ OIP RBGYW MLP KEIS CURXU YOA JROZWG XUYOZG RJ OIP YO RSHEITGSGOP IO RPJ ICO. ZGOGPRX YWZIERPUSJ YEG Y HICGEKLW PIIW KIE RSHEITROZ IO PUG RBGY IK URWW XWRSMROZ PI IHPRSRVG Y JIWLPRIO RO JRPLYPRIOJ CUGEG YWW IK PUG KIWWICROZ YEG PELG: Y JPEYPGZA XYO MG HEGXRJGWA QLYOPRKRGB MA Y JHGXRKRX JGP IK TYERYMWGJ ZRTGO XGEPYRO OLSGERX TYWLGJ. PUG ILPXISG IK PUG JPEYPGZA XYO YWJI MG HEGXRJGWA QLYOPRKRGB. PUGEG YEG ROPGEYXPRIOJ MGPCGGO PUG TYERYMWGJ PUYP SYDG JRSHWG URWW XWRSMROZ ROGKKRXRGOP IE LOWRDGWA PI JLXXGGB.", "CWQ KHTTQKC TFAZJAB HS FGG HS CWQ ECFT YFTE PHRJQE TQGQFEQM EH SFT JE CWQ QPXJTQ ECTJZQE VFKZ, F AQY WHXQ, CWQ GFEC OQMJ, TQCLTA HS CWQ OQMJ, THBLQ HAQ, EHGH, TQRQABQ HS CWQ EJCW, CWQ SHTKQ FYFZQAE, TJEQ HS CWQ EZNYFGZQT, CWQ XWFACHP PQAFKQ, FCCFKZ HS CWQ KGHAQE.  CWQ KHTTQKC TFAZJAB HS CWQ CWTQQ JAMJFAF OHAQE PHRJQE JE CWQ GFEC KTLEFMQ, TFJMQTE HS CWQ GHEC FTZ, CQPXGQ HS MHHP.  CWQTQ JE AH SHLTCW JAMJFAF OHAQE PHRJQ, FAM FANHAQ YWH CQGGE NHL HCWQTYJEQ JE F GJFT.  OLEC CQGG CWQP CH CLTA FTHLAM FAM YFGZ FYFN VQSHTQ CWQN KFA VGQEE NHL YJCW FAN HCWQT JAKHTTQKC HXJAJHAE.  FANYFN, EH EFNQCW PN STJQAM VJGG, YWH WFXXQAQM CH VQ HAGJAQ YWJGQ J YFE PFZJAB CWJE FEEJBAPQAC, YWQA J FEZQM WJP 'YWFC YHLGM VQ F BHHM EQKTQC PQEEFBQ SHT PN ECLMQACE CH MQKHMQ?'  XGQFEQ CFZQ LX FAN KHPXGFJACE YJCW WJP.", "XMTP CGPQR BWEKNJB GQ OTGRB EL BEQX BWEKNJB, G RFGLI.  GR GQ BEQX ABSETQB RFGQ QBLRBLSB TQBQ EJJ RBL KMQR SMKKML VMPYQ GL BLDJGQF:  'G FEUB RM AB E DMMY QRTYBLR GL RFER SJEQQ GL RFB PMMK MC RFER RBESFBP.'", "XTV B CHDQCL BHF GCVIVDGDHWPN ABVF ZABPPLHWL, ZTHGDFLV MBJDHW B PTHW BHF XCPPN VLBFBYPL GLVDLG TX UTVFG HLRLV CGDHW B GDHWPL LEBMIPL TX TCV ULPP-PTRLF LHWPDGA WPNIA UADZA TZZCVG GLZTHF IPBZL DH TRLVBPP XVLQCLHZN.  DX D BM WLHCDHL, D UDPP GBN MBHN, MBHN GLZTHFG ABRL IBGGLF UADPL D ABRL YLLH ALVL ITHFLVDHW MBJDHW GCZA B UTVJ.  FDGZTRLVDHW NTC ZVBZJLF MN YVBDHZADPF, ALVL, DH B GMBPPLV HCMYLV TX GLZTHFG UTCPF WDRL ML HT GCVIVDGL.", "NU XTZEIMYTNEVZ INUHU YM, ZML SPYVI NXILNFFZ XNFF IVPU N API VNTD.  NU PI ILTWU MLI, P XNW YM N FMWY JNZ JPIVMLI LUPWY NWZ MC IVNI YFZEV IVNI ITNDPIPMWNFFZ CMFFMJU 'D' NI NFF.  PUW'I IVNI ULTETPUPWY?  P CMLWD IVPU ULTETPUPWY, NWZJNZ!  NW NLIVMT JVM NFUM CMLWD IVPU ULTETPUPWY, FMWY NYM, NXILNFFZ FMUI SNWZ SMWIVU JTPIPWY N AMMH - N CLFF CPXIPMWNF UIMTZ - JPIVMLI IVNI YFZEV NI NFF.  NSNRPWY, TPYVI?", "RHNJJCBXVCXJYQJNEJNDYDCELTHNBFTVTHNJJREFCLBEECANOTREFDNEBXTHJTNXTXECPCBAPZNSSPXTNYTXFVZCNXTSXRKRJTGTYECJRKTRDFSNHTRANGRDTNKNFEFZTTECQSNSTXCDVZRHZFEXNRGZEJRDTFEXRNDGJTFFUBNXTFSTDENGCDFZTINGCDFNDYCEZTXQRGBXTFRDFETNYCQXTANRDRDGQRITYRDEZTRXSJNHTFACKTQXTTJPNLCBECDCXRDEZTFBXQNHTLBEVREZCBEEZTSCVTXCQXRFRDGNLCKTCXFRDORDGLTJCVREKTXPABHZJROTFZNYCVFCDJPZNXYVREZJBARDCBFTYGTFNDYPCBVRJJEZTDZNKTNSXTEEPHCXXTHEDCERCDCQAPHCBDEXPNDYHCBDEXPATDNJNFNQTVPTNXFNGCRFZCBJYZNKTFNRYAPBDRKTXFTLBEDCVAPARDYZNFLTTDCSTDTYECZRGZTXKRTVFCQEZRDGF", "W CTZV VYQXDVD MCWJ IVJJTHV, TYD VYQXDVD WM BVAA, FXK WM QXYMTWYJ MCV JVQKVM XF MCV PYWZVKJV!  YX KVTAAS, WM DXVJ!  SXP DXY'M NVAWVZV IV?  BCS BXPAD SXP YXM NVAWVZV MCTM MCWJ RVKFVQMAS QKXIPAVYM JVQKVM MVGM QXYMTWYJ MCV NV TAA, VYD TAA, HKTYDVJM JVQKVM XF TAA MCV QXJIXJ?  YXB W FVVA DWJKVJRVQMVD!  CTZV SXP DWJQXZVKVD SXPK XBY NVMMVK PAMWITMV MKPMC XF VZVKSMCWYH?  W DWDY'M MCWYL JX.  JX BCS TKV SXP HVMMWYH TAA PRRWMS TM IV?  CXYVJMAS.  YX XYV CTJ TYS ITYYVKJ MCVJV DTSJ.  ...BCTM'J MCTM?  SXP BTYM IV MX MVAA SXP MCV JVQKVM?  YXM TFMVK MCWJ LWYD XF DWJKVJRVQM!  HXXDYVJJ HKTQWXPJ IV.  NTQL BCVY W BTJ T SXPMC W BTJ YXM JX QTAAXPJ.  BCVY JXIVXYV BVAA KVJRVQMVD TYD WIRXKMTYM MXAD IV MCTM MCVS CTD JXIVMCWYH BXKMC MVAAWYH IV, W OPJM AWJMVYVD!  W DWDY'M DXPNM MCVI!  JX KPDV, CXYVJMAS.  OPJM PYTQQVRMTNAV.", "CIJUQCIZQFIZALSBPUFCRLIPBFIEIHYLXQIHYLOLILEIBFQXBZJSXDJXXDLSOLILELIPLNXASUBICJAXDJUMSBFYLISCFNDXDLSOLILXDLAJZXELBEALSBFQLKELNXXBRLHUYBAYLQHUJUSXDHUWZXIJUWLBICSZXLIHBFZRLNJFZLXDLSVFZXQHQUXDBAQOHXDZFNDUBUZLUZL", "ZFNNANWJWYBZLKEHBZTNSKDDGJWYLWSBFNSSJWYFNKBGLKOCNKSJEBDWZFNGKLJKJNQFJPFJBXHBZTNRDKNZFNPDEJWYDRPDEGCNZNWJYFZZFLZTCNBBNBZFNNLKZFSLKONWBLCCKJANKBPHGBZFNGNLOBLWSRDCSBZFNRJWLCBFDKNJWLWSWDTDSUWDTDSUOWDQBQFLZBYDJWYZDFLGGNWZDLWUTDSUTNBJSNBZFNRDKCDKWKLYBDRYKDQJWYDCSJZFJWODRSNLWEDKJLKZUJNANWZFJWODRDCSSNLWEDKJLKZUZFNRLZFNKQNWNANKRDHWSJZFJWODRSNLWEDKJLKZU", "FBYSNRBIYVNIJRJZSRSRJZNQCQNIJXCGTNEBSJNYKCUXCGTNONNIRNBUMZSIVSIJZNYBUAXCGURENBJRCBASIVJZUCGVZJZNKFCCUBIYOGUSNYSIXCGUOCINRJZNUNRBIBMZNJZBJXCGMBIJSVICUNJBASIVXCGUOUNBJZRJNBFSIVXCGUQSIYBIYBFFJZBJEBRUNBFSRFNKJONZSIYYCIJKSVZJSJSJRMCQSIVKCUXCGUGIISIVBJXCGSJRCIFXJZSRQCQNIJYCIJMBUNEZBJMCQNRBKJNUXCGUKNTNUYUNBQMBIJXCGRNNSJVNJJSIVMFCRNUPGRJRGUUNIYNUMBGRNXCGKNNFJZNKNNFSIVJBASIVCTNUSJRKSUNSJRKUNNYCQSJRKFCCYSIVCLNISJRJZNLUNBMZNUSIJZNLGFLSJBIYXCGUOFSIYYNTCJSCIJZNUNRRCQNJZSIVOUNBASIVBJJZNOUSMACKNTNUXEBFFSJRZCFYSIVBFFJZBJXCGAICERCJNFFQNYCXCGEBIIBVCEZNUNSJRMCTNUNYSIBFFJZNMCFCUNYFSVZJREZNUNJZNUGIBEBXRBUNUGIISIVJZNISVZJSQLCRRSOFNMCQNRJUGNSJRJBASIVCTNUXCGCZJZSRSRJZNVUNBJNRJRZCEENFSVZJSJGLENECIJMCQNYCEIBIYJZNRGIMBIJRJCLGRICEEBJMZSIVSJMCQNJUGNSJRJBASIVCTNUXCGCZJZSRSRJZNVUNBJNRJRZCE", "KDBULISCWSCTLISJLXTNJULIUJBTALISNYULDEIDJCKTCGLISKLAUKLIMBFUGTCGLIMCGUJUGLIJUTLUCSCWLIULIJUUGLIDMWILKDEBTLLIUQLIULIMWTALIDMWILIUTLJSNTAAXSLQTKDCAXLIULISJLUUCLIDMKTCGLISKLAUKTCGLIDJCKLIJDMWILIUMCGUJCUTLIDEISKLISWILITLLIULISJLXXUTJDAGLIMWLIDMWILDELITLBDJCSCW", "ANUYJKHNFL JLNBL, NBENJK YNK KHNKIONS: 'JNYNHYJ - SNJKONS, INHKONS, JNHDSNJ ONBRNJ!'", "MBLBJJBV, HUBBSKDBM SBIMBJK BO BUS JBIBIB KIDBBUK BO LBJBIIB, BXOBJS, IBUBLHB KBV CBCIBJA-IBUBJ LBJLBA HUBKKBL CBSK.", "STHTGHTCTITGHTUSKSTHHTOBTGUKTETIWSTBITSQTWKTESTKWSSTYTGUTGETHHTHHTCTETETWHSTESTUSBTUSKHTGSTHWBTGTWTWTGUTGHTIMOTYTGTHITUTKHTGHTIMTBBTWTKTFTHJFTUTIYTRTCTWGTW"]
#0-3 easy, 4-7 medium, 8-11 hard, 12-15 "impossible"
text = default_texts[0]
# text = re.sub(r'\s+', '', text)  #remove whitespace

#SETTINGS
ngrams_file = 'ngrams1.tsv'  #file to read ngrams from
num_individuals = 1000         #population space
max_rounds = 500             #maximum number of rounds
mutation_chance = 0.3        #chance of mutation (1.0 is 100%)
mutation_tries = 1           #number of attempts at mutating
convergence_threshold = 60    #if no improvement for this number, assume convergence
gram_length = 3              #length of ngram to use
timeout = num_individuals * 100               #will probably never trigger (if it does, mutation is probably too low).  the number of times to try to create unique keys when making a new generation
preserve_top_percent = 0.15                #copy over top this percent of population
crossover_chance = 0.65                #chance a crossover will occur
crossover_points = 5           #number of points that will crossover
tournament_size = 25           #how many individuals to randomly select, before selecting the best
#tournament_variation = int(0.2 * tournament_size)       #randomly choose between top x of tournament
tournament_variation_percentage = 0.2                #randomly choose between top 20% of tournament

#GLOBAL VARIABLES   (not all of these need to be global but whatever)
set_individuals = {}                  #set of keys and corresponding fitness
keys_in_order = []                 #keys to set_individuals, sorted from best to worst

def main():
   start_time = time.time()
   read_ngrams()

   best_fitness = 0.0          #higher is better
   best_key = ()
   stuck_count = 0
   
   while(len(set_individuals.keys()) < num_individuals):  #make num_individuals unique keys
      try_key = random_key()
      set_individuals[try_key] = evaluate_fitness(try_key) #intitialize keys
      
      if set_individuals[try_key] > best_fitness:  #update best fitness
         best_fitness = set_individuals[try_key]
         best_key = try_key
   
   keys_in_order = sorted(list(set_individuals.keys()), key = lambda i:set_individuals[i], reverse = True)
   #print(best_fitness)
   #print(evaluate_fitness(alphabet))
   
   for i in range(0, max_rounds):
      if i % 5 == 0:        #print fitness values every 5 generations (debug)
         print(best_fitness)
      
      new_best_key = new_generation()                  #changes the set_individuals dictionary, returns the best key from prev generation
      if best_fitness >= set_individuals[new_best_key]:
         stuck_count += 1
         if stuck_count > convergence_threshold:        #if no improvement for 5 generations, assume convergence
            print("converged?")
            break
      else:
         stuck_count = 0
         best_key = new_best_key
         best_fitness = set_individuals[new_best_key]
      
   print("Results:")
   print("Best key was: " + str(best_key) + " with a fitness of " + str(best_fitness))
   print("The decoded message:")
   print(decode(best_key, text))
   print('Runtime was  ', time.time() - start_time, ' seconds')
   
def new_generation():
#    parents1 = {}          #make two equally sized parent pools randomly (there could be a "stacked pool", where all the fit parents end up in one pool, but its random so...)
#    parents2 = {}
#    
#    for key in set_individuals.keys():
#       if len(parents1) > len(parents2):
#          parents2[key] = set_individuals[key]
#       else if len(parents2) > len(parents1):
#          parents1[key] = set_individuals[key]
#       else:
#          if random.randint(0, 1) == 1:
#             parents1[key] = set_individuals[key]
#          else:
#             parents2[key] = set_individuals[key]
   global set_individuals
   new_gen = {}       #can't overwrite yet, downside of using global variable, change if performance improvements are needed
   best_fitness = 0.0
   best_key = ()
   num_tries = 0
   
   old_keys = list(set_individuals.keys())                                #copy over top few percent
   old_keys.sort(key = lambda i: set_individuals[i], reverse = True)
   for i in range(0, int(num_individuals * preserve_top_percent)):
#       try_key = old_keys[i]
#       #mutate
#       for i in range(0, mutation_tries):
#          r = random.random()
#          if r < mutation_chance:
#             try_key = mutate(try_key)
#          
#       new_gen[try_key] = evaluate_fitness(try_key) #add to new generation
#       
#       if new_gen[try_key] > best_fitness:  #update best fitness
#          best_fitness = new_gen[try_key]
#          best_key = try_key
      
       new_gen[old_keys[i]] = set_individuals[old_keys[i]]
   
   while(len(new_gen.keys()) < num_individuals):  #make num_individuals unique keys
      num_tries += 1
      if num_tries > timeout:                 #if too many breeding attempts have failed
         while(len(new_gen.keys()) < num_individuals):  #make num_individuals unique keys
            try_key = random_key()
            new_gen[try_key] = evaluate_fitness(try_key)
            
            if new_gen[try_key] > best_fitness:  #update best fitness
               best_fitness = new_gen[try_key]
               best_key = try_key
         break
      else:                     #select two parents, crossover
         parent1 = choose_parent(set_individuals)
         parent2 = choose_parent(set_individuals)
         
         try_key = crossover(parent1, parent2)
         
         #mutate
         for i in range(0, mutation_tries):
            r = random.random()
            if r < mutation_chance:
               try_key = mutate(try_key)
            
         new_gen[try_key] = evaluate_fitness(try_key) #add to new generation
         
         if new_gen[try_key] > best_fitness:  #update best fitness
            best_fitness = new_gen[try_key]
            best_key = try_key
         
         if(len(new_gen.keys()) >= num_individuals):
            break
         
         try_key = crossover(parent2, parent1)       #same but swap parents
         
         #mutate
         for i in range(0, mutation_tries):
            r = random.random()
            if r < mutation_chance:
               try_key = mutate(try_key)
            
         new_gen[try_key] = evaluate_fitness(try_key) #add to new generation
         
         if new_gen[try_key] > best_fitness:  #update best fitness
            best_fitness = new_gen[try_key]
            best_key = try_key
      
   set_individuals = new_gen #overwrite old generation
   keys_in_order = sorted(list(set_individuals.keys()), key = lambda i:set_individuals[i], reverse = True)  #update order
   return best_key
         
def crossover(parent1, parent2):
   if random.random() > crossover_chance:
      return parent1
   
   new_key = [None] * 26   #so it isnt empty
   p1_influence = (26 - crossover_points) #random.randint(0, 26)   #the amount to take from the first parent
   indices = random.sample(range(0, 26), p1_influence)
   
   taken_letters = set([])
   for index in indices:               #takes letters from selected (random) indices and puts them in the same spot in the child
      new_key[index] = parent1[index]
      taken_letters.add(parent1[index])
   
   for i in range(0, 26):                  #fill in with parent 2 when possible
      if new_key[i] == None:
         if not parent2[i] in taken_letters:
            new_key[i] = parent2[i]
            taken_letters.add(parent2[i])
         
   leftover_letters = list(set(alphabet).difference(taken_letters))   #what letters still need to be filled?
   random.shuffle(leftover_letters)
   count = 0
   for i in range(0, 26):
      if new_key[i] == None:
         new_key[i] = leftover_letters[count]
         count += 1
      
   return tuple(new_key)

def choose_parent(pop):   #picks a parent from population, weighted based on the fitness
#    ordered_keys = list(pop.keys())
#    total = 0.0               #total value of fitness
#    for key in ordered_keys:
#       total += pop[key]
#       
#    r = random.uniform(0, total)
#    for key in ordered_keys:
#       r = r - pop[key]          #keep subtracting fitnesses from random fitness until value is less than or equal to 0, then return
#       if r <= 0:
#          return key
#       
#    print("choose parent random failed... resorting to full random")
#    return random.sample(pop.keys(), 1)

   #return sorted(list(random.sample(pop.keys(), tournament_size)), key = lambda i: pop[i], reverse = True)[random.randint(0, tournament_variation - 1)]
   pool = sorted(list(random.sample(pop.keys(), tournament_size)), key = lambda i: pop[i], reverse = True)
   min = pop[pool[0]] * 0.8
   min_index = 0
   for i in range(0, len(pool)):
      if pop[pool[i]] > min:
         min_index = i
      
   return pool[random.randint(0, min_index)]
   

def read_ngrams():    #reads all grams and frequency, then puts it into a dictionary.   relies on a line telling it when grams change number (ie a header saying 3-gram/4-gram)
   f = open(ngrams_file, 'r')
   lines = f.readlines()
   
   activated = False
   for line in lines:
      if not activated:
         if (str(gram_length) + '-gram') in line:
            activated = True
         continue
      else:
         if (str(gram_length + 1) + '-gram') in line:
            break
         words = re.split(r'\t+', line)   #split on tabs (can be multiple)
         ngrams[words[0]] = math.log(int(words[1]), 2)
   return

def random_key():       #returns a shuffled alphabet in a tuple
   to_ret = alphabet[:]
   random.shuffle(to_ret)
   return tuple(to_ret)

def encode(key, text):                 #returns string, could probably be folded into decoded, since the only difference is key, alphabet vs alphabet, key  
   encoded = ''
   key_dict = key_to_dict(alphabet, key)
   
   for i in range(0, len(text)):
      if text[i:i + 1] in key:
         encoded = encoded + key_dict[text[i:i + 1]]
      else:
         encoded = encoded + text[i:i + 1]
   
   return encoded              

def decode(key, encoded):    #returns string
   decoded = ''
   key_dict = key_to_dict(key, alphabet)
   
   for i in range(0, len(encoded)):
      if encoded[i:i + 1] in key:
         decoded = decoded + key_dict[encoded[i:i + 1]]
      else:
         decoded = decoded + encoded[i:i + 1]
   
   return decoded          #returns string

def key_to_dict(key, master):     #returns a dictionary for easy lookup during encoding and decoding.  key is original, master is what the key maps to.
   dict = {' ': ' '}             #space is always space
   for i in range(0, len(key)):
      dict[key[i]] = master[i]
   return dict
   
def mutate(key):   #randomly swaps two letters in a key
   rand1 = random.randint(0, 25)
   rand2 = random.randint(0, 25)
   
   key_list = list(key)
   temp = key_list[rand1]
   key_list[rand1] = key_list[rand2]
   key_list[rand2] = temp
   
   return tuple(key_list)

def evaluate_fitness(key):        #evaluates the fitness of a key based on ngrams after using the key to decode the text
   log_freq_sum = 0.0
   decoded = decode(key, text)
   decoded_words = re.split(r' +', decoded)
   for word in decoded_words:
      if len(word) < gram_length:
         continue            #1 letter words could be super helpful but the research paper say only 3 ngrams...
      grams = get_grams(word)
      for gram in grams:
         if gram in ngrams.keys():
            log_freq_sum += ngrams[gram]
      
   return log_freq_sum

def get_grams(word):   #generates grams based on the variable gram_length
   grams = []
   for i in range(0, len(word) - (gram_length - 1)):
      grams.append(word[i:i + gram_length])
   return grams

if __name__ == '__main__':
   main()