# CodeExamples / Voice Recognition / text_comparison.py
import os
import numpy as np

# Cross-checks per-speaker transcript files.
#
# Every sub-directory "<speaker>" of the scanned root is treated as one
# speaker and is expected to contain files named "<speaker>_<NNN>.txt" for
# NNN = 001..502.  For each NNN the script reports whether every speaker has
# the file and whether the (whitespace-stripped) texts agree across speakers.

FIRST_FILE_INDEX = 1
LAST_FILE_INDEX = 502
MAX_SPEAKERS_SHOWN = 10     # cap on speaker names listed in an "absent" line
MAX_SENTENCES_SHOWN = 100   # cap on sentences printed in the final listing


def list_speakers(root="./"):
    """Return the sorted names of the sub-directories of *root*.

    Plain files (for example this script itself) are skipped: they cannot
    contain transcripts and would otherwise be reported as a speaker that is
    missing every single file.  Sorting makes the report order deterministic,
    since ``os.listdir`` returns entries in arbitrary order.
    """
    return sorted(
        entry
        for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )


def collect_versions(index, speakers, root="./"):
    """Gather the distinct texts of transcript number *index*.

    Args:
        index: transcript number; formatted as three zero-padded digits.
        speakers: speaker directory names to inspect, in report order.
        root: directory containing the speaker directories.

    Returns:
        A ``(versions, missing)`` tuple.  ``versions`` holds each distinct
        stripped text in first-seen order; ``missing`` holds the speakers
        whose file could not be read.
    """
    versions = []
    missing = []
    for speaker in speakers:
        file_name = "{}_{:03n}.txt".format(speaker, index)
        path = os.path.join(root, speaker, file_name)
        try:
            # `with` guarantees the handle is closed even if read() fails.
            with open(path) as handle:
                text = handle.read().strip()
        except (OSError, UnicodeDecodeError):
            # Absent or unreadable file: record the speaker and keep going,
            # without also swallowing KeyboardInterrupt/SystemExit.
            missing.append(speaker)
            continue
        if text not in versions:
            versions.append(text)
    return versions, missing


def report_file(index, versions, missing):
    """Print the status line(s) for transcript number *index*.

    Missing files take precedence over diverging versions.  Nothing is
    printed when there are neither versions nor missing speakers (i.e. no
    speakers were scanned at all).
    """
    label = "{:03n}".format(index)
    if missing:
        shown = missing[:MAX_SPEAKERS_SHOWN]
        print("\tFile: " + label + " is absent for " + str(shown))
    elif len(versions) == 1:
        print("FILE: " + label + " is okay.")
    elif len(versions) > 1:
        print("\tFile: " + label + " has " + str(len(versions)) + " versions across all speakers:")
        for version in versions:
            print("\t\t" + version)


def main(root="./"):
    """Check every transcript under *root* and print a summary.

    Returns:
        The list of "<text>, <index>" strings, one per distinct text of each
        transcript number, in the order they were encountered.
    """
    speakers = list_speakers(root)  # listed once, not once per transcript
    sentences = []
    for index in range(FIRST_FILE_INDEX, LAST_FILE_INDEX + 1):
        versions, missing = collect_versions(index, speakers, root)
        sentences.extend(text + ", " + str(index) for text in versions)
        report_file(index, versions, missing)

    print("I found a total of " + str(len(sentences)) + " unique sentences.")
    print("Here they are alphabetically:")
    # Built-in sorted() orders plain strings directly; only the first
    # MAX_SENTENCES_SHOWN entries are listed to keep the output manageable.
    for sentence in sorted(sentences)[:MAX_SENTENCES_SHOWN]:
        print(sentence)

    print("\n\nWhoever did this is a war criminal.")
    return sentences


if __name__ == "__main__":
    main()