package dasherJava.core.languageModeling;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Holds the list of all characters of an alphabet together with lookup structures for
 * accessing them efficiently.
 *
 * <p>Characters are usually referenced by a symbol index, and two kinds of symbol index
 * have to be distinguished because of the way the language model is implemented:
 * <ul>
 *   <li><b>World symbol indices</b> cover every character added to the alphabet.</li>
 *   <li><b>Language model symbol indices</b> cover only the characters that do not have
 *       a fixed probability.</li>
 * </ul>
 * Both kinds of index are assigned consecutively, starting at 0, in the order in which
 * the characters are added.
 */
public class LanguageAlphabet {

    //World symbol index of every character in the alphabet.
    private final Map<LanguageCharacter, Integer> characterToWorldSymbolIndex = new HashMap<>();

    //Language model symbol index by Unicode value; only holds characters without a fixed probability.
    private final Map<Integer, Integer> unicodeToLanguageModelSymbolIndex = new HashMap<>();

    //Characters with a fixed probability, keyed by world symbol index.
    private final Map<Integer, LanguageCharacter> fixedProbabilityCharacters = new HashMap<>();

    //All characters; the position in this list is the world symbol index.
    private final List<LanguageCharacter> characters = new ArrayList<>();

    //Note: The order of character groups does not matter, they are automatically sorted when rendering.
    //Groups may overlap, there may be subgroups (i.e. groups fully contained within another group)
    //and there may be characters not belonging to any group.
    //Group specifications use world symbol indices.
    private final List<CharacterGroup> characterGroups = new ArrayList<>();

    /**
     * @return the total number of characters in the alphabet, including those with a
     *         fixed probability
     */
    public int getNumOfCharacters() {
        return characters.size();
    }

    /**
     * Looks up a character by its world symbol index.
     *
     * @param symbolIndex world symbol index of the character
     * @return the character stored at that index
     * @throws IndexOutOfBoundsException if no character has the given index
     */
    public LanguageCharacter getLanguageCharacter(int symbolIndex) {
        return characters.get(symbolIndex);
    }

    /**
     * Returns the world symbol index of the given character.
     *
     * @param character the character to look up
     * @return the world symbol index assigned to the character when it was added
     * @throws IllegalArgumentException if the alphabet does not contain the character
     */
    public int getWorldSymbolIndex(LanguageCharacter character) throws IllegalArgumentException {
        Integer symbolIndex = characterToWorldSymbolIndex.get(character);
        //An unknown character yields null (stored indices are never negative), so the
        //check has to be against null; comparing with 0 would fail while unboxing.
        if (symbolIndex == null) {
            throw new IllegalArgumentException("Alphabet doesn't contain the given LanguageCharacter");
        }
        return symbolIndex;
    }

    /**
     * Returns the language model symbol index of the character with the given Unicode value.
     *
     * @param unicode Unicode value of the character
     * @return the language model symbol index of that character
     * @throws UnicodeNotFoundException if no character without a fixed probability has
     *         this Unicode value
     */
    public int getLanguageModelSymbolIndex(int unicode) throws UnicodeNotFoundException {
        Integer symbolIndex = unicodeToLanguageModelSymbolIndex.get(unicode);
        if (symbolIndex == null) {
            throw new UnicodeNotFoundException(unicode);
        }
        return symbolIndex;
    }

    /**
     * @return the characters with a fixed probability, keyed by world symbol index; this
     *         is the internal map itself, not a copy
     */
    public Map<Integer, LanguageCharacter> getFixedProbabilityCharacters() {
        return fixedProbabilityCharacters;
    }

    /**
     * Appends a character to the alphabet and assigns it the next free world symbol index.
     * A character whose fixed probability is non-negative is registered as a fixed
     * probability character; any other character additionally receives the next free
     * language model symbol index.
     *
     * @param character the character to add
     * @throws IllegalArgumentException if the character is already part of the alphabet,
     *         or if its Unicode value is already used by a character without a fixed
     *         probability
     */
    public void addCharacter(LanguageCharacter character) throws IllegalArgumentException {
        if (characterToWorldSymbolIndex.containsKey(character)) {
            throw new IllegalArgumentException("Alphabet already contains the given LanguageCharacter");
        }
        //Note: only characters without a fixed probability are registered by Unicode value,
        //so this check does not detect a clash with a fixed probability character.
        if (unicodeToLanguageModelSymbolIndex.containsKey(character.getUnicode())) {
            throw new IllegalArgumentException("Alphabet already contains Unicode value " + character.getUnicode());
        }

        int worldSymbolIndex = characters.size();
        characterToWorldSymbolIndex.put(character, worldSymbolIndex);
        if (character.getFixedProbability() >= 0.0f) {
            fixedProbabilityCharacters.put(worldSymbolIndex, character);
        } else {
            unicodeToLanguageModelSymbolIndex.put(character.getUnicode(), unicodeToLanguageModelSymbolIndex.size());
        }
        characters.add(character);
    }

    /**
     * Adds a character group to the alphabet.
     *
     * @param characterGroup the group to add; its specification uses world symbol indices
     */
    public void addCharacterGroup(CharacterGroup characterGroup) {
        characterGroups.add(characterGroup);
    }

    /**
     * @return the character groups in the order they were added; this is the internal
     *         list itself, not a copy
     */
    public List<CharacterGroup> getCharacterGroups() {
        return characterGroups;
    }

    /**
     * Thrown when a Unicode value has no language model symbol index in the alphabet.
     */
    public static class UnicodeNotFoundException extends RuntimeException {
        private final int unicode;

        /**
         * @param unicode the Unicode value that could not be found
         */
        public UnicodeNotFoundException(int unicode) {
            super("Alphabet doesn't contain a language model symbol for Unicode value " + unicode);
            this.unicode = unicode;
        }

        /**
         * @return the Unicode value that could not be found
         */
        public int getUnicode() {
            return unicode;
        }
    }
}