/* * Alphabet.java Copyright (C) 2020. Daniel H. Huson * * (Some files contain contributions from other authors, who are then mentioned separately.) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package jloda.seq; /** * Alphabet base class *

* Created by huson on 9/30/14.
* <p>
* Maps letters to small integer codes. The alphabet is described by a definition string of
* space-separated letter groups; every letter of a group (in both upper and lower case) is
* assigned the same code. Codes are handed out in group order starting at 1, and any letter
* that does not occur in a group maps to the undefined letter code.
*/
public class Alphabet implements INormalizer {
    /**
     * Length of the tables that are indexed by letter; valid letter indices are 0..126.
     */
    private static final int LETTER_TABLE_SIZE = 127;

    protected final byte alphabetSize;
    protected final long[] letter2code;
    protected final byte[] letter2normalized;
    protected final byte[] code2letter;
    protected final int bitsPerLetter;
    protected final int unusedBits;
    protected final int lettersPerWord;
    protected final long letterMask;
    protected final byte undefinedLetterCode;
    protected final String definitionString;

    /**
     * Letter reported by {@link #getNormalized(byte)} for input outside the lookup table.
     */
    private final byte undefinedLetter;

    /**
     * Constructs an alphabet from a definition string.
     *
     * @param definitionString space-separated groups of letters; square brackets are removed
     *                         and each pair of consecutive spaces is replaced by one space
     *                         before the string is split into groups
     * @param undefinedLetter  letter used for everything that is not listed in the definition
     */
    public Alphabet(String definitionString, byte undefinedLetter) {
        // Checked on the definition exactly as it was passed in.
        final boolean isUndefinedContained = (definitionString.indexOf(undefinedLetter) != -1);

        this.undefinedLetter = undefinedLetter;
        this.definitionString = definitionString.replace("[", "").replace("]", "").replaceAll(" {2}", " ");
        final String[] letterGroups = this.definitionString.split(" ");

        // One slot per group, plus one for the undefined letter unless the definition mentions it.
        alphabetSize = (byte) (letterGroups.length + (isUndefinedContained ? 0 : 1));

        // Smallest number of bits (at least 1) such that 2^bits > alphabetSize.
        int numberOfBits = 1;
        while ((1 << numberOfBits) <= alphabetSize) {
            numberOfBits++;
        }
        bitsPerLetter = numberOfBits;

        letterMask = (1L << bitsPerLetter) - 1;
        lettersPerWord = 64 / bitsPerLetter;
        unusedBits = 64 - lettersPerWord * bitsPerLetter;

        code2letter = new byte[alphabetSize + 1];
        undefinedLetterCode = alphabetSize;

        // Every letter starts out as undefined; the groups below overwrite their own entries.
        letter2code = new long[LETTER_TABLE_SIZE];
        letter2normalized = new byte[LETTER_TABLE_SIZE];
        for (int i = 0; i < LETTER_TABLE_SIZE; i++) {
            letter2code[i] = undefinedLetterCode;
            letter2normalized[i] = undefinedLetter;
        }
        code2letter[undefinedLetterCode] = undefinedLetter;

        int code = 1; // codes are assigned from 1 upwards, one per group
        for (String letterGroup : letterGroups) {
            for (int j = 0; j < letterGroup.length(); j++) {
                final char groupLetter = letterGroup.charAt(j);
                final int lower = Character.toLowerCase(groupLetter);
                final int upper = Character.toUpperCase(groupLetter);

                letter2code[lower] = code;
                letter2code[upper] = code;
                // NOTE(review): only the upper-case variant receives a normalized entry, so
                // lower-case letters still normalize to the undefined letter -- confirm intended.
                letter2normalized[upper] = (byte) letterGroup.charAt(0);
                if (j == 0) {
                    // The upper-cased first letter of the group is what the code decodes to.
                    code2letter[code] = (byte) upper;
                }
            }
            code++;
        }
    }

    /**
     * gets the alphabet size
     *
     * @return alphabet size
     */
    public byte getAlphabetSize() {
        return alphabetSize;
    }

    /**
     * gets the number of bits used to encode a letter
     *
     * @return number of bits
     */
    public int getBitsPerLetter() {
        return bitsPerLetter;
    }

    /**
     * gets the letter to code mapping
     *
     * @return letter to code (the internal table, not a copy)
     */
    public long[] getLetter2Code() {
        return letter2code;
    }

    /**
     * gets the code to letter mapping
     *
     * @return code to letter (the internal table, not a copy)
     */
    public byte[] getCode2Letter() {
        return code2letter;
    }

    /**
     * gets the mask used for a single letter
     *
     * @return letter mask, with the lowest {@code bitsPerLetter} bits set
     */
    public long getLetterMask() {
        return letterMask;
    }

    /**
     * gets the number of letters per 64-bit word
     *
     * @return letters per word
     */
    public int getLettersPerWord() {
        return lettersPerWord;
    }

    /**
     * gets the number of unused bits
     *
     * @return number of unused bits (per 64-bit word)
     */
    public int getUnusedBits() {
        return unusedBits;
    }

    /**
     * gets the code assigned to undefined letter
     *
     * @return code
     */
    public byte getUndefinedLetterCode() {
        return undefinedLetterCode;
    }

    /**
     * gets the definition string
     *
     * @return definition, with brackets removed as done by the constructor
     */
    public String getDefinitionString() {
        return definitionString;
    }

    /**
     * returns normalized letter
     *
     * @param letter the letter to normalize
     * @return normalized letter; the undefined letter if {@code letter} lies outside the
     * lookup table (negative byte values and 127)
     */
    @Override
    public byte getNormalized(byte letter) {
        // byte is signed and the table has only 127 entries, so guard before indexing.
        if (letter < 0 || letter >= letter2normalized.length) {
            return undefinedLetter;
        }
        return letter2normalized[letter];
    }
}