Open up NexusImporter for subclassing and add a BINARY sequence type.

Summary of what the hunks below do:
  * NexusImporter: adds ASSUMPTIONS/CALIBRATION block names, widens findToken,
    the meta-comment/value parsers and the parser state fields, and moves the
    DATATYPE mapping into an overridable getSequenceType(String).
  * Binary / BinaryState: new two-state alphabet (0, 1) plus '?' and '-'.
  * SequenceType: new BINARY constant and Utils.getDataType(String).
  * State: constructors made public.

Review change: Utils.getDataType lower-cases with Locale.ROOT so keyword
matching does not depend on the JVM default locale (e.g. Turkish dotless i).

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and the position of blank context lines are reconstructed from
the hunk line counts, so applying it may need `git apply --ignore-whitespace`
or manual adjustment. Generic type arguments (e.g. on `List`) look stripped
in the recovered text and are left as found -- verify against upstream.

diff --git a/src/jebl/evolution/io/NexusImporter.java b/src/jebl/evolution/io/NexusImporter.java
index eb7b995..3f5e3c3 100644
--- a/src/jebl/evolution/io/NexusImporter.java
+++ b/src/jebl/evolution/io/NexusImporter.java
@@ -60,6 +60,8 @@ public enum NexusBlock {
         TAXA,
         CHARACTERS,
         DATA,
+        ASSUMPTIONS, // TODO
+        CALIBRATION, // TODO
         UNALIGNED,
         DISTANCES,
         TREES
@@ -479,7 +481,7 @@ public List importDistanceMatrices() throws IOException, ImportE
     /**
      * Finds the end of the current block.
      */
-    private void findToken(String query, boolean ignoreCase) throws IOException
+    protected void findToken(String query, boolean ignoreCase) throws IOException
     {
         String token;
         boolean found = false;
@@ -624,21 +626,9 @@ private void readDataBlockHeader(String tokenToLookFor, NexusBlock block) throws
                     }
 
                     String token3 = helper.readToken(";");
-                    if (token3.equalsIgnoreCase("NUCLEOTIDE") ||
-                            token3.equalsIgnoreCase("DNA") ||
-                            token3.equalsIgnoreCase("RNA")) {
+                    // subclasses override getSequenceType to support additional data types
+                    sequenceType = getSequenceType(token3);
 
-                        sequenceType = SequenceType.NUCLEOTIDE;
-
-                    } else if (token3.equalsIgnoreCase("PROTEIN")) {
-
-                        sequenceType = SequenceType.AMINO_ACID;
-
-                    } else if (token3.equalsIgnoreCase("CONTINUOUS")) {
-
-                        throw new ImportException.UnparsableDataException("Continuous data cannot be parsed at present");
-
-                    }
                 } else if (token2.equalsIgnoreCase("INTERLEAVE")) {
                     isInterleaved = true;
                 }
@@ -657,6 +647,33 @@ private void readDataBlockHeader(String tokenToLookFor, NexusBlock block) throws
         }
     }
 
+    /**
+     * Extract data type after "DATATYPE" keyword, and convert into {@link SequenceType}.
+     * Override this method if there is new data type.
+     * @param token the token returned from {@link ImportHelper#readToken(String)}.
+     * @return the corresponding {@link SequenceType}, or null if the token is not recognised.
+     * @throws ImportException.UnparsableDataException if the token is "CONTINUOUS" (any case)
+     */
+    protected SequenceType getSequenceType(String token) throws ImportException.UnparsableDataException {
+        SequenceType sequenceType = null;
+        if (token.equalsIgnoreCase("NUCLEOTIDE") ||
+                token.equalsIgnoreCase("DNA") ||
+                token.equalsIgnoreCase("RNA")) {
+
+            sequenceType = SequenceType.NUCLEOTIDE;
+
+        } else if (token.equalsIgnoreCase("PROTEIN")) {
+
+            sequenceType = SequenceType.AMINO_ACID;
+
+        } else if (token.equalsIgnoreCase("CONTINUOUS")) {
+
+            throw new ImportException.UnparsableDataException("Continuous data cannot be parsed at present");
+
+        }
+        return sequenceType;
+    }
+
     /**
      * Reads sequences in a 'DATA' or 'CHARACTERS' block.
      */
@@ -852,7 +869,7 @@ private List readTaxaBlock() throws ImportException, IOException {
      * @param importHelper ImportHelper which may have read meta comments.
      * @throws ImportException.BadFormatException
      */
-    static void parseAndClearMetaComments(Attributable item, ImportHelper importHelper) throws ImportException.BadFormatException {
+    public static void parseAndClearMetaComments(Attributable item, ImportHelper importHelper) throws ImportException.BadFormatException {
         for (String meta : importHelper.getMetaComments()) {
             // A meta-comment which should be in the form:
             // \[&label[=value][,label[=value]>[,/..]]\]
@@ -1354,7 +1371,7 @@ private Node readExternalNode(SimpleRootedTree tree) throws ImportException, IOE
         }
     }
 
-    static void parseMetaCommentPairs(String meta, Attributable item) throws ImportException.BadFormatException {
+    public static void parseMetaCommentPairs(String meta, Attributable item) throws ImportException.BadFormatException {
         // This regex should match key=value pairs, separated by commas
         // This can match the following types of meta comment pairs:
         // value=number, value="string", value={item1, item2, item3}
@@ -1393,7 +1410,7 @@ static void parseMetaCommentPairs(String meta, Attributable item) throws ImportE
      * @param value the string
      * @return the object
      */
-    static Object parseValue(String value) {
+    public static Object parseValue(String value) {
 
         value = value.trim();
 
@@ -1467,14 +1484,14 @@ static Object parseValue(String value) {
     // private stuff
 
     private NexusBlock nextBlock = null;
-    private String nextBlockName = null;
-
-    private int taxonCount = 0, siteCount = 0;
-    private SequenceType sequenceType = null;
-    private String gapCharacters = "-";
-    private String matchCharacters = ".";
-    private String missingCharacters = "?";
-    private boolean isInterleaved = false;
+    protected String nextBlockName = null;
+
+    protected int taxonCount = 0, siteCount = 0;
+    protected SequenceType sequenceType = null;
+    protected String gapCharacters = "-";
+    protected String matchCharacters = ".";
+    protected String missingCharacters = "?";
+    protected boolean isInterleaved = false;
 
     protected final ImportHelper helper;
 }
\ No newline at end of file
diff --git a/src/jebl/evolution/sequences/Binary.java b/src/jebl/evolution/sequences/Binary.java
new file mode 100644
index 0000000..c2a5a6c
--- /dev/null
+++ b/src/jebl/evolution/sequences/Binary.java
@@ -0,0 +1,106 @@
+
+package jebl.evolution.sequences;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Binary data
+ * @author Walter Xie
+ */
+public final class Binary {
+
+    private Binary() { } // make class uninstantiable
+
+    public static final String NAME = "binary";
+
+    public static final BinaryState ZERO_STATE = new BinaryState("0", "0", 0);
+    public static final BinaryState ONE_STATE = new BinaryState("1", "1", 1);
+
+    public static final BinaryState[] CANONICAL_STATES = new BinaryState[] {
+            ZERO_STATE, ONE_STATE
+    };
+
+    public static final BinaryState UNKNOWN_STATE = new BinaryState("Unknown base", "?", 2, CANONICAL_STATES);
+    public static final BinaryState GAP_STATE = new BinaryState("Gap", "-", 3, CANONICAL_STATES);
+
+    public static final BinaryState[] STATES = new BinaryState[] {
+            ZERO_STATE, ONE_STATE, UNKNOWN_STATE, GAP_STATE
+    };
+
+    private static final int STATES_BY_CODE_SIZE = 128;
+
+    /**
+     * @return the number of states (4), including gap and ambiguity states
+     */
+    public static int getStateCount() { return STATES.length; } // 4
+
+    /**
+     * @return A list of all possible states, including the gap and ambiguity states.
+     */
+    public static List getStates() { return Collections.unmodifiableList(Arrays.asList((State[])STATES)); }
+
+    /**
+     * @return the number of canonical states (2), excluding gap and ambiguity states
+     */
+    public static int getCanonicalStateCount() { return CANONICAL_STATES.length; }
+
+    public static List getCanonicalStates() { return Collections.unmodifiableList(Arrays.asList(CANONICAL_STATES)); }
+
+    public static BinaryState getState(char code) {
+        if (code < 0 || code >= STATES_BY_CODE_SIZE) {
+            return null;
+        }
+        return statesByCode[code];
+    }
+
+    public static BinaryState getState(String code) {
+        return getState(code.charAt(0));
+    }
+
+    public static BinaryState getState(int index) {
+        return STATES[index];
+    }
+
+    public static BinaryState getUnknownState() { return UNKNOWN_STATE; }
+
+    public static BinaryState getGapState() { return GAP_STATE; }
+
+    public static boolean isUnknown(State state) { return state == UNKNOWN_STATE; }
+
+    public static boolean isGap(State state) { return state == GAP_STATE; }
+
+    private static final BinaryState[] statesByCode;
+    static {
+        statesByCode = new BinaryState[STATES_BY_CODE_SIZE];
+        for (int i = 0; i < statesByCode.length; i++) {
+            // Undefined characters are mapped to null
+            statesByCode[i] = null;
+        }
+
+        for (BinaryState state : STATES) {
+            final char code = state.getCode().charAt(0);
+            statesByCode[code] = state;
+            statesByCode[Character.toLowerCase(code)] = state;
+        }
+    }
+
+    public static BinaryState[] toStateArray(String sequenceString) {
+        BinaryState[] seq = new BinaryState[sequenceString.length()];
+        for (int i = 0; i < seq.length; i++) {
+            seq[i] = getState(sequenceString.charAt(i));
+        }
+        return seq;
+    }
+
+    public static BinaryState[] toStateArray(byte[] indexArray) {
+        BinaryState[] seq = new BinaryState[indexArray.length];
+        for (int i = 0; i < seq.length; i++) {
+            seq[i] = getState(indexArray[i]);
+        }
+        return seq;
+    }
+
+
+}
diff --git a/src/jebl/evolution/sequences/BinaryState.java b/src/jebl/evolution/sequences/BinaryState.java
new file mode 100644
index 0000000..709a5bd
--- /dev/null
+++ b/src/jebl/evolution/sequences/BinaryState.java
@@ -0,0 +1,30 @@
+
+package jebl.evolution.sequences;
+
+/**
+ * @author Walter Xie
+ */
+public final class BinaryState extends State {
+
+    BinaryState(String name, String stateCode, int index) {
+        super(name, stateCode, index);
+    }
+
+    BinaryState(String name, String stateCode, int index, BinaryState[] ambiguities) {
+        super(name, stateCode, index, ambiguities);
+    }
+
+    @Override
+    public int compareTo(Object o) {
+        // throws ClassCastException on across-class comparison
+        BinaryState that = (BinaryState) o;
+        return super.compareTo(that);
+    }
+
+    public boolean isGap() {
+        return this == Binary.GAP_STATE;
+    }
+
+    public SequenceType getType() { return SequenceType.BINARY; }
+
+}
diff --git a/src/jebl/evolution/sequences/SequenceType.java b/src/jebl/evolution/sequences/SequenceType.java
index 3cd621a..0ea994b 100644
--- a/src/jebl/evolution/sequences/SequenceType.java
+++ b/src/jebl/evolution/sequences/SequenceType.java
@@ -341,6 +341,90 @@ public String toString() {
         }
     };
 
+    public static final SequenceType BINARY = new SequenceType() {
+
+        @Override
+        public int getStateCount() {
+            return Binary.getStateCount();
+        }
+
+        @Override
+        public List getStates() {
+            return Binary.getStates();
+        }
+
+        @Override
+        public int getCanonicalStateCount() {
+            return Binary.getCanonicalStateCount();
+        }
+
+        @Override
+        public List getCanonicalStates() {
+            return Binary.getCanonicalStates();
+        }
+
+        @Override
+        public State getState(String code) {
+            return Binary.getState(code);
+        }
+
+        @Override
+        public State getState(char code) {
+            return Binary.getState(code);
+        }
+
+        @Override
+        public int getCodeLength() {
+            return 1;
+        }
+
+        @Override
+        public State getState(int index) {
+            return Binary.getState(index);
+        }
+
+        @Override
+        public State getUnknownState() {
+            return Binary.getUnknownState();
+        }
+
+        @Override
+        public State getGapState() {
+            return Binary.getGapState();
+        }
+
+        @Override
+        public boolean isUnknown(State state) {
+            return Binary.isUnknown(state);
+        }
+
+        @Override
+        public boolean isGap(State state) {
+            return Binary.isGap(state);
+        }
+
+        @Override
+        public String getName() {
+            return Binary.NAME;
+        }
+
+        @Override
+        public String getNexusDataType() {
+            return Binary.NAME;
+        }
+
+        @Override
+        public State[] toStateArray(String sequenceString) {
+            return Binary.toStateArray(sequenceString);
+        }
+
+        @Override
+        public State[] toStateArray(byte[] indexArray) {
+            return Binary.toStateArray(indexArray);
+        }
+    };
+
+
     public class Utils {
         private Utils() { } // make class uninstantiable
 
@@ -352,5 +436,32 @@ public static String getAlphabet(SequenceType sequenceType) {
             }
             return buffer.toString();
         }
+
+        /**
+         * Maps a data type name to its {@link SequenceType}, ignoring case and surrounding whitespace.
+         * @param dataTypeName keywords in Nexus or data type descriptions
+         * @return NUCLEOTIDE, AMINO_ACID or BINARY, depending on the keyword
+         * @throws UnsupportedOperationException if the name is not one of the keywords handled below
+         */
+        public static SequenceType getDataType(String dataTypeName) {
+            // trim surrounding whitespace; Locale.ROOT keeps matching independent of the default locale
+            switch (dataTypeName.trim().toLowerCase(java.util.Locale.ROOT)) {
+                // keywords in Nexus DATATYPE
+                case "rna":
+                case "dna":
+                case "nucleotide":
+                    return NUCLEOTIDE;
+                case "aminoacid":
+                case "protein":
+                    return AMINO_ACID;
+                case "binary":
+                    return BINARY;
+//                case "standard":
+//                case "continuous":
+//                    return STANDARD; // TODO parse continuous DATATYPE in NexusImporter
+                default:
+                    throw new UnsupportedOperationException(dataTypeName);
+            }
+        }
     }
 }
diff --git a/src/jebl/evolution/sequences/State.java b/src/jebl/evolution/sequences/State.java
index e796552..bde1d64 100644
--- a/src/jebl/evolution/sequences/State.java
+++ b/src/jebl/evolution/sequences/State.java
@@ -18,7 +18,7 @@
  */
 public abstract class State implements Comparable {
 
-    State(String name, String stateCode, int index) {
+    public State(String name, String stateCode, int index) {
         this.name = name;
         this.stateCode = stateCode;
 
@@ -26,7 +26,7 @@ public abstract class State implements Comparable {
         this.index = index;
     }
 
-    State(String name, String stateCode, int index, State[] ambiguities) {
+    public State(String name, String stateCode, int index, State[] ambiguities) {
         this.name = name;
         this.stateCode = stateCode;
         this.ambiguities = Collections.unmodifiableSortedSet(new TreeSet(Arrays.asList(ambiguities)));