/* RE.java - Regular Expression Class in Java Package RegularExpression Copyright (C) 2001, 2002 Brian Westphal This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package RegularExpression; import java.util.*; /** * The RE class handles standard PERL-type regular expression operations.
*
*

RE pattern matching and escape characters list:
 * /d        ANY DIGIT
 * /D        ANY NON-DIGIT
 * /s        ANY TYPE OF WHITESPACE
 * /S        ANY TYPE OF NON-WHITESPACE
 * /w        ANY TYPE OF ALPHANUMERIC
 * /W        ANY TYPE OF NON-ALPHANUMERIC
 * .         ANY NON NEWLINE CHARACTER
 * 
 * #x[0-9A-F]+ HEX REPRESENTATION OF A CHARACTER
 * 
 * /?        WHERE ? IS ANY OTHER CHARACTER YIELDS THAT CHARACTER
 * 
 * *         0 OR MORE TIMES
 * +         1 OR MORE TIMES
 * ?         0 OR 1 TIMES
 * {M}       M TIMES
 * {M,}      AT LEAST M TIMES
 * {M,N}     AT LEAST M AND AT MOST N TIMES (N >= M)
 * 
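 * [al-z]    CHARACTER LIST, INDIVIDUAL CHARACTERS OR CHARACTER RANGES
 * [^al-z]   INVERTED CHARACTER LIST, ANY CHARACTER NOT IN THE LIST
 * (ab)      GROUP, MAY BE FOLLOWED BY ANY OF THE REPETITION MODIFIERS
 * a|b       UNION, EITHER THE LEFT OR THE RIGHT EXPRESSION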

* * @author Brian Westphal * @version 1.10 * @since JDK1.3.1 */ public class RE { /** * A flag denoting case-sensitivity. */ private boolean casesensitive; /** * The automaton used to process regular expression operation. */ private NFA automaton; /** * The string used to build the regular expression automaton. */ private String re; /** * Denotes the set of numeric characters. */ private static final char SET_DIGIT = (char) 256; /** * Denotes the set of non-numeric characters. */ private static final char SET_NONDIGIT = (char) 257; /** * Denotes the set of whitespace characters. */ private static final char SET_WHITESPACE = (char) 258; /** * Denotes the set of non-whitespace characters. */ private static final char SET_NONWHITESPACE = (char) 259; /** * Denotes the set of alpha-numeric characters. */ private static final char SET_ALPHANUMERIC = (char) 260; /** * Denotes the set of non-alpha-numeric characters. */ private static final char SET_NONALPHANUMERIC = (char) 261; /** * Denotes the set of non-newline characters. */ private static final char SET_NONNEWLINE = (char) 262; /** * The number indicating the char value of the next dynamic character * set. */ private char charsetbase = (char) 263; /** * The list of character sets. Used whenever a character set * (i.e. [al-z]) is needed. */ private Vector charsets = new Vector (); /** * Empty constructor. */ public RE () { } /** * Contructs a regular expression handler with the specified regular * expression (as a string) and a boolean denoting case-sensitivity for * operations. * * Example: RE regexp = new RE ("ab*c", true); * * @param re the regular expression string. * @param casesensitive the flags used to specify case-sensitivity. */ public RE (String re, boolean casesensitive) throws Exception { this.re = re; this.casesensitive = casesensitive; automaton = buildNFA (re); automaton.shrinkfit (); } /** * Returns the regular expression automaton. * * @return the regular expression automaton. 
*/ public NFA getAutomaton () { return automaton; } /** * Returns the next token and delimiter in a specified string from the * specified offset. The delimiter is the first portion of text that * matches the regular expression. The token is the text immediately * preceeding the delimiter. * * @param input the string to be parsed. * @param offset the offset from the beginning of the input string. * * @return an Object array with three elements (int, String, String): * the index of the character following the delimiter (the next * offset), the token string, and the delimiter string (null if none * exists). Returns null if at the end of input string. */ public Object [] nextTokenAndDelim (String input, int offset) { //If at end of input string no next token/delim exists. if (offset > input.length ()) return null; int input_length = input.length (); for (int beginning = offset; beginning < input_length; beginning++) { for (int length = input_length - beginning; length > 0; length--) { //Searching for first/longest delim. String teststring = input.substring (beginning, beginning + length); Object [] value = matchesWithFailPoint (teststring); boolean success = ((Boolean) value[0]).booleanValue (); //If delim is found return token and delim. if (success) { Object [] output = new Object[3]; output[0] = new Integer (beginning + length); output[1] = input.substring (offset, beginning); output[2] = teststring; return output; } //If no delim is found update string search possibilities. else { int failpoint = ((Integer) value[1]).intValue (); if (failpoint >= length) failpoint = -1; length = failpoint + 1; } } } //If no delim exists return token only. Object [] output = new Object[3]; output[0] = new Integer (input.length () + 1); output[1] = input.substring (offset); output[2] = null; return output; } /** * Returns a boolean value specifying whether the beginning of the * specified string matches the regular expression or not. * * @param input the string to be tested. 
* * @return the value specifying whether the string matched or not. */ public boolean beginningMatches (String input) { //Calls secondary matches function, returns simplified value. Object [] value = beginningMatchesWithLength (input); return ((Boolean) value[0]).booleanValue (); } /** * Returns a boolean value specifying whether the beginning of the * specified string matches the regular expression or not. * * @param input the string to be tested. * * @return an Object array with two elements (boolean, int): the value * specifying whether the beginning matched or not, the length of the * matched substring if one was found. */ public Object [] beginningMatchesWithLength (String input) { for (int length = input.length (); length >= 0; length--) { //Searching for longest match from beginning. String teststring = input.substring (0, length); Object [] value = matchesWithFailPoint (teststring); boolean success = ((Boolean) value[0]).booleanValue (); //If a match is found return true and length. if (success) { Object [] output = new Object[2]; output[0] = new Boolean (true); output[1] = new Integer (length); return output; } //If a match if not found update string search possibilities. else { int failpoint = ((Integer) value[1]).intValue (); if (failpoint >= length) length = 0; else length = failpoint + 1; } } //If no match exists return false and null. Object [] output = new Object[2]; output[0] = new Boolean (false); output[1] = null; return output; } /** * Returns a boolean value specifying whether the specified string * matches the regular expression or not. This is the primary matches * function, used by end-user programmers. * * @param input the string to be tested. * * @return the value specifying whether the string matched or not. */ public boolean matches (String input) { //Calls secondary matches function, returns simplified value. 
Object [] value = matchesWithFailPoint (input); return ((Boolean) value[0]).booleanValue (); } /** * Returns a boolean value specifying whether the specified string * matches the regular expression or not and if the string does not * match it returns the position at which the string first failed. This * is the secondary matches function that is used by developers looking * to expand the RegularExpression package. * * @param input the string to be tested. * * @return an Object array with two elements (boolean, int): the value * specifying whether the string matched or not, the first point of * failure (if applicable). */ public Object [] matchesWithFailPoint (String input) { Object [] output = new Object[2]; int size; //Resetting automaton. automaton.reset (); //Feeding epsilon transitions. automaton.input (automaton.epsilon); int charsets_size = charsets.size (); char [] chararray = new char[charsets_size + 5]; int charsUsed = 0; //Detects special characters. boolean specialInAlphabetEnabled = false; for (int index = 0; index < automaton.specialInAlphabet.length && !specialInAlphabetEnabled; index++) { if (automaton.specialInAlphabet[index]) specialInAlphabetEnabled = true; } //Feeding input string characters one at a time. int input_length = input.length (); for (int index = 0; index < input_length; index++) { charsUsed = 0; //Getting input string character. char character; if (casesensitive) character = input.charAt (index); else character = Character.toLowerCase (input.charAt (index)); chararray[charsUsed++] = character; //Optimized to detect if an automaton contains any special characters (256-262). 
if (specialInAlphabetEnabled) { if (automaton.specialInAlphabet[SET_ALPHANUMERIC - 256] && Character.isLetterOrDigit (character)) chararray[charsUsed++] = SET_ALPHANUMERIC; else if (automaton.specialInAlphabet[SET_NONALPHANUMERIC - 256]) chararray[charsUsed++] = SET_NONALPHANUMERIC; if (automaton.specialInAlphabet[SET_DIGIT - 256] && Character.isDigit (character)) chararray[charsUsed++] = SET_DIGIT; else if (automaton.specialInAlphabet[SET_NONDIGIT - 256]) chararray[charsUsed++] = SET_NONDIGIT; if (automaton.specialInAlphabet[SET_WHITESPACE - 256] && Character.isWhitespace (character)) chararray[charsUsed++] = SET_WHITESPACE; else if (automaton.specialInAlphabet[SET_NONWHITESPACE - 256]) chararray[charsUsed++] = SET_NONWHITESPACE; if (automaton.specialInAlphabet[SET_NONNEWLINE - 256] && character != '\n') chararray[charsUsed++] = SET_NONNEWLINE; } //Creating list of dynamic character sets that the input string character belongs to. for (int csindex = 0; csindex < charsets_size; csindex++) { CharacterSet charset = (CharacterSet) charsets.get (csindex); if (!casesensitive) { if (charset.inSet (Character.toLowerCase (character))) chararray[charsUsed++] = (char) (charsetbase + csindex); } else { if (charset.inSet (character)) chararray[charsUsed++] = (char) (charsetbase + csindex); } } //Feeding all character possibilities simultaneously. //Return false and failure position if no character is accepted. if (!feedpattern (automaton, chararray, charsUsed)) { output[0] = new Boolean (false); output[1] = new Integer (index); return output; } //Feeding epsilon transitions. automaton.input (automaton.epsilon); } //Returns the acceptingness and current position (after all input). output[0] = new Boolean (automaton.accepting ()); output[1] = new Integer (input.length ()); return output; } /** * Returns the index of the first instance of the regular expression * after the offset. * * @param input the input string. * @param offset the offset of the input string. 
* * @return the index result or -1 if no index is found. */ public int indexOf (String input, int offset) { Object [] value = nextTokenAndDelim (input, offset); if (value == null || value[2] == null) return -1; return ((Integer) value[0]).intValue () - ((String) value[2]).length (); } /** * Replaces all matching substrings in a specified string with a * specified replacement string. * * @param input the string to be manipulated. * @param replacement the string to replace matching portions. * * @return the string after all replacements have been made. */ public String replace (String input, String replacement) { String output = ""; //Splits input string into tokens. String [] value = split (input); //Inserts replacement values between tokens. for (int index = 0; index < value.length; index++) { output += value[index]; if (index < value.length - 1) output += replacement; } return output; } /** * Splits the specified string into an array of tokens. * * @param input the string to be parsed. * * @return an array of tokens. */ public String [] split (String input) { LinkedList outputlist = new LinkedList (); int offset = 0; Object [] value; //Adds all tokens from an input string into a linked list. while ((value = nextTokenAndDelim (input, offset)) != null) { offset = ((Integer) value[0]).intValue (); outputlist.add (value[1]); } //Converts linked list into String array. String [] output = new String[outputlist.size ()]; int outputlist_size = outputlist.size (); for (int index = 0; index < outputlist_size; index++) { output[index] = (String) outputlist.get (index); } return output; } /** * Returns a string formatted to look like a regular expression. * * @return a string formatted to look like a regular expression. */ public String toString () { return "/" + re + "/"; } /** * Builds the automaton described by the specified regular expression. * This is a recursive function. * * @param re the regular expression that describes the automaton. 
*
     * @return the automaton described by the regular expression, or null
     * if the expression is empty.
     */
    private NFA buildNFA (String re) throws Exception {
        NFA output = null;

        //Used to denote special case characters.
        boolean escape = false;
        boolean hexcharacter = false;

        //Building NFA one character at a time.
        //re (and therefore re_length) changes whenever a hex representation
        //is replaced in place by the character it denotes.
        int re_length = re.length ();
        for (int index = 0; index < re_length; index++) {
            //Handling parentheses (groups).
            if (!hexcharacter && !escape && re.charAt (index) == '(') {
                int level = 1;
                int closeindex = -1;
                boolean internalescape = false;

                //Locating matching closing parenthesis (using multilevel parentheses).
                for (int gindex = index + 1; gindex < re_length && closeindex == -1; gindex++) {
                    if (!internalescape && re.charAt (gindex) == '/') internalescape = true;
                    else {
                        if (!internalescape && re.charAt (gindex) == '(') level++;
                        else if (!internalescape && re.charAt (gindex) == ')') level--;

                        //If at original parenthesis level.
                        if (level == 0) closeindex = gindex;

                        internalescape = false;
                    }
                }

                //If no matching closing parenthesis found.
                if (closeindex == -1) throw new Exception ("Malformed regular expression");

                //Get NFA for group (w/o the enclosing parentheses).
                NFA concatnfa = buildNFA (re.substring (index + 1, closeindex));
                index = closeindex;

                //Add any modifiers.
                Object [] values = addModifiers (concatnfa, index, re);
                index += ((Integer) values[0]).intValue ();
                concatnfa = (NFA) values[1];

                //Concatenate NFA to output.
                if (output == null) output = concatnfa;
                else output.concat (concatnfa);
            }
            //Handling pipes (union).
            else if (!hexcharacter && !escape && re.charAt (index) == '|') {
                //Get NFA for everything else.
                //Union a|b|c can be treated as a|(b|c).
                NFA unionnfa = buildNFA (re.substring (index + 1));
                index = re.length ();

                //Unioning NFA with output. With nothing on the left-hand side
                //the right-hand side is the result (it must not be unioned
                //with itself).
                if (output == null) output = unionnfa;
                else output.union (unionnfa);
            }
            //Handling brackets (character sets).
            else if (!hexcharacter && !escape && re.charAt (index) == '[') {
                //Creates dynamic character set.
                CharacterSet charset = new CharacterSet ();
                boolean innerescape = false;
                boolean innerhexcharacter = false;
                boolean closed = false;

                //Adding characters and character lists to character set.
                for (int cindex = index + 1; cindex < re_length; cindex++) {
                    //If caret set inversion property.
                    //[^ab] means any character but a or b.
                    if (!innerhexcharacter && re.charAt (cindex) == '^' && cindex == index + 1) charset.inverse = true;
                    else if (!innerhexcharacter && !innerescape && re.charAt (cindex) == '/') innerescape = true;
                    //If closing bracket found.
                    else if (!innerhexcharacter && !innerescape && re.charAt (cindex) == ']') {
                        closed = true;
                        index = cindex;
                        //Ends the for loop.
                        cindex = re.length ();
                    }
                    //If hex character signal found.
                    //Format: #x???...? where ? is a hex digit (at least one hex digit).
                    else if (!innerhexcharacter && !innerescape && re.charAt (cindex) == '#') {
                        if (cindex < re.length () - 1 && re.charAt (cindex + 1) == 'x') {
                            int lastfound = lastHexDigitIndex (re, cindex + 2);

                            //Converting hex representation into character representation.
                            int value = HEX2int (re.substring (cindex + 2, lastfound + 1));

                            //Replacing value in string; the replaced position is
                            //visited again with innerhexcharacter set.
                            re = re.substring (0, cindex) + (char) value + re.substring (lastfound + 1);
                            re_length = re.length ();
                            cindex--;
                        }
                        else throw new Exception ("Invalid representation type after #");

                        innerhexcharacter = true;
                    }
                    //If normal character or special character.
                    else {
                        //If character range.
                        if (!innerescape && cindex < re.length () - 2 && re.charAt (cindex + 1) == '-') {
                            //If range ends with hex character.
                            if (re.charAt (cindex + 2) == '#') {
                                if (cindex < re.length () - 3 && re.charAt (cindex + 3) == 'x') {
                                    int lastfound = lastHexDigitIndex (re, cindex + 4);

                                    //Converting hex representation into character representation.
                                    int value = HEX2int (re.substring (cindex + 4, lastfound + 1));

                                    //Replacing value in string.
                                    re = re.substring (0, cindex + 2) + (char) value + re.substring (lastfound + 1);
                                    re_length = re.length ();
                                }
                                else throw new Exception ("Invalid representation type after #");
                            }

                            //Adding range to dynamic character set.
                            if (!casesensitive) charset.addRange (Character.toLowerCase (re.charAt (cindex)), Character.toLowerCase (re.charAt (cindex + 2)));
                            else charset.addRange (re.charAt (cindex), re.charAt (cindex + 2));

                            cindex += 2;
                        }
                        //If single character.
                        else {
                            //Adding character to dynamic character set.
                            if (!casesensitive) charset.addCharacter (Character.toLowerCase (re.charAt (cindex)));
                            else charset.addCharacter (re.charAt (cindex));
                        }

                        innerescape = false;
                        innerhexcharacter = false;
                    }
                }

                //If no closing bracket found.
                if (!closed) throw new Exception ("Malformed regular expression");

                //The symbol that represents this set in the automaton.
                char setsymbol = (char) (charsetbase + charsets.size ());

                //Adding any modifiers.
                Object [] values = addModifiers (setsymbol, index, re);
                int charactersUsed = ((Integer) values[0]).intValue ();
                index += charactersUsed;
                NFA concatnfa = (NFA) values[1];

                //Concatenating NFA to output.
                if (output == null) {
                    if (charactersUsed > 0) output = concatnfa;
                    else {
                        output = new NFA ();
                        output.concat (setsymbol);
                    }
                }
                else if (charactersUsed > 0) output.concat (concatnfa);
                else output.concat (setsymbol);

                //Adding character set to dynamic character set list.
                charsets.add (charset);
            }
            //Handling scores (hex characters).
            else if (!hexcharacter && !escape && re.charAt (index) == '#') {
                if (index < re.length () - 1 && re.charAt (index + 1) == 'x') {
                    int lastfound = lastHexDigitIndex (re, index + 2);

                    //Converting hex representation into character representation.
                    int value = HEX2int (re.substring (index + 2, lastfound + 1));

                    //Replacing value in string; the replaced position is
                    //visited again with hexcharacter set.
                    re = re.substring (0, index) + (char) value + re.substring (lastfound + 1);
                    re_length = re.length ();
                    index--;
                }
                else throw new Exception ("Invalid representation type after #");

                hexcharacter = true;
            }
            //Handling slashes (escape characters).
            else if (!hexcharacter && !escape && re.charAt (index) == '/') {
                escape = true;
            }
            //Handling normal characters and special characters.
            else {
                char current = re.charAt (index);
                char symbol;

                //If normal character (or period).
                if (!escape || hexcharacter) {
                    //If period character (matches any non-newline character)
                    if (!hexcharacter && current == '.') symbol = SET_NONNEWLINE;
                    else if (casesensitive) symbol = current;
                    else symbol = Character.toLowerCase (current);
                }
                //If escape character.
                else {
                    switch (current) {
                        case 'd': symbol = SET_DIGIT; break;
                        case 'D': symbol = SET_NONDIGIT; break;
                        case 's': symbol = SET_WHITESPACE; break;
                        case 'S': symbol = SET_NONWHITESPACE; break;
                        case 'w': symbol = SET_ALPHANUMERIC; break;
                        case 'W': symbol = SET_NONALPHANUMERIC; break;
                        //If normal escape character (no special meaning or canceling meaning)
                        default:
                            if (casesensitive) symbol = current;
                            else symbol = Character.toLowerCase (current);
                            break;
                    }
                }

                //Adding any modifiers.
                Object [] values = addModifiers (symbol, index, re);
                int charactersUsed = ((Integer) values[0]).intValue ();
                index += charactersUsed;
                NFA concatnfa = (NFA) values[1];

                //Concatenating NFA to output.
                if (output == null) {
                    if (charactersUsed > 0) output = concatnfa;
                    else {
                        output = new NFA ();
                        output.concat (symbol);
                    }
                }
                else if (charactersUsed > 0) output.concat (concatnfa);
                else output.concat (symbol);

                //Both flags only apply to the character just consumed;
                //leaving hexcharacter set would turn every later '(', '|',
                //'[', '.', '#' and '/' into a literal.
                escape = false;
                hexcharacter = false;
            }
        }

        return output;
    }

    /**
     * Returns the index of the last digit of the run of hex digits (0-9,
     * A-F) that starts at the specified position.
     *
     * @param re the regular expression string.
     * @param start the index at which the first hex digit is expected.
     *
     * @return the index of the last hex digit of the run.
     *
     * @throws Exception if there is no hex digit at the start position.
     */
    private int lastHexDigitIndex (String re, int start) throws Exception {
        int lastfound = -1;

        //Collecting all following hex digits.
        int re_length = re.length ();
        for (int character = start; character < re_length; character++) {
            char use = re.charAt (character);
            if (use >= '0' && use <= '9' || use >= 'A' && use <= 'F') lastfound = character;
            else break;
        }

        if (lastfound == -1) throw new Exception ("Missing hex digits after #x");
        return lastfound;
    }

    /**
     * Returns a boolean specifying whether the specified character is a
     * modifier or not. Modifiers include: '*', '+', '?', and '{'.
     *
     * @param character the character to be tested.
     *
     * @return the value specifying whether the specified character is a
     * modifier or not.
     */
    private boolean isModifier (char character) {
        return character == '*' || character == '+' || character == '?' || character == '{';
    }

    /**
     * Returns the number of modifiers following a definition and the
     * automaton after applying the modifiers.
     *
     * @param character the character to add modifiers to.
     * @param offset the index in the regular expression string of the
     * definition; a modifier is only looked for at the position right after it.
     * @param re the regular expression string.
     *
     * @return an Object array with two elements (int, NFA): the number of
     * modifier characters used, and the automaton after applying the
     * modifiers. When no modifier is present the count is 0 and the
     * automaton is a freshly constructed NFA that does not contain the
     * character.
     *
     * @throws Exception if a repetition modifier has no closing brace.
     */
    private Object [] addModifiers (char character, int offset, String re) throws Exception {
        NFA automaton = new NFA ();
        int modifiers = 0;

        //Searching for modifiers (must immediately follow pattern).
        //a+ not a +.
        int re_length = re.length ();
        int modindex = offset + 1;

        if (modindex < re_length && isModifier (re.charAt (modindex))) {
            char modifier = re.charAt (modindex);
            modifiers = 1;

            if (modifier == '*') automaton = NFA.star (character);
            else if (modifier == '+') automaton = NFA.plus (character);
            else if (modifier == '?') automaton = NFA.maybe (character);
            //If repetition pattern.
            //{m} exactly m times.
            //{m,} at least m times.
            //{m,n} between (inclusive) m and n times.
            else {
                int commaindex = -1;
                int endindex = -1;

                //Searching for comma and/or closing brace.
                for (int charindex = modindex + 1; charindex < re_length && endindex == -1; charindex++) {
                    if (re.charAt (charindex) == ',') commaindex = charindex;
                    else if (re.charAt (charindex) == '}') endindex = charindex;
                }

                //If no closing brace found.
                if (endindex == -1) throw new Exception ("Malformed regular expression");

                //If no comma get value (exactly matching).
                if (commaindex == -1) {
                    int value = Integer.parseInt (re.substring (modindex + 1, endindex));
                    automaton = NFA.repeat (character, value, value);
                }
                //If comma next to close brace get low value (at least).
                else if (endindex == commaindex + 1) {
                    int value = Integer.parseInt (re.substring (modindex + 1, commaindex));
                    automaton = NFA.repeat (character, value, -1);
                }
                //Else get low and high values (between).
                else {
                    int lowvalue = Integer.parseInt (re.substring (modindex + 1, commaindex));
                    int hivalue = Integer.parseInt (re.substring (commaindex + 1, endindex));
                    automaton = NFA.repeat (character, lowvalue, hivalue);
                }

                //Counting every character from the opening to the closing brace.
                modifiers += endindex - modindex;
            }
        }

        //Return number of modifier characters used and automaton.
        Object [] output = new Object[2];
        output[0] = new Integer (modifiers);
        output[1] = automaton;
        return output;
    }

    /**
     * Returns the number of modifiers following a definition and the
     * automaton after applying the modifiers.
     *
     * @param automaton the automaton to add modifiers to.
     * @param offset the index in the regular expression string to start looking for modifiers.
     * @param re the regular expression string.
     *
     * @return an Object array with two elements (int, NFA): the number of
     * modifier characters used, and the automaton after applying the
     * modifiers.
*/ private Object [] addModifiers (NFA automaton, int offset, String re) { Object [] output = new Object[2]; int modifiers = 0; //Searching for modifiers (must immediately follow pattern). //a+ not a +. int re_length = re.length (); int modindex = offset + 1; if (modindex >= re.length ()) { output[0] = new Integer (modifiers); output[1] = automaton; return output; } char modifier; //If modifier found. if (isModifier (modifier = re.charAt (modindex))) { modifiers++; if (modifier == '*') automaton.star (); else if (modifier == '+') automaton.plus (); else if (modifier == '?') automaton.maybe (); //If repetition pattern. //{m} exactly m times. //{m,} at least m times. //{m,n} between (inclusive) m and n times. else if (modifier == '{') { int commaindex = -1; int endindex = -1; //Searching for comma and/or closing brace. for (int charindex = modindex + 1; charindex < re_length && endindex == -1; charindex++) { if (re.charAt (charindex) == ',') commaindex = charindex; else if (re.charAt (charindex) == '}') endindex = charindex; } //If contains comma (at least or between matching). if (commaindex != -1) { //If comma next to close brace get low value (at least). if (endindex == commaindex + 1) { int value = new Integer (re.substring (modindex + 1, commaindex)).intValue (); automaton.repeat (value, -1); } //Else get low and high values (between). else { int lowvalue = new Integer (re.substring (modindex + 1, commaindex)).intValue (); int hivalue = new Integer (re.substring (commaindex + 1, endindex)).intValue (); automaton.repeat (lowvalue, hivalue); } } //Else get value (exactly matching). else { int value = new Integer (re.substring (modindex + 1, endindex)).intValue (); automaton.repeat (value, value); } modifiers += endindex - modindex; modindex += endindex - modindex; } } //If not modifier return number of modifier characters used and automaton. 
else { output[0] = new Integer (modifiers); output[1] = automaton; return output; } //Return number of modifier characters used and automaton (if at end of pattern). output[0] = new Integer (modifiers); output[1] = automaton; return output; } /** * Simulates the simultaneous feeding of multiple characters into an * automaton. * * @param automaton the automaton to be fed. * @param patternchar the list of characters to feed simultaneously. * * @return a boolean specifying whether or not any of the characters were accepted. */ private boolean feedpattern (NFA automaton, char [] patternchar, int charsUsed) { boolean found = false; //Storing current states of machine. Vector currentstate = (Vector) automaton.getCurrentStates ().clone (); Vector savestate = new Vector (); boolean [] inSaveState = new boolean[automaton.Q]; //Feeding each possible character from array. for (int index = 0; index < charsUsed; index++) { //If input is accepted. if (automaton.input (patternchar[index])) { found = true; Vector newCurrentStates = automaton.getCurrentStates (); savestate.addAll (newCurrentStates); int newCurrentStates_size = newCurrentStates.size (); for (int subindex = 0; subindex < newCurrentStates_size; subindex++) { NFANode node = (NFANode) newCurrentStates.get (subindex); if (!inSaveState[node.nodeNumber]) { savestate.add (node); inSaveState[node.nodeNumber] = true; } } } //Resetting machine to initial states. automaton.setCurrentStates (currentstate); } automaton.setCurrentStates (savestate); return found; } /** * Converts from hexidecimal to int. * * @param hexvalue the value of a number in hexidecimal notation. * * @return the int value from the conversion. */ private int HEX2int (String hexvalue) { int power = 1; int value = 0; //Adding value one character at a time. //0-9 = 0- 9. //A-F = 10-15. 
int hexvalue_length = hexvalue.length (); for (int index = 0; index < hexvalue_length; index++) { char use = hexvalue.charAt (hexvalue.length () - 1 - index); if (use >= '0' && use <= '9') value += power * ((int) use - (int) '0'); else if (use >= 'A' && use <= 'F') value += power * (10 + (int) use - (int) 'A'); power *= 16; } return value; } } /** * The NFA class is an implementation for handling non-deterministic finite * state automata. This class simulates determinism in real-time rather than * converting to a DFA. * * @author Brian Westphal * @version 1.10 * @since JDK1.3.1 */ class NFA implements Cloneable { /** * The number of states in the NFA. */ public int Q; /** * Epsilon constant. */ protected static final char epsilon = (char) 0; /** * Flags denoting whether special characters (256 to 262) are in the alphabet. */ protected boolean [] specialInAlphabet; //256 to 262 /** * The collection of nodes. */ private Vector nodes; /** * The collection of current nodes. */ private Vector currentNodes; /** * The collection of final state nodes. */ private Vector finalStateNodes; /** * Flag denoting whether node numbers have been assigned (used in reset). */ private boolean nodeNumbersAssigned = false; /** * Constructs a new non-deterministic finite state automaton. */ public NFA () { Q = 1; NFANode newState = new NFANode (true); nodes = new Vector (); nodes.add (newState); finalStateNodes = new Vector (); finalStateNodes.add (newState); specialInAlphabet = new boolean[7]; } /** * Assigns node numbers (used for optimization) to each node. */ private void assignNodeNumbers () { //Assigning node numbers. for (int index = 0; index < nodes.size (); index++) { NFANode node = (NFANode) nodes.get (index); node.nodeNumber = index; } } /** * Performs a deep copy of the automaton. * * @return a deep copy of the automaton. */ public Object clone () { assignNodeNumbers (); //Copying nodes. 
Vector newNodes = new Vector (); Vector newFinalStateNodes = new Vector (); for (int index = 0; index < nodes.size (); index++) { NFANode node = (NFANode) nodes.get (index); NFANode newNode = new NFANode (node.finalState); newNodes.add (newNode); if (node.finalState) newFinalStateNodes.add (newNode); } //Copying transitions. for (int index = 0; index < nodes.size (); index++) { NFANode node = (NFANode) nodes.get (index); NFANode newNode = (NFANode) newNodes.get (index); int node_branches_size = node.branches.size (); for (int subindex = 0; subindex < node_branches_size; subindex++) { NFATransition transition = (NFATransition) node.branches.get (subindex); newNode.branches.add (new NFATransition (transition.inChar, (NFANode) newNodes.get (transition.outNode.nodeNumber))); } } //Creating new NFA. NFA output = new NFA (); output.Q = Q; output.nodes = newNodes; output.finalStateNodes = newFinalStateNodes; System.arraycopy (specialInAlphabet, 0, output.specialInAlphabet, 0, specialInAlphabet.length); return output; } /** * Concatenates a single character. * * @param character the character to be concated. * * @param character the character to be concatenated. */ public void concat (char character) { //If special character, record in flag array. if (character >= (char) 256 && character <= (char) 262) { specialInAlphabet[(int) character - 256] = true; } //Creating new final state. NFANode newFinalState = new NFANode (true); //Creating transitions from previous final states to new final state. int finalStateNodes_size = finalStateNodes.size (); for (int index = 0; index < finalStateNodes_size; index++) { NFANode node = (NFANode) finalStateNodes.get (index); node.branches.add (new NFATransition (character, newFinalState)); node.finalState = false; } Q++; nodes.add (newFinalState); finalStateNodes.clear (); finalStateNodes.add (newFinalState); } /** * Concatenates an automaton. * * @param RHS the automaton to be concatenated. 
*/ public void concat (NFA RHS) { NFANode newInitialState = (NFANode) RHS.nodes.get (0); //Creating transitions from final states to initial state of RHS. int finalStateNodes_size = finalStateNodes.size (); for (int index = 0; index < finalStateNodes_size; index++) { NFANode node = (NFANode) finalStateNodes.get (index); node.branches.add (new NFATransition (epsilon, newInitialState)); node.finalState = false; } Q += RHS.Q; nodes.addAll (RHS.nodes); finalStateNodes = RHS.finalStateNodes; //Checking for special characters in RHS alphabet. for (int index = 0; index < specialInAlphabet.length; index++) { if (RHS.specialInAlphabet[index]) specialInAlphabet[index] = true; } } /** * Stars a character. * * @param character the character to be stared. * * @return an NFA that is the star of the character. */ public static NFA star (char character) { NFA output = new NFA (); //If special character, record in flag array. if (character >= (char) 256 && character <= (char) 262) { output.specialInAlphabet[(int) character - 256] = true; } NFANode node = (NFANode) output.nodes.get (0); node.branches.add (new NFATransition (character, node)); return output; } /** * Stars an automaton. */ public void star () { //Adding branches from final states to initial state. int finalStateNodes_size = finalStateNodes.size (); for (int index = 0; index < finalStateNodes_size; index++) { NFANode node = (NFANode) finalStateNodes.get (index); node.branches.add (new NFATransition (epsilon, (NFANode) nodes.get (0))); node.finalState = false; } finalStateNodes.clear (); ((NFANode) nodes.get (0)).finalState = true; finalStateNodes.add (nodes.get (0)); } /** * Pluses a character. * * @param character the character to be plused. * * @return an NFA that is the plus of the character. 
*/ public static NFA plus (char character) { NFA output = new NFA (); output.concat (character); NFANode node = (NFANode) output.finalStateNodes.get (0); node.branches.add (new NFATransition (character, node)); return output; } /** * Pluses an automaton. */ public void plus () { //Duplicating NFA. NFA clonedNFA = (NFA) clone (); //Building NFA using concat and star. clonedNFA.star (); concat (clonedNFA); } /** * Maybies a character. * * @param character the character to be maybied. * * @return an NFA that is the maybe of the character. */ public static NFA maybe (char character) { NFA output = new NFA (); output.concat (character); //Creating new final state. NFANode headNode = (NFANode) output.nodes.get (0); NFANode tailNode = (NFANode) output.finalStateNodes.get (0); NFANode newFinalState = new NFANode (true); headNode.branches.add (new NFATransition (epsilon, newFinalState)); tailNode.branches.add (new NFATransition (epsilon, newFinalState)); output.Q++; output.nodes.add (newFinalState); output.finalStateNodes.clear (); output.finalStateNodes.add (newFinalState); return output; } /** * Maybies an automaton. */ public void maybe () { concat (epsilon); NFANode node = (NFANode) nodes.get (0); node.branches.add (new NFATransition (epsilon, (NFANode) nodes.get (nodes.size () - 1))); } /** * Repeats a character.
*

a{m}        //REPEAT EXACTLY M TIMES (N IS IMPLIED = M)
	 * a{m,}            //REPEAT AT LEAST M TIMES (N IS IMPLIED = -1)
	 * a{m,n}           //REPEAT BETWEEN (INCLUSIVE) M AND N TIMES

* * @param character the character to repeat. * @param lower the value of m, the lower bound. * @param upper the value of n, the upper bound. * * @return the result of the repeating. */ public static NFA repeat (char character, int lower, int upper) { NFA output = new NFA (); output.concat (character); //Adding lower limit. for (int index = 1; index < lower; index++) { output.concat (character); } //Adding upper limit. if (upper != -1) { if (lower == 0) { output = new NFA (); int storeQ = 1; for (int index = 0; index < upper; index++) { output.concat (character); } NFANode finalNode = (NFANode) output.finalStateNodes.get (0); for (int index = storeQ - 1; index < output.Q; index++) { NFANode node = (NFANode) output.nodes.get (index); node.branches.add (new NFATransition (epsilon, finalNode)); } } else { int storeQ = output.Q; for (int index = lower; index < upper; index++) { output.concat (character); } NFANode finalNode = (NFANode) output.finalStateNodes.get (0); for (int index = storeQ - 1; index < output.Q; index++) { NFANode node = (NFANode) output.nodes.get (index); node.branches.add (new NFATransition (epsilon, finalNode)); } } } //Adding at least. else { if (lower == 0) { output = star (character); } else { NFANode finalNode = (NFANode) output.finalStateNodes.get (0); finalNode.branches.add (new NFATransition (character, finalNode)); } } return output; } /** * Repeats an automaton.
*

a{m}        //REPEAT EXACTLY M TIMES (N IS IMPLIED = M)
	 * a{m,}            //REPEAT AT LEAST M TIMES (N IS IMPLIED = -1)
	 * a{m,n}           //REPEAT BETWEEN (INCLUSIVE) M AND N TIMES

* * @param lower the value of m, the lower bound. * @param upper the value of n, the upper bound. * * @return the result of the repeating. */ public void repeat (int lower, int upper) { //Duplicating NFA. NFA clonedNFA = (NFA) clone (); //Adding lower limit. for (int index = 1; index < lower; index++) { concat ((NFA) clonedNFA.clone ()); } //Adding upper limit. if (upper != -1) { if (lower == 0) { maybe (); for (int index = 1; index < upper; index++) { NFA tempNFA = (NFA) clonedNFA.clone (); tempNFA.maybe (); concat (tempNFA); } } else { for (int index = lower; index < upper; index++) { NFA tempNFA = (NFA) clonedNFA.clone (); tempNFA.maybe (); concat (tempNFA); } } } //Adding at least. else { if (lower == 0) { star (); } else { clonedNFA.star (); concat (clonedNFA); } } } /** * Unions an automaton. * * @param RHS the automaton to be unioned with. */ public void union (NFA RHS) { NFANode newInitialState = new NFANode (false); newInitialState.branches.add (new NFATransition (epsilon, (NFANode) nodes.get (0))); newInitialState.branches.add (new NFATransition (epsilon, (NFANode) RHS.nodes.get (0))); Q += 1 + RHS.Q; nodes.add (0, newInitialState); nodes.addAll (RHS.nodes); finalStateNodes.addAll (RHS.finalStateNodes); for (int index = 0; index < specialInAlphabet.length; index++) { if (RHS.specialInAlphabet[index]) specialInAlphabet[index] = true; } } /** * Function not currently implemented. Intended to allow further * optimization of NFA by removing unnecessary complication. */ public void shrinkfit () { } /** * Resets the automaton by setting its set of current states to include * only 0. */ public void reset () { if (!nodeNumbersAssigned) { assignNodeNumbers (); nodeNumbersAssigned = true; } currentNodes = new Vector (Q); currentNodes.add (nodes.get (0)); } /** * Returns the currentstate variable. * * @return the currentstate variable. */ public Vector getCurrentStates () { return currentNodes; } /** * Sets the currentstate variable. 
* * @param currentstate the currentstate variable. */ public void setCurrentStates (Vector currentNodes) { this.currentNodes = currentNodes; } /** * Takes a character as input into the automaton. * * @param inputchar the character to input into the automaton. * * @return a boolean value specifying whether the character was accepted or not. */ public boolean input (char inputchar) { Vector newNodes; boolean found = false; //If input is epsilon, copy current nodes into new nodes (epsilon cannot fail). if (inputchar == epsilon) { newNodes = (Vector) currentNodes.clone (); found = true; } else { newNodes = new Vector (); } //Loop while no more epsilons should be added (if applicable) int currentNodes_size; do { currentNodes_size = currentNodes.size (); for (int index = 0; index < currentNodes_size; index++) { NFANode node = (NFANode) currentNodes.get (index); int node_branches_size = node.branches.size (); for (int subindex = 0; subindex < node_branches_size; subindex++) { NFATransition transition = (NFATransition) node.branches.get (subindex); if (transition.inChar == inputchar) { if (!newNodes.contains (transition.outNode)) { newNodes.add (transition.outNode); found = true; } } } } currentNodes = (Vector) newNodes.clone (); } while (inputchar == epsilon && currentNodes_size != currentNodes.size ()); return found; } /** * Returns a boolean value specifying whether the automaton is in an * accepting state or not. * * @return a value specifying whether the automaton is in an accepting * state or not. */ public boolean accepting () { int currentNodes_size = currentNodes.size (); for (int index = 0; index < currentNodes_size; index++) { NFANode node = (NFANode) currentNodes.get (index); if (node.finalState) return true; } return false; } } /** * A single node for an NFA. */ class NFANode { /** * A flag denoting whether the state is final or not. */ public boolean finalState; /** * A vector of transitions. 
*/ public Vector branches; /** * The number in the nodes vector (used for optimiation). */ public int nodeNumber; /** * Constructs a new NFANode. * * @param finalState the flag denoting whether the state is final or not. */ public NFANode (boolean finalState) { this.finalState = finalState; branches = new Vector (); } } /** * A single NFA transition. */ class NFATransition { /** * The character needed to make the transition. */ public char inChar; /** * A reference to the node to be transitioned to. */ public NFANode outNode; /** * Constructs a new NFATransition. * * @param inChar the character needed to make the transition. * @param outNode the reference to the node to be transitioned to. */ NFATransition (char inChar, NFANode outNode) { this.inChar = inChar; this.outNode = outNode; } } /** * A set of characters which may include character ranges. */ class CharacterSet { /** * Flag used to denote set inversion. */ public boolean inverse = false; /** * List of characters and character ranges. */ private Vector set = new Vector (); /** * Adds a single character to the set. * * @param character the character to be added. */ public void addCharacter (char character) { set.add (new Character (character)); } /** * Adds a range of characters to the set. * * @param lowchar the lower bound character in the range. * @param hichar the upper bound character in the range. */ public void addRange (char lowchar, char hichar) { set.add (new CharacterRange (lowchar, hichar)); } /** * Returns a boolean value specifying whether a specified character is * in the set or not. * * @param character the character to be tested. * * @return a value specifying whether a specified character is in the set or not. 
*/
    public boolean inSet (char character) {
        boolean member = false;

        // Stop at the first entry (single character or inclusive range)
        // that contains the character.
        for (Iterator items = set.iterator (); items.hasNext () && !member;) {
            Object item = items.next ();

            if (item instanceof Character) {
                member = (character == ((Character) item).charValue ());
            }
            else if (item instanceof CharacterRange) {
                CharacterRange range = (CharacterRange) item;
                member = (character >= range.lowchar && character <= range.hichar);
            }
        }

        return returnValue (member);
    }

    /**
     * Returns a value relative to the inversion flag.
     *
     * @param value the value to return.
     *
     * @return if inverse is false, returns value, otherwise returns the
     *         opposite of value.
     */
    private boolean returnValue (boolean value) {
        // value XOR inverse
        return value != inverse;
    }
}

/**
 * The CharacterRange class represents a range of characters.
 */
class CharacterRange {
    /**
     * The lower bound character.
     */
    public char lowchar;

    /**
     * The upper bound character.
     */
    public char hichar;

    /**
     * Constructs a new character range.
     *
     * @param lowchar the lower bound character.
     * @param hichar the upper bound character.
     */
    public CharacterRange (char lowchar, char hichar) {
        this.hichar = hichar;
        this.lowchar = lowchar;
    }
}