View Javadoc
1   /*
2    * reserved comment block
3    * DO NOT REMOVE OR ALTER!
4    */
5   /*
6    * Copyright 1999-2004 The Apache Software Foundation.
7    *
8    * Licensed under the Apache License, Version 2.0 (the "License");
9    * you may not use this file except in compliance with the License.
10   * You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package com.sun.org.apache.regexp.internal;
22  
23  import java.io.Serializable;
24  import java.util.Vector;
25  
26  /**
27   * RE is an efficient, lightweight regular expression evaluator/matcher
28   * class. Regular expressions are pattern descriptions which enable
29   * sophisticated matching of strings.  In addition to being able to
30   * match a string against a pattern, you can also extract parts of the
31   * match.  This is especially useful in text parsing! Details on the
32   * syntax of regular expression patterns are given below.
33   *
34   * <p>
35   * To compile a regular expression (RE), you can simply construct an RE
36   * matcher object from the string specification of the pattern, like this:
37   *
38   * <pre>
39   *  RE r = new RE("a*b");
40   * </pre>
41   *
42   * <p>
43   * Once you have done this, you can call either of the RE.match methods to
44   * perform matching on a String.  For example:
45   *
46   * <pre>
47   *  boolean matched = r.match("aaaab");
48   * </pre>
49   *
50   * will cause the boolean matched to be set to true because the
51   * pattern "a*b" matches the string "aaaab".
52   *
53   * <p>
54   * If you were interested in the <i>number</i> of a's which matched the
55   * first part of our example expression, you could change the expression to
56   * "(a*)b".  Then when you compiled the expression and matched it against
57   * something like "xaaaab", you would get results like this:
58   *
59   * <pre>
60   *  RE r = new RE("(a*)b");                  // Compile expression
61   *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
62   *
63   *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
64   *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
65   *
66   *  int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
67   *  int endWholeExpr = r.getParenEnd(0);     // endWholeExpr will be index 6
68   *  int lenWholeExpr = r.getParenLength(0);  // lenWholeExpr will be 5
69   *
70   *  int startInside = r.getParenStart(1);    // startInside will be index 1
71   *  int endInside = r.getParenEnd(1);        // endInside will be index 5
72   *  int lenInside = r.getParenLength(1);     // lenInside will be 4
73   * </pre>
74   *
75   * You can also refer to the contents of a parenthesized expression
76   * within a regular expression itself.  This is called a
77   * 'backreference'.  The first backreference in a regular expression is
78   * denoted by \1, the second by \2 and so on.  So the expression:
79   *
80   * <pre>
81   *  ([0-9]+)=\1
82   * </pre>
83   *
84   * will match any string of the form n=n (like 0=0 or 2=2).
85   *
86   * <p>
87   * The full regular expression syntax accepted by RE is described here:
88   *
89   * <pre>
90   *
91   *  <b><font face=times roman>Characters</font></b>
92   *
93   *    <i>unicodeChar</i>   Matches any identical unicode character
94   *    \                    Used to quote a meta-character (like '*')
95   *    \\                   Matches a single '\' character
96   *    \0nnn                Matches a given octal character
97   *    \xhh                 Matches a given 8-bit hexadecimal character
98   *    \\uhhhh              Matches a given 16-bit hexadecimal character
99   *    \t                   Matches an ASCII tab character
100  *    \n                   Matches an ASCII newline character
101  *    \r                   Matches an ASCII return character
102  *    \f                   Matches an ASCII form feed character
103  *
104  *
105  *  <b><font face=times roman>Character Classes</font></b>
106  *
107  *    [abc]                Simple character class
108  *    [a-zA-Z]             Character class with ranges
109  *    [^abc]               Negated character class
110  * </pre>
111  *
112  * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts
113  * from zero&quot; or &quot;ends with last character&quot;.
114  * <br>
115  * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
116  * [-] means &quot;all characters&quot;.
117  *
118  * <pre>
119  *
120  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
121  *
122  *    [:alnum:]            Alphanumeric characters.
123  *    [:alpha:]            Alphabetic characters.
124  *    [:blank:]            Space and tab characters.
125  *    [:cntrl:]            Control characters.
126  *    [:digit:]            Numeric characters.
127  *    [:graph:]            Characters that are printable and are also visible.
128  *                         (A space is printable, but not visible, while an
129  *                         `a' is both.)
130  *    [:lower:]            Lower-case alphabetic characters.
131  *    [:print:]            Printable characters (characters that are not
132  *                         control characters.)
133  *    [:punct:]            Punctuation characters (characters that are not letter,
134  *                         digits, control characters, or space characters).
135  *    [:space:]            Space characters (such as space, tab, and formfeed,
136  *                         to name a few).
137  *    [:upper:]            Upper-case alphabetic characters.
138  *    [:xdigit:]           Characters that are hexadecimal digits.
139  *
140  *
141  *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
142  *
143  *    [:javastart:]        Start of a Java identifier
144  *    [:javapart:]         Part of a Java identifier
145  *
146  *
147  *  <b><font face=times roman>Predefined Classes</font></b>
148  *
149  *    .         Matches any character other than newline
150  *    \w        Matches a "word" character (alphanumeric plus "_")
151  *    \W        Matches a non-word character
152  *    \s        Matches a whitespace character
153  *    \S        Matches a non-whitespace character
154  *    \d        Matches a digit character
155  *    \D        Matches a non-digit character
156  *
157  *
158  *  <b><font face=times roman>Boundary Matchers</font></b>
159  *
160  *    ^         Matches only at the beginning of a line
161  *    $         Matches only at the end of a line
162  *    \b        Matches only at a word boundary
163  *    \B        Matches only at a non-word boundary
164  *
165  *
166  *  <b><font face=times roman>Greedy Closures</font></b>
167  *
168  *    A*        Matches A 0 or more times (greedy)
169  *    A+        Matches A 1 or more times (greedy)
170  *    A?        Matches A 1 or 0 times (greedy)
171  *    A{n}      Matches A exactly n times (greedy)
172  *    A{n,}     Matches A at least n times (greedy)
173  *    A{n,m}    Matches A at least n but not more than m times (greedy)
174  *
175  *
176  *  <b><font face=times roman>Reluctant Closures</font></b>
177  *
178  *    A*?       Matches A 0 or more times (reluctant)
179  *    A+?       Matches A 1 or more times (reluctant)
180  *    A??       Matches A 0 or 1 times (reluctant)
181  *
182  *
183  *  <b><font face=times roman>Logical Operators</font></b>
184  *
185  *    AB        Matches A followed by B
186  *    A|B       Matches either A or B
187  *    (A)       Used for subexpression grouping
188  *   (?:A)      Used for subexpression clustering (just like grouping but
189  *              no backrefs)
190  *
191  *
192  *  <b><font face=times roman>Backreferences</font></b>
193  *
194  *    \1    Backreference to 1st parenthesized subexpression
195  *    \2    Backreference to 2nd parenthesized subexpression
196  *    \3    Backreference to 3rd parenthesized subexpression
197  *    \4    Backreference to 4th parenthesized subexpression
198  *    \5    Backreference to 5th parenthesized subexpression
199  *    \6    Backreference to 6th parenthesized subexpression
200  *    \7    Backreference to 7th parenthesized subexpression
201  *    \8    Backreference to 8th parenthesized subexpression
202  *    \9    Backreference to 9th parenthesized subexpression
203  * </pre>
204  *
205  * <p>
206  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
207  * that they match as many elements of the string as possible without
208  * causing the overall match to fail.  If you want a closure to be
209  * reluctant (non-greedy), you can simply follow it with a '?'.  A
210  * reluctant closure will match as few elements of the string as
211  * possible when finding matches.  {m,n} closures don't currently
212  * support reluctancy.
213  *
214  * <p>
215  * <b><font face="times roman">Line terminators</font></b>
216  * <br>
217  * A line terminator is a one- or two-character sequence that marks
218  * the end of a line of the input character sequence. The following
219  * are recognized as line terminators:
220  * <ul>
221  * <li>A newline (line feed) character ('\n'),</li>
222  * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
223  * <li>A standalone carriage-return character ('\r'),</li>
224  * <li>A next-line character ('\u0085'),</li>
225  * <li>A line-separator character ('\u2028'), or</li>
226  * <li>A paragraph-separator character ('\u2029).</li>
227  * </ul>
228  *
229  * <p>
230  * RE runs programs compiled by the RECompiler class.  But the RE
231  * matcher class does not include the actual regular expression compiler
232  * for reasons of efficiency.  In fact, if you want to pre-compile one
233  * or more regular expressions, the 'recompile' class can be invoked
234  * from the command line to produce compiled output like this:
235  *
236  * <pre>
237  *    // Pre-compiled regular expression "a*b"
238  *    char[] re1Instructions =
239  *    {
240  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
241  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
242  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
243  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
244  *        0x0000,
245  *    };
246  *
247  *
248  *    REProgram re1 = new REProgram(re1Instructions);
249  * </pre>
250  *
251  * You can then construct a regular expression matcher (RE) object from
252  * the pre-compiled expression re1 and thus avoid the overhead of
253  * compiling the expression at runtime. If you require more dynamic
254  * regular expressions, you can construct a single RECompiler object and
255  * re-use it to compile each expression. Similarly, you can change the
256  * program run by a given matcher object at any time. However, RE and
257  * RECompiler are not threadsafe (for efficiency reasons, and because
258  * requiring thread safety in this class is deemed to be a rare
259  * requirement), so you will need to construct a separate compiler or
260  * matcher object for each thread (unless you do thread synchronization
261  * yourself). Once expression compiled into the REProgram object, REProgram
262  * can be safely shared across multiple threads and RE objects.
263  *
264  * <br><p><br>
265  *
266  * <font color="red">
267  * <i>ISSUES:</i>
268  *
269  * <ul>
270  *  <li>com.weusours.util.re is not currently compatible with all
271  *      standard POSIX regcomp flags</li>
272  *  <li>com.weusours.util.re does not support POSIX equivalence classes
273  *      ([=foo=] syntax) (I18N/locale issue)</li>
274  *  <li>com.weusours.util.re does not support nested POSIX character
275  *      classes (definitely should, but not completely trivial)</li>
276  *  <li>com.weusours.util.re Does not support POSIX character collation
277  *      concepts ([.foo.] syntax) (I18N/locale issue)</li>
278  *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
279  *  <li>Should RE support character iterators (for backwards RE matching!)?</li>
280  *  <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
281  *  <li>Not *all* possibilities are considered for greediness when backreferences
282  *      are involved (as POSIX suggests should be the case).  The POSIX RE
283  *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
284  *      of acdacaa where \1 is "a".  This is not the case in this RE package,
285  *      and actually Perl doesn't go to this extent either!  Until someone
286  *      actually complains about this, I'm not sure it's worth "fixing".
287  *      If it ever is fixed, test #137 in RETest.txt should be updated.</li>
288  * </ul>
289  *
290  * </font>
291  *
292  * @see recompile
293  * @see RECompiler
294  *
295  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
296  * @author <a href="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
297  */
298 public class RE implements Serializable
299 {
300     /**
301      * Specifies normal, case-sensitive matching behaviour.
302      */
303     public static final int MATCH_NORMAL          = 0x0000;
304 
305     /**
306      * Flag to indicate that matching should be case-independent (folded)
307      */
308     public static final int MATCH_CASEINDEPENDENT = 0x0001;
309 
310     /**
311      * Newlines should match as BOL/EOL (^ and $)
312      */
313     public static final int MATCH_MULTILINE       = 0x0002;
314 
315     /**
316      * Consider all input a single body of text - newlines are matched by .
317      */
318     public static final int MATCH_SINGLELINE      = 0x0004;
319 
320     /************************************************
321      *                                              *
322      * The format of a node in a program is:        *
323      *                                              *
324      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
325      *                                              *
326      * char OPCODE - instruction                    *
327      * char OPDATA - modifying data                 *
328      * char OPNEXT - next node (relative offset)    *
329      *                                              *
330      ************************************************/
331 
332                  //   Opcode              Char       Opdata/Operand  Meaning
333                  //   ----------          ---------- --------------- --------------------------------------------------
334     static final char OP_END              = 'E';  //                 end of program
335     static final char OP_BOL              = '^';  //                 match only if at beginning of line
336     static final char OP_EOL              = '$';  //                 match only if at end of line
337     static final char OP_ANY              = '.';  //                 match any single character except newline
338     static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
339     static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
340     static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
341     static final char OP_STAR             = '*';  // node            kleene closure
342     static final char OP_PLUS             = '+';  // node            positive closure
343     static final char OP_MAYBE            = '?';  // node            optional closure
344     static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
345     static final char OP_OPEN             = '(';  // number          nth opening paren
346     static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
347     static final char OP_CLOSE            = ')';  // number          nth closing paren
348     static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
349     static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
350     static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
351     static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
352     static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
353     static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
354     static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
355     static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
356 
357     // Escape codes
358     static final char E_ALNUM             = 'w';  // Alphanumeric
359     static final char E_NALNUM            = 'W';  // Non-alphanumeric
360     static final char E_BOUND             = 'b';  // Word boundary
361     static final char E_NBOUND            = 'B';  // Non-word boundary
362     static final char E_SPACE             = 's';  // Whitespace
363     static final char E_NSPACE            = 'S';  // Non-whitespace
364     static final char E_DIGIT             = 'd';  // Digit
365     static final char E_NDIGIT            = 'D';  // Non-digit
366 
367     // Posix character classes
368     static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
369     static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
370     static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
371     static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
372     static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
373     static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
374     static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
375     static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
376     static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
377     static final char POSIX_CLASS_SPACE   = 's';  // Spaces
378     static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
379     static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
380     static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
381     static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
382 
383     // Limits
384     static final int maxNode  = 65536;            // Maximum number of nodes in a program
385     static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)
386 
387     // Node layout constants
388     static final int offsetOpcode = 0;            // Opcode offset (first character)
389     static final int offsetOpdata = 1;            // Opdata offset (second char)
390     static final int offsetNext   = 2;            // Next index offset (third char)
391     static final int nodeSize     = 3;            // Node size (in chars)
392 
393     // State of current program
394     REProgram program;                            // Compiled regular expression 'program'
395     transient CharacterIterator search;           // The string being matched against
396     int matchFlags;                               // Match behaviour flags
397     int maxParen = MAX_PAREN;
398 
399     // Parenthesized subexpressions
400     transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
401     transient int start0;                         // Cache of start[0]
402     transient int end0;                           // Cache of start[0]
403     transient int start1;                         // Cache of start[1]
404     transient int end1;                           // Cache of start[1]
405     transient int start2;                         // Cache of start[2]
406     transient int end2;                           // Cache of start[2]
407     transient int[] startn;                       // Lazy-alloced array of sub-expression starts
408     transient int[] endn;                         // Lazy-alloced array of sub-expression ends
409 
410     // Backreferences
411     transient int[] startBackref;                 // Lazy-alloced array of backref starts
412     transient int[] endBackref;                   // Lazy-alloced array of backref ends
413 
414     /**
415      * Constructs a regular expression matcher from a String by compiling it
416      * using a new instance of RECompiler.  If you will be compiling many
417      * expressions, you may prefer to use a single RECompiler object instead.
418      *
419      * @param pattern The regular expression pattern to compile.
420      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
421      * @see RECompiler
422      * @see recompile
423      */
424     public RE(String pattern) throws RESyntaxException
425     {
426         this(pattern, MATCH_NORMAL);
427     }
428 
429     /**
430      * Constructs a regular expression matcher from a String by compiling it
431      * using a new instance of RECompiler.  If you will be compiling many
432      * expressions, you may prefer to use a single RECompiler object instead.
433      *
434      * @param pattern The regular expression pattern to compile.
435      * @param matchFlags The matching style
436      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
437      * @see RECompiler
438      * @see recompile
439      */
440     public RE(String pattern, int matchFlags) throws RESyntaxException
441     {
442         this(new RECompiler().compile(pattern));
443         setMatchFlags(matchFlags);
444     }
445 
446     /**
447      * Construct a matcher for a pre-compiled regular expression from program
448      * (bytecode) data.  Permits special flags to be passed in to modify matching
449      * behaviour.
450      *
451      * @param program Compiled regular expression program (see RECompiler and/or recompile)
452      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
453      *
454      * <pre>
455      *   MATCH_NORMAL              // Normal (case-sensitive) matching
456      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
457      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
458      * </pre>
459      *
460      * @see RECompiler
461      * @see REProgram
462      * @see recompile
463      */
464     public RE(REProgram program, int matchFlags)
465     {
466         setProgram(program);
467         setMatchFlags(matchFlags);
468     }
469 
470     /**
471      * Construct a matcher for a pre-compiled regular expression from program
472      * (bytecode) data.
473      *
474      * @param program Compiled regular expression program
475      * @see RECompiler
476      * @see recompile
477      */
478     public RE(REProgram program)
479     {
480         this(program, MATCH_NORMAL);
481     }
482 
483     /**
484      * Constructs a regular expression matcher with no initial program.
485      * This is likely to be an uncommon practice, but is still supported.
486      */
487     public RE()
488     {
489         this((REProgram)null, MATCH_NORMAL);
490     }
491 
492     /**
493      * Converts a 'simplified' regular expression to a full regular expression
494      *
495      * @param pattern The pattern to convert
496      * @return The full regular expression
497      */
498     public static String simplePatternToFullRegularExpression(String pattern)
499     {
500         StringBuffer buf = new StringBuffer();
501         for (int i = 0; i < pattern.length(); i++)
502         {
503             char c = pattern.charAt(i);
504             switch (c)
505             {
506                 case '*':
507                     buf.append(".*");
508                     break;
509 
510                 case '.':
511                 case '[':
512                 case ']':
513                 case '\\':
514                 case '+':
515                 case '?':
516                 case '{':
517                 case '}':
518                 case '$':
519                 case '^':
520                 case '|':
521                 case '(':
522                 case ')':
523                     buf.append('\\');
524                 default:
525                     buf.append(c);
526                     break;
527             }
528         }
529         return buf.toString();
530     }
531 
532     /**
533      * Sets match behaviour flags which alter the way RE does matching.
534      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
535      *
536      * <pre>
537      *   MATCH_NORMAL              // Normal (case-sensitive) matching
538      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
539      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
540      * </pre>
541      */
542     public void setMatchFlags(int matchFlags)
543     {
544         this.matchFlags = matchFlags;
545     }
546 
547     /**
548      * Returns the current match behaviour flags.
549      * @return Current match behaviour flags (RE.MATCH_*).
550      *
551      * <pre>
552      *   MATCH_NORMAL              // Normal (case-sensitive) matching
553      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
554      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
555      * </pre>
556      *
557      * @see #setMatchFlags
558      */
559     public int getMatchFlags()
560     {
561         return matchFlags;
562     }
563 
564     /**
565      * Sets the current regular expression program used by this matcher object.
566      *
567      * @param program Regular expression program compiled by RECompiler.
568      * @see RECompiler
569      * @see REProgram
570      * @see recompile
571      */
572     public void setProgram(REProgram program)
573     {
574         this.program = program;
575         if (program != null && program.maxParens != -1) {
576             this.maxParen = program.maxParens;
577         } else {
578             this.maxParen = MAX_PAREN;
579         }
580     }
581 
582     /**
583      * Returns the current regular expression program in use by this matcher object.
584      *
585      * @return Regular expression program
586      * @see #setProgram
587      */
588     public REProgram getProgram()
589     {
590         return program;
591     }
592 
593     /**
594      * Returns the number of parenthesized subexpressions available after a successful match.
595      *
596      * @return Number of available parenthesized subexpressions
597      */
598     public int getParenCount()
599     {
600         return parenCount;
601     }
602 
603     /**
604      * Gets the contents of a parenthesized subexpression after a successful match.
605      *
606      * @param which Nesting level of subexpression
607      * @return String
608      */
609     public String getParen(int which)
610     {
611         int start;
612         if (which < parenCount && (start = getParenStart(which)) >= 0)
613         {
614             return search.substring(start, getParenEnd(which));
615         }
616         return null;
617     }
618 
619     /**
620      * Returns the start index of a given paren level.
621      *
622      * @param which Nesting level of subexpression
623      * @return String index
624      */
625     public final int getParenStart(int which)
626     {
627         if (which < parenCount)
628         {
629             switch (which)
630             {
631                 case 0:
632                     return start0;
633 
634                 case 1:
635                     return start1;
636 
637                 case 2:
638                     return start2;
639 
640                 default:
641                     if (startn == null)
642                     {
643                         allocParens();
644                     }
645                     return startn[which];
646             }
647         }
648         return -1;
649     }
650 
651     /**
652      * Returns the end index of a given paren level.
653      *
654      * @param which Nesting level of subexpression
655      * @return String index
656      */
657     public final int getParenEnd(int which)
658     {
659         if (which < parenCount)
660         {
661             switch (which)
662             {
663                 case 0:
664                     return end0;
665 
666                 case 1:
667                     return end1;
668 
669                 case 2:
670                     return end2;
671 
672                 default:
673                     if (endn == null)
674                     {
675                         allocParens();
676                     }
677                     return endn[which];
678             }
679         }
680         return -1;
681     }
682 
683     /**
684      * Returns the length of a given paren level.
685      *
686      * @param which Nesting level of subexpression
687      * @return Number of characters in the parenthesized subexpression
688      */
689     public final int getParenLength(int which)
690     {
691         if (which < parenCount)
692         {
693             return getParenEnd(which) - getParenStart(which);
694         }
695         return -1;
696     }
697 
698     /**
699      * Sets the start of a paren level
700      *
701      * @param which Which paren level
702      * @param i Index in input array
703      */
704     protected final void setParenStart(int which, int i)
705     {
706         if (which < parenCount)
707         {
708             switch (which)
709             {
710                 case 0:
711                     start0 = i;
712                     break;
713 
714                 case 1:
715                     start1 = i;
716                     break;
717 
718                 case 2:
719                     start2 = i;
720                     break;
721 
722                 default:
723                     if (startn == null)
724                     {
725                         allocParens();
726                     }
727                     startn[which] = i;
728                     break;
729             }
730         }
731     }
732 
733     /**
734      * Sets the end of a paren level
735      *
736      * @param which Which paren level
737      * @param i Index in input array
738      */
739     protected final void setParenEnd(int which, int i)
740     {
741         if (which < parenCount)
742         {
743             switch (which)
744             {
745                 case 0:
746                     end0 = i;
747                     break;
748 
749                 case 1:
750                     end1 = i;
751                     break;
752 
753                 case 2:
754                     end2 = i;
755                     break;
756 
757                 default:
758                     if (endn == null)
759                     {
760                         allocParens();
761                     }
762                     endn[which] = i;
763                     break;
764             }
765         }
766     }
767 
768     /**
769      * Throws an Error representing an internal error condition probably resulting
770      * from a bug in the regular expression compiler (or possibly data corruption).
771      * In practice, this should be very rare.
772      *
773      * @param s Error description
774      */
775     protected void internalError(String s) throws Error
776     {
777         throw new Error("RE internal error: " + s);
778     }
779 
780     /**
781      * Performs lazy allocation of subexpression arrays
782      */
783     private final void allocParens()
784     {
785         // Allocate arrays for subexpressions
786         startn = new int[maxParen];
787         endn = new int[maxParen];
788 
789         // Set sub-expression pointers to invalid values
790         for (int i = 0; i < maxParen; i++)
791         {
792             startn[i] = -1;
793             endn[i] = -1;
794         }
795     }
796 
797     /**
798      * Try to match a string against a subset of nodes in the program
799      *
800      * @param firstNode Node to start at in program
801      * @param lastNode  Last valid node (used for matching a subexpression without
802      *                  matching the rest of the program as well).
803      * @param idxStart  Starting position in character array
804      * @return Final input array index if match succeeded.  -1 if not.
805      */
806     protected int matchNodes(int firstNode, int lastNode, int idxStart)
807     {
808         // Our current place in the string
809         int idx = idxStart;
810 
811         // Loop while node is valid
812         int next, opcode, opdata;
813         int idxNew;
814         char[] instruction = program.instruction;
815         for (int node = firstNode; node < lastNode; )
816         {
817             opcode = instruction[node + offsetOpcode];
818             next   = node + (short)instruction[node + offsetNext];
819             opdata = instruction[node + offsetOpdata];
820 
821             switch (opcode)
822             {
823                 case OP_RELUCTANTMAYBE:
824                     {
825                         int once = 0;
826                         do
827                         {
828                             // Try to match the rest without using the reluctant subexpr
829                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
830                             {
831                                 return idxNew;
832                             }
833                         }
834                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
835                         return -1;
836                     }
837 
838                 case OP_RELUCTANTPLUS:
839                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
840                     {
841                         // Try to match the rest without using the reluctant subexpr
842                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
843                         {
844                             return idxNew;
845                         }
846                     }
847                     return -1;
848 
849                 case OP_RELUCTANTSTAR:
850                     do
851                     {
852                         // Try to match the rest without using the reluctant subexpr
853                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
854                         {
855                             return idxNew;
856                         }
857                     }
858                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
859                     return -1;
860 
861                 case OP_OPEN:
862 
863                     // Match subexpression
864                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
865                     {
866                         startBackref[opdata] = idx;
867                     }
868                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
869                     {
870                         // Increase valid paren count
871                         if ((opdata + 1) > parenCount)
872                         {
873                             parenCount = opdata + 1;
874                         }
875 
876                         // Don't set paren if already set later on
877                         if (getParenStart(opdata) == -1)
878                         {
879                             setParenStart(opdata, idx);
880                         }
881                     }
882                     return idxNew;
883 
884                 case OP_CLOSE:
885 
886                     // Done matching subexpression
887                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
888                     {
889                         endBackref[opdata] = idx;
890                     }
891                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
892                     {
893                         // Increase valid paren count
894                         if ((opdata + 1) > parenCount)
895                         {
896                             parenCount = opdata + 1;
897                         }
898 
899                         // Don't set paren if already set later on
900                         if (getParenEnd(opdata) == -1)
901                         {
902                             setParenEnd(opdata, idx);
903                         }
904                     }
905                     return idxNew;
906 
907                 case OP_OPEN_CLUSTER:
908                 case OP_CLOSE_CLUSTER:
909                     // starting or ending the matching of a subexpression which has no backref.
910                     return matchNodes( next, maxNode, idx );
911 
912                 case OP_BACKREF:
913                     {
914                         // Get the start and end of the backref
915                         int s = startBackref[opdata];
916                         int e = endBackref[opdata];
917 
918                         // We don't know the backref yet
919                         if (s == -1 || e == -1)
920                         {
921                             return -1;
922                         }
923 
924                         // The backref is empty size
925                         if (s == e)
926                         {
927                             break;
928                         }
929 
930                         // Get the length of the backref
931                         int l = e - s;
932 
933                         // If there's not enough input left, give up.
934                         if (search.isEnd(idx + l - 1))
935                         {
936                             return -1;
937                         }
938 
939                         // Case fold the backref?
940                         final boolean caseFold =
941                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
942                         // Compare backref to input
943                         for (int i = 0; i < l; i++)
944                         {
945                             if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
946                             {
947                                 return -1;
948                             }
949                         }
950                     }
951                     break;
952 
953                 case OP_BOL:
954 
955                     // Fail if we're not at the start of the string
956                     if (idx != 0)
957                     {
958                         // If we're multiline matching, we could still be at the start of a line
959                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
960                         {
961                             // If not at start of line, give up
962                             if (idx <= 0 || !isNewline(idx - 1)) {
963                                 return -1;
964                             } else {
965                                 break;
966                             }
967                         }
968                         return -1;
969                     }
970                     break;
971 
972                 case OP_EOL:
973 
974                     // If we're not at the end of string
975                     if (!search.isEnd(0) && !search.isEnd(idx))
976                     {
977                         // If we're multi-line matching
978                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
979                         {
980                             // Give up if we're not at the end of a line
981                             if (!isNewline(idx)) {
982                                 return -1;
983                             } else {
984                                 break;
985                             }
986                         }
987                         return -1;
988                     }
989                     break;
990 
991                 case OP_ESCAPE:
992 
993                     // Which escape?
994                     switch (opdata)
995                     {
996                         // Word boundary match
997                         case E_NBOUND:
998                         case E_BOUND:
999                             {
1000                                 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
1001                                 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1002                                 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1003                                 {
1004                                     return -1;
1005                                 }
1006                             }
1007                             break;
1008 
1009                         // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1010                         case E_ALNUM:
1011                         case E_NALNUM:
1012                         case E_DIGIT:
1013                         case E_NDIGIT:
1014                         case E_SPACE:
1015                         case E_NSPACE:
1016 
1017                             // Give up if out of input
1018                             if (search.isEnd(idx))
1019                             {
1020                                 return -1;
1021                             }
1022 
1023                             char c = search.charAt(idx);
1024 
1025                             // Switch on escape
1026                             switch (opdata)
1027                             {
1028                                 case E_ALNUM:
1029                                 case E_NALNUM:
1030                                     if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1031                                     {
1032                                         return -1;
1033                                     }
1034                                     break;
1035 
1036                                 case E_DIGIT:
1037                                 case E_NDIGIT:
1038                                     if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1039                                     {
1040                                         return -1;
1041                                     }
1042                                     break;
1043 
1044                                 case E_SPACE:
1045                                 case E_NSPACE:
1046                                     if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1047                                     {
1048                                         return -1;
1049                                     }
1050                                     break;
1051                             }
1052                             idx++;
1053                             break;
1054 
1055                         default:
1056                             internalError("Unrecognized escape '" + opdata + "'");
1057                     }
1058                     break;
1059 
1060                 case OP_ANY:
1061 
1062                     if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1063                         // Match anything
1064                         if (search.isEnd(idx))
1065                         {
1066                             return -1;
1067                         }
1068                     }
1069                     else
1070                     {
1071                         // Match anything but a newline
1072                         if (search.isEnd(idx) || isNewline(idx))
1073                         {
1074                             return -1;
1075                         }
1076                     }
1077                     idx++;
1078                     break;
1079 
1080                 case OP_ATOM:
1081                     {
1082                         // Match an atom value
1083                         if (search.isEnd(idx))
1084                         {
1085                             return -1;
1086                         }
1087 
1088                         // Get length of atom and starting index
1089                         int lenAtom = opdata;
1090                         int startAtom = node + nodeSize;
1091 
1092                         // Give up if not enough input remains to have a match
1093                         if (search.isEnd(lenAtom + idx - 1))
1094                         {
1095                             return -1;
1096                         }
1097 
1098                         // Match atom differently depending on casefolding flag
1099                         final boolean caseFold =
1100                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1101 
1102                         for (int i = 0; i < lenAtom; i++)
1103                         {
1104                             if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
1105                             {
1106                                 return -1;
1107                             }
1108                         }
1109                     }
1110                     break;
1111 
1112                 case OP_POSIXCLASS:
1113                     {
1114                         // Out of input?
1115                         if (search.isEnd(idx))
1116                         {
1117                             return -1;
1118                         }
1119 
1120                         switch (opdata)
1121                         {
1122                             case POSIX_CLASS_ALNUM:
1123                                 if (!Character.isLetterOrDigit(search.charAt(idx)))
1124                                 {
1125                                     return -1;
1126                                 }
1127                                 break;
1128 
1129                             case POSIX_CLASS_ALPHA:
1130                                 if (!Character.isLetter(search.charAt(idx)))
1131                                 {
1132                                     return -1;
1133                                 }
1134                                 break;
1135 
1136                             case POSIX_CLASS_DIGIT:
1137                                 if (!Character.isDigit(search.charAt(idx)))
1138                                 {
1139                                     return -1;
1140                                 }
1141                                 break;
1142 
1143                             case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1144                                 if (!Character.isSpaceChar(search.charAt(idx)))
1145                                 {
1146                                     return -1;
1147                                 }
1148                                 break;
1149 
1150                             case POSIX_CLASS_SPACE:
1151                                 if (!Character.isWhitespace(search.charAt(idx)))
1152                                 {
1153                                     return -1;
1154                                 }
1155                                 break;
1156 
1157                             case POSIX_CLASS_CNTRL:
1158                                 if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1159                                 {
1160                                     return -1;
1161                                 }
1162                                 break;
1163 
1164                             case POSIX_CLASS_GRAPH: // JWL - bugbug???
1165                                 switch (Character.getType(search.charAt(idx)))
1166                                 {
1167                                     case Character.MATH_SYMBOL:
1168                                     case Character.CURRENCY_SYMBOL:
1169                                     case Character.MODIFIER_SYMBOL:
1170                                     case Character.OTHER_SYMBOL:
1171                                         break;
1172 
1173                                     default:
1174                                         return -1;
1175                                 }
1176                                 break;
1177 
1178                             case POSIX_CLASS_LOWER:
1179                                 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1180                                 {
1181                                     return -1;
1182                                 }
1183                                 break;
1184 
1185                             case POSIX_CLASS_UPPER:
1186                                 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1187                                 {
1188                                     return -1;
1189                                 }
1190                                 break;
1191 
1192                             case POSIX_CLASS_PRINT:
1193                                 if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1194                                 {
1195                                     return -1;
1196                                 }
1197                                 break;
1198 
1199                             case POSIX_CLASS_PUNCT:
1200                             {
1201                                 int type = Character.getType(search.charAt(idx));
1202                                 switch(type)
1203                                 {
1204                                     case Character.DASH_PUNCTUATION:
1205                                     case Character.START_PUNCTUATION:
1206                                     case Character.END_PUNCTUATION:
1207                                     case Character.CONNECTOR_PUNCTUATION:
1208                                     case Character.OTHER_PUNCTUATION:
1209                                         break;
1210 
1211                                     default:
1212                                         return -1;
1213                                 }
1214                             }
1215                             break;
1216 
1217                             case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1218                             {
1219                                 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1220                                                     (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1221                                                     (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1222                                 if (!isXDigit)
1223                                 {
1224                                     return -1;
1225                                 }
1226                             }
1227                             break;
1228 
1229                             case POSIX_CLASS_JSTART:
1230                                 if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1231                                 {
1232                                     return -1;
1233                                 }
1234                                 break;
1235 
1236                             case POSIX_CLASS_JPART:
1237                                 if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1238                                 {
1239                                     return -1;
1240                                 }
1241                                 break;
1242 
1243                             default:
1244                                 internalError("Bad posix class");
1245                                 break;
1246                         }
1247 
1248                         // Matched.
1249                         idx++;
1250                     }
1251                     break;
1252 
1253                 case OP_ANYOF:
1254                     {
1255                         // Out of input?
1256                         if (search.isEnd(idx))
1257                         {
1258                             return -1;
1259                         }
1260 
1261                         // Get character to match against character class and maybe casefold
1262                         char c = search.charAt(idx);
1263                         boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1264                         // Loop through character class checking our match character
1265                         int idxRange = node + nodeSize;
1266                         int idxEnd = idxRange + (opdata * 2);
1267                         boolean match = false;
1268                         for (int i = idxRange; !match && i < idxEnd; )
1269                         {
1270                             // Get start, end and match characters
1271                             char s = instruction[i++];
1272                             char e = instruction[i++];
1273 
1274                             match = ((compareChars(c, s, caseFold) >= 0)
1275                                      && (compareChars(c, e, caseFold) <= 0));
1276                         }
1277 
1278                         // Fail if we didn't match the character class
1279                         if (!match)
1280                         {
1281                             return -1;
1282                         }
1283                         idx++;
1284                     }
1285                     break;
1286 
1287                 case OP_BRANCH:
1288                 {
1289                     // Check for choices
1290                     if (instruction[next + offsetOpcode] != OP_BRANCH)
1291                     {
1292                         // If there aren't any other choices, just evaluate this branch.
1293                         node += nodeSize;
1294                         continue;
1295                     }
1296 
1297                     // Try all available branches
1298                     short nextBranch;
1299                     do
1300                     {
1301                         // Try matching the branch against the string
1302                         if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1303                         {
1304                             return idxNew;
1305                         }
1306 
1307                         // Go to next branch (if any)
1308                         nextBranch = (short)instruction[node + offsetNext];
1309                         node += nextBranch;
1310                     }
1311                     while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1312 
1313                     // Failed to match any branch!
1314                     return -1;
1315                 }
1316 
1317                 case OP_NOTHING:
1318                 case OP_GOTO:
1319 
1320                     // Just advance to the next node without doing anything
1321                     break;
1322 
1323                 case OP_END:
1324 
1325                     // Match has succeeded!
1326                     setParenEnd(0, idx);
1327                     return idx;
1328 
1329                 default:
1330 
1331                     // Corrupt program
1332                     internalError("Invalid opcode '" + opcode + "'");
1333             }
1334 
1335             // Advance to the next node in the program
1336             node = next;
1337         }
1338 
1339         // We "should" never end up here
1340         internalError("Corrupt program");
1341         return -1;
1342     }
1343 
1344     /**
1345      * Match the current regular expression program against the current
1346      * input string, starting at index i of the input string.  This method
1347      * is only meant for internal use.
1348      *
1349      * @param i The input string index to start matching at
1350      * @return True if the input matched the expression
1351      */
1352     protected boolean matchAt(int i)
1353     {
1354         // Initialize start pointer, paren cache and paren count
1355         start0 = -1;
1356         end0   = -1;
1357         start1 = -1;
1358         end1   = -1;
1359         start2 = -1;
1360         end2   = -1;
1361         startn = null;
1362         endn   = null;
1363         parenCount = 1;
1364         setParenStart(0, i);
1365 
1366         // Allocate backref arrays (unless optimizations indicate otherwise)
1367         if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1368         {
1369             startBackref = new int[maxParen];
1370             endBackref = new int[maxParen];
1371         }
1372 
1373         // Match against string
1374         int idx;
1375         if ((idx = matchNodes(0, maxNode, i)) != -1)
1376         {
1377             setParenEnd(0, idx);
1378             return true;
1379         }
1380 
1381         // Didn't match
1382         parenCount = 0;
1383         return false;
1384     }
1385 
1386     /**
1387      * Matches the current regular expression program against a character array,
1388      * starting at a given index.
1389      *
1390      * @param search String to match against
1391      * @param i Index to start searching at
1392      * @return True if string matched
1393      */
1394     public boolean match(String search, int i)
1395     {
1396         return match(new StringCharacterIterator(search), i);
1397     }
1398 
1399     /**
1400      * Matches the current regular expression program against a character array,
1401      * starting at a given index.
1402      *
1403      * @param search String to match against
1404      * @param i Index to start searching at
1405      * @return True if string matched
1406      */
1407     public boolean match(CharacterIterator search, int i)
1408     {
1409         // There is no compiled program to search with!
1410         if (program == null)
1411         {
1412             // This should be uncommon enough to be an error case rather
1413             // than an exception (which would have to be handled everywhere)
1414             internalError("No RE program to run!");
1415         }
1416 
1417         // Save string to search
1418         this.search = search;
1419 
1420         // Can we optimize the search by looking for a prefix string?
1421         if (program.prefix == null)
1422         {
1423             // Unprefixed matching must try for a match at each character
1424             for ( ;! search.isEnd(i - 1); i++)
1425             {
1426                 // Try a match at index i
1427                 if (matchAt(i))
1428                 {
1429                     return true;
1430                 }
1431             }
1432             return false;
1433         }
1434         else
1435         {
1436             // Prefix-anchored matching is possible
1437             boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1438             char[] prefix = program.prefix;
1439             for ( ; !search.isEnd(i + prefix.length - 1); i++)
1440             {
1441                 int j = i;
1442                 int k = 0;
1443 
1444                 boolean match;
1445                 do {
1446                     // If there's a mismatch of any character in the prefix, give up
1447                     match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
1448                 } while (match && k < prefix.length);
1449 
1450                 // See if the whole prefix string matched
1451                 if (k == prefix.length)
1452                 {
1453                     // We matched the full prefix at firstChar, so try it
1454                     if (matchAt(i))
1455                     {
1456                         return true;
1457                     }
1458                 }
1459             }
1460             return false;
1461         }
1462     }
1463 
1464     /**
1465      * Matches the current regular expression program against a String.
1466      *
1467      * @param search String to match against
1468      * @return True if string matched
1469      */
1470     public boolean match(String search)
1471     {
1472         return match(search, 0);
1473     }
1474 
1475     /**
1476      * Splits a string into an array of strings on regular expression boundaries.
1477      * This function works the same way as the Perl function of the same name.
1478      * Given a regular expression of "[ab]+" and a string to split of
1479      * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1480      * "[xyzzy, yyz, 123]".
1481      *
1482      * <p>Please note that the first string in the resulting array may be an empty
1483      * string. This happens when the very first character of input string is
1484      * matched by the pattern.
1485      *
1486      * @param s String to split on this regular exression
1487      * @return Array of strings
1488      */
1489     public String[] split(String s)
1490     {
1491         // Create new vector
1492         Vector v = new Vector();
1493 
1494         // Start at position 0 and search the whole string
1495         int pos = 0;
1496         int len = s.length();
1497 
1498         // Try a match at each position
1499         while (pos < len && match(s, pos))
1500         {
1501             // Get start of match
1502             int start = getParenStart(0);
1503 
1504             // Get end of match
1505             int newpos = getParenEnd(0);
1506 
1507             // Check if no progress was made
1508             if (newpos == pos)
1509             {
1510                 v.addElement(s.substring(pos, start + 1));
1511                 newpos++;
1512             }
1513             else
1514             {
1515                 v.addElement(s.substring(pos, start));
1516             }
1517 
1518             // Move to new position
1519             pos = newpos;
1520         }
1521 
1522         // Push remainder if it's not empty
1523         String remainder = s.substring(pos);
1524         if (remainder.length() != 0)
1525         {
1526             v.addElement(remainder);
1527         }
1528 
1529         // Return vector as an array of strings
1530         String[] ret = new String[v.size()];
1531         v.copyInto(ret);
1532         return ret;
1533     }
1534 
1535     /**
1536      * Flag bit that indicates that subst should replace all occurrences of this
1537      * regular expression.
1538      */
1539     public static final int REPLACE_ALL            = 0x0000;
1540 
1541     /**
1542      * Flag bit that indicates that subst should only replace the first occurrence
1543      * of this regular expression.
1544      */
1545     public static final int REPLACE_FIRSTONLY      = 0x0001;
1546 
1547     /**
1548      * Flag bit that indicates that subst should replace backreferences
1549      */
1550     public static final int REPLACE_BACKREFERENCES = 0x0002;
1551 
1552     /**
1553      * Substitutes a string for this regular expression in another string.
1554      * This method works like the Perl function of the same name.
1555      * Given a regular expression of "a*b", a String to substituteIn of
1556      * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1557      * resulting String returned by subst would be "-foo-garply-wacky-".
1558      *
1559      * @param substituteIn String to substitute within
1560      * @param substitution String to substitute for all matches of this regular expression.
1561      * @return The string substituteIn with zero or more occurrences of the current
1562      * regular expression replaced with the substitution String (if this regular
1563      * expression object doesn't match at any position, the original String is returned
1564      * unchanged).
1565      */
1566     public String subst(String substituteIn, String substitution)
1567     {
1568         return subst(substituteIn, substitution, REPLACE_ALL);
1569     }
1570 
1571     /**
1572      * Substitutes a string for this regular expression in another string.
1573      * This method works like the Perl function of the same name.
1574      * Given a regular expression of "a*b", a String to substituteIn of
1575      * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1576      * resulting String returned by subst would be "-foo-garply-wacky-".
1577      * <p>
1578      * It is also possible to reference the contents of a parenthesized expression
1579      * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1580      * a String to substituteIn of "visit us: http://www.apache.org!" and the
1581      * substitution String "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
1582      * returned by subst would be
1583      * "visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
1584      * <p>
1585      * <i>Note:</i> $0 represents the whole match.
1586      *
1587      * @param substituteIn String to substitute within
1588      * @param substitution String to substitute for matches of this regular expression
1589      * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
1590      * flag bit is set, only the first occurrence of this regular expression is replaced.
1591      * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1592      * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1593      * be processed.
1594      * @return The string substituteIn with zero or more occurrences of the current
1595      * regular expression replaced with the substitution String (if this regular
1596      * expression object doesn't match at any position, the original String is returned
1597      * unchanged).
1598      */
1599     public String subst(String substituteIn, String substitution, int flags)
1600     {
1601         // String to return
1602         StringBuffer ret = new StringBuffer();
1603 
1604         // Start at position 0 and search the whole string
1605         int pos = 0;
1606         int len = substituteIn.length();
1607 
1608         // Try a match at each position
1609         while (pos < len && match(substituteIn, pos))
1610         {
1611             // Append string before match
1612             ret.append(substituteIn.substring(pos, getParenStart(0)));
1613 
1614             if ((flags & REPLACE_BACKREFERENCES) != 0)
1615             {
1616                 // Process backreferences
1617                 int lCurrentPosition = 0;
1618                 int lLastPosition = -2;
1619                 int lLength = substitution.length();
1620                 boolean bAddedPrefix = false;
1621 
1622                 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
1623                 {
1624                     if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
1625                         && lCurrentPosition+1 < lLength)
1626                     {
1627                         char c = substitution.charAt(lCurrentPosition + 1);
1628                         if (c >= '0' && c <= '9')
1629                         {
1630                             if (bAddedPrefix == false)
1631                             {
1632                                 // Append everything between the beginning of the
1633                                 // substitution string and the current $ sign
1634                                 ret.append(substitution.substring(0, lCurrentPosition));
1635                                 bAddedPrefix = true;
1636                             }
1637                             else
1638                             {
1639                                 // Append everything between the last and the current $ sign
1640                                 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
1641                             }
1642 
1643                             // Append the parenthesized expression
1644                             // Note: if a parenthesized expression of the requested
1645                             // index is not available "null" is added to the string
1646                             ret.append(getParen(c - '0'));
1647                             lLastPosition = lCurrentPosition;
1648                         }
1649                     }
1650 
1651                     // Move forward, skipping past match
1652                     lCurrentPosition++;
1653                 }
1654 
1655                 // Append everything after the last $ sign
1656                 ret.append(substitution.substring(lLastPosition + 2, lLength));
1657             }
1658             else
1659             {
1660                 // Append substitution without processing backreferences
1661                 ret.append(substitution);
1662             }
1663 
1664             // Move forward, skipping past match
1665             int newpos = getParenEnd(0);
1666 
1667             // We always want to make progress!
1668             if (newpos == pos)
1669             {
1670                 newpos++;
1671             }
1672 
1673             // Try new position
1674             pos = newpos;
1675 
1676             // Break out if we're only supposed to replace one occurrence
1677             if ((flags & REPLACE_FIRSTONLY) != 0)
1678             {
1679                 break;
1680             }
1681         }
1682 
1683         // If there's remaining input, append it
1684         if (pos < len)
1685         {
1686             ret.append(substituteIn.substring(pos));
1687         }
1688 
1689         // Return string buffer as string
1690         return ret.toString();
1691     }
1692 
1693     /**
1694      * Returns an array of Strings, whose toString representation matches a regular
1695      * expression. This method works like the Perl function of the same name.  Given
1696      * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1697      * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1698      *
1699      * @param search Array of Objects to search
1700      * @return Array of Strings whose toString() value matches this regular expression.
1701      */
1702     public String[] grep(Object[] search)
1703     {
1704         // Create new vector to hold return items
1705         Vector v = new Vector();
1706 
1707         // Traverse array of objects
1708         for (int i = 0; i < search.length; i++)
1709         {
1710             // Get next object as a string
1711             String s = search[i].toString();
1712 
1713             // If it matches this regexp, add it to the list
1714             if (match(s))
1715             {
1716                 v.addElement(s);
1717             }
1718         }
1719 
1720         // Return vector as an array of strings
1721         String[] ret = new String[v.size()];
1722         v.copyInto(ret);
1723         return ret;
1724     }
1725 
1726     /**
1727      * @return true if character at i-th position in the <code>search</code> string is a newline
1728      */
1729     private boolean isNewline(int i)
1730     {
1731         char nextChar = search.charAt(i);
1732 
1733         if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
1734             || nextChar == '\u2028' || nextChar == '\u2029')
1735         {
1736             return true;
1737         }
1738 
1739         return false;
1740     }
1741 
1742     /**
1743      * Compares two characters.
1744      *
1745      * @param c1 first character to compare.
1746      * @param c2 second character to compare.
1747      * @param caseIndependent whether comparision is case insensitive or not.
1748      * @return negative, 0, or positive integer as the first character
1749      *         less than, equal to, or greater then the second.
1750      */
1751     private int compareChars(char c1, char c2, boolean caseIndependent)
1752     {
1753         if (caseIndependent)
1754         {
1755             c1 = Character.toLowerCase(c1);
1756             c2 = Character.toLowerCase(c2);
1757         }
1758         return ((int)c1 - (int)c2);
1759     }
1760 }