View Javadoc
1   /*
2    * reserved comment block
3    * DO NOT REMOVE OR ALTER!
4    */
5   /*
6    * Copyright 1999-2004 The Apache Software Foundation.
7    *
8    * Licensed under the Apache License, Version 2.0 (the "License");
9    * you may not use this file except in compliance with the License.
10   * You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package com.sun.org.apache.regexp.internal;
22  
23  import com.sun.org.apache.regexp.internal.RE;
24  import java.util.Hashtable;
25  
26  /**
27   * A regular expression compiler class.  This class compiles a pattern string into a
28   * regular expression program interpretable by the RE evaluator class.  The 'recompile'
29   * command line tool uses this compiler to pre-compile regular expressions for use
30   * with RE.  For a description of the syntax accepted by RECompiler and what you can
31   * do with regular expressions, see the documentation for the RE matcher class.
32   *
33   * @see RE
34   * @see recompile
35   *
36   * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
37   * @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a>
38   */
39  public class RECompiler
40  {
41      // The compiled program
42      char[] instruction;                                 // The compiled RE 'program' instruction buffer
43      int lenInstruction;                                 // The amount of the program buffer currently in use
44  
45      // Input state for compiling regular expression
46      String pattern;                                     // Input string
47      int len;                                            // Length of the pattern string
48      int idx;                                            // Current input index into ac
49      int parens;                                         // Total number of paren pairs
50  
51      // Node flags
52      static final int NODE_NORMAL   = 0;                 // No flags (nothing special)
53      static final int NODE_NULLABLE = 1;                 // True if node is potentially null
54      static final int NODE_TOPLEVEL = 2;                 // True if top level expr
55  
56      // Special types of 'escapes'
57      static final int ESC_MASK      = 0xffff0;           // Escape complexity mask
58      static final int ESC_BACKREF   = 0xfffff;           // Escape is really a backreference
59      static final int ESC_COMPLEX   = 0xffffe;           // Escape isn't really a true character
60      static final int ESC_CLASS     = 0xffffd;           // Escape represents a whole class of characters
61  
62      // {m,n} stacks
63      int maxBrackets = 10;                               // Maximum number of bracket pairs
64      static final int bracketUnbounded = -1;             // Unbounded value
65      int brackets = 0;                                   // Number of bracket sets
66      int[] bracketStart = null;                          // Starting point
67      int[] bracketEnd = null;                            // Ending point
68      int[] bracketMin = null;                            // Minimum number of matches
69      int[] bracketOpt = null;                            // Additional optional matches
70  
71      // Lookup table for POSIX character class names
72      static Hashtable hashPOSIX = new Hashtable();
73      static
74      {
75          hashPOSIX.put("alnum",     new Character(RE.POSIX_CLASS_ALNUM));
76          hashPOSIX.put("alpha",     new Character(RE.POSIX_CLASS_ALPHA));
77          hashPOSIX.put("blank",     new Character(RE.POSIX_CLASS_BLANK));
78          hashPOSIX.put("cntrl",     new Character(RE.POSIX_CLASS_CNTRL));
79          hashPOSIX.put("digit",     new Character(RE.POSIX_CLASS_DIGIT));
80          hashPOSIX.put("graph",     new Character(RE.POSIX_CLASS_GRAPH));
81          hashPOSIX.put("lower",     new Character(RE.POSIX_CLASS_LOWER));
82          hashPOSIX.put("print",     new Character(RE.POSIX_CLASS_PRINT));
83          hashPOSIX.put("punct",     new Character(RE.POSIX_CLASS_PUNCT));
84          hashPOSIX.put("space",     new Character(RE.POSIX_CLASS_SPACE));
85          hashPOSIX.put("upper",     new Character(RE.POSIX_CLASS_UPPER));
86          hashPOSIX.put("xdigit",    new Character(RE.POSIX_CLASS_XDIGIT));
87          hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
88          hashPOSIX.put("javapart",  new Character(RE.POSIX_CLASS_JPART));
89      }
90  
91      /**
92       * Constructor.  Creates (initially empty) storage for a regular expression program.
93       */
94      public RECompiler()
95      {
96          // Start off with a generous, yet reasonable, initial size
97          instruction = new char[128];
98          lenInstruction = 0;
99      }
100 
101     /**
102      * Ensures that n more characters can fit in the program buffer.
103      * If n more can't fit, then the size is doubled until it can.
104      * @param n Number of additional characters to ensure will fit.
105      */
106     void ensure(int n)
107     {
108         // Get current program length
109         int curlen = instruction.length;
110 
111         // If the current length + n more is too much
112         if (lenInstruction + n >= curlen)
113         {
114             // Double the size of the program array until n more will fit
115             while (lenInstruction + n >= curlen)
116             {
117                 curlen *= 2;
118             }
119 
120             // Allocate new program array and move data into it
121             char[] newInstruction = new char[curlen];
122             System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
123             instruction = newInstruction;
124         }
125     }
126 
127     /**
128      * Emit a single character into the program stream.
129      * @param c Character to add
130      */
131     void emit(char c)
132     {
133         // Make room for character
134         ensure(1);
135 
136         // Add character
137         instruction[lenInstruction++] = c;
138     }
139 
140     /**
141      * Inserts a node with a given opcode and opdata at insertAt.  The node relative next
142      * pointer is initialized to 0.
143      * @param opcode Opcode for new node
144      * @param opdata Opdata for new node (only the low 16 bits are currently used)
145      * @param insertAt Index at which to insert the new node in the program
146      */
147     void nodeInsert(char opcode, int opdata, int insertAt)
148     {
149         // Make room for a new node
150         ensure(RE.nodeSize);
151 
152         // Move everything from insertAt to the end down nodeSize elements
153         System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
154         instruction[insertAt + RE.offsetOpcode] = opcode;
155         instruction[insertAt + RE.offsetOpdata] = (char)opdata;
156         instruction[insertAt + RE.offsetNext] = 0;
157         lenInstruction += RE.nodeSize;
158     }
159 
160     /**
161      * Appends a node to the end of a node chain
162      * @param node Start of node chain to traverse
163      * @param pointTo Node to have the tail of the chain point to
164      */
165     void setNextOfEnd(int node, int pointTo)
166     {
167         // Traverse the chain until the next offset is 0
168         int next = instruction[node + RE.offsetNext];
169         // while the 'node' is not the last in the chain
170         // and the 'node' is not the last in the program.
171         while ( next != 0 && node < lenInstruction )
172         {
173             // if the node we are supposed to point to is in the chain then
174             // point to the end of the program instead.
175             // Michael McCallum <gholam@xtra.co.nz>
176             // FIXME: // This is a _hack_ to stop infinite programs.
177             // I believe that the implementation of the reluctant matches is wrong but
178             // have not worked out a better way yet.
179             if ( node == pointTo ) {
180               pointTo = lenInstruction;
181             }
182             node += next;
183             next = instruction[node + RE.offsetNext];
184         }
185         // if we have reached the end of the program then dont set the pointTo.
186         // im not sure if this will break any thing but passes all the tests.
187         if ( node < lenInstruction ) {
188             // Point the last node in the chain to pointTo.
189             instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
190         }
191     }
192 
193     /**
194      * Adds a new node
195      * @param opcode Opcode for node
196      * @param opdata Opdata for node (only the low 16 bits are currently used)
197      * @return Index of new node in program
198      */
199     int node(char opcode, int opdata)
200     {
201         // Make room for a new node
202         ensure(RE.nodeSize);
203 
204         // Add new node at end
205         instruction[lenInstruction + RE.offsetOpcode] = opcode;
206         instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
207         instruction[lenInstruction + RE.offsetNext] = 0;
208         lenInstruction += RE.nodeSize;
209 
210         // Return index of new node
211         return lenInstruction - RE.nodeSize;
212     }
213 
214 
215     /**
216      * Throws a new internal error exception
217      * @exception Error Thrown in the event of an internal error.
218      */
219     void internalError() throws Error
220     {
221         throw new Error("Internal error!");
222     }
223 
224     /**
225      * Throws a new syntax error exception
226      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
227      */
228     void syntaxError(String s) throws RESyntaxException
229     {
230         throw new RESyntaxException(s);
231     }
232 
233     /**
234      * Allocate storage for brackets only as needed
235      */
236     void allocBrackets()
237     {
238         // Allocate bracket stacks if not already done
239         if (bracketStart == null)
240         {
241             // Allocate storage
242             bracketStart = new int[maxBrackets];
243             bracketEnd   = new int[maxBrackets];
244             bracketMin   = new int[maxBrackets];
245             bracketOpt   = new int[maxBrackets];
246 
247             // Initialize to invalid values
248             for (int i = 0; i < maxBrackets; i++)
249             {
250                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
251             }
252         }
253     }
254 
255     /** Enlarge storage for brackets only as needed. */
256     synchronized void reallocBrackets() {
257         // trick the tricky
258         if (bracketStart == null) {
259             allocBrackets();
260         }
261 
262         int new_size = maxBrackets * 2;
263         int[] new_bS = new int[new_size];
264         int[] new_bE = new int[new_size];
265         int[] new_bM = new int[new_size];
266         int[] new_bO = new int[new_size];
267         // Initialize to invalid values
268         for (int i=brackets; i<new_size; i++) {
269             new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1;
270         }
271         System.arraycopy(bracketStart,0, new_bS,0, brackets);
272         System.arraycopy(bracketEnd,0,   new_bE,0, brackets);
273         System.arraycopy(bracketMin,0,   new_bM,0, brackets);
274         System.arraycopy(bracketOpt,0,   new_bO,0, brackets);
275         bracketStart = new_bS;
276         bracketEnd   = new_bE;
277         bracketMin   = new_bM;
278         bracketOpt   = new_bO;
279         maxBrackets  = new_size;
280     }
281 
282     /**
283      * Match bracket {m,n} expression put results in bracket member variables
284      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
285      */
286     void bracket() throws RESyntaxException
287     {
288         // Current character must be a '{'
289         if (idx >= len || pattern.charAt(idx++) != '{')
290         {
291             internalError();
292         }
293 
294         // Next char must be a digit
295         if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
296         {
297             syntaxError("Expected digit");
298         }
299 
300         // Get min ('m' of {m,n}) number
301         StringBuffer number = new StringBuffer();
302         while (idx < len && Character.isDigit(pattern.charAt(idx)))
303         {
304             number.append(pattern.charAt(idx++));
305         }
306         try
307         {
308             bracketMin[brackets] = Integer.parseInt(number.toString());
309         }
310         catch (NumberFormatException e)
311         {
312             syntaxError("Expected valid number");
313         }
314 
315         // If out of input, fail
316         if (idx >= len)
317         {
318             syntaxError("Expected comma or right bracket");
319         }
320 
321         // If end of expr, optional limit is 0
322         if (pattern.charAt(idx) == '}')
323         {
324             idx++;
325             bracketOpt[brackets] = 0;
326             return;
327         }
328 
329         // Must have at least {m,} and maybe {m,n}.
330         if (idx >= len || pattern.charAt(idx++) != ',')
331         {
332             syntaxError("Expected comma");
333         }
334 
335         // If out of input, fail
336         if (idx >= len)
337         {
338             syntaxError("Expected comma or right bracket");
339         }
340 
341         // If {m,} max is unlimited
342         if (pattern.charAt(idx) == '}')
343         {
344             idx++;
345             bracketOpt[brackets] = bracketUnbounded;
346             return;
347         }
348 
349         // Next char must be a digit
350         if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
351         {
352             syntaxError("Expected digit");
353         }
354 
355         // Get max number
356         number.setLength(0);
357         while (idx < len && Character.isDigit(pattern.charAt(idx)))
358         {
359             number.append(pattern.charAt(idx++));
360         }
361         try
362         {
363             bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
364         }
365         catch (NumberFormatException e)
366         {
367             syntaxError("Expected valid number");
368         }
369 
370         // Optional repetitions must be >= 0
371         if (bracketOpt[brackets] < 0)
372         {
373             syntaxError("Bad range");
374         }
375 
376         // Must have close brace
377         if (idx >= len || pattern.charAt(idx++) != '}')
378         {
379             syntaxError("Missing close brace");
380         }
381     }
382 
383     /**
384      * Match an escape sequence.  Handles quoted chars and octal escapes as well
385      * as normal escape characters.  Always advances the input stream by the
386      * right amount. This code "understands" the subtle difference between an
387      * octal escape and a backref.  You can access the type of ESC_CLASS or
388      * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
389      * @return ESC_* code or character if simple escape
390      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
391      */
392     int escape() throws RESyntaxException
393     {
394         // "Shouldn't" happen
395         if (pattern.charAt(idx) != '\\')
396         {
397             internalError();
398         }
399 
400         // Escape shouldn't occur as last character in string!
401         if (idx + 1 == len)
402         {
403             syntaxError("Escape terminates string");
404         }
405 
406         // Switch on character after backslash
407         idx += 2;
408         char escapeChar = pattern.charAt(idx - 1);
409         switch (escapeChar)
410         {
411             case RE.E_BOUND:
412             case RE.E_NBOUND:
413                 return ESC_COMPLEX;
414 
415             case RE.E_ALNUM:
416             case RE.E_NALNUM:
417             case RE.E_SPACE:
418             case RE.E_NSPACE:
419             case RE.E_DIGIT:
420             case RE.E_NDIGIT:
421                 return ESC_CLASS;
422 
423             case 'u':
424             case 'x':
425                 {
426                     // Exact required hex digits for escape type
427                     int hexDigits = (escapeChar == 'u' ? 4 : 2);
428 
429                     // Parse up to hexDigits characters from input
430                     int val = 0;
431                     for ( ; idx < len && hexDigits-- > 0; idx++)
432                     {
433                         // Get char
434                         char c = pattern.charAt(idx);
435 
436                         // If it's a hexadecimal digit (0-9)
437                         if (c >= '0' && c <= '9')
438                         {
439                             // Compute new value
440                             val = (val << 4) + c - '0';
441                         }
442                         else
443                         {
444                             // If it's a hexadecimal letter (a-f)
445                             c = Character.toLowerCase(c);
446                             if (c >= 'a' && c <= 'f')
447                             {
448                                 // Compute new value
449                                 val = (val << 4) + (c - 'a') + 10;
450                             }
451                             else
452                             {
453                                 // If it's not a valid digit or hex letter, the escape must be invalid
454                                 // because hexDigits of input have not been absorbed yet.
455                                 syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
456                             }
457                         }
458                     }
459                     return val;
460                 }
461 
462             case 't':
463                 return '\t';
464 
465             case 'n':
466                 return '\n';
467 
468             case 'r':
469                 return '\r';
470 
471             case 'f':
472                 return '\f';
473 
474             case '0':
475             case '1':
476             case '2':
477             case '3':
478             case '4':
479             case '5':
480             case '6':
481             case '7':
482             case '8':
483             case '9':
484 
485                 // An octal escape starts with a 0 or has two digits in a row
486                 if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
487                 {
488                     // Handle \nnn octal escapes
489                     int val = escapeChar - '0';
490                     if (idx < len && Character.isDigit(pattern.charAt(idx)))
491                     {
492                         val = ((val << 3) + (pattern.charAt(idx++) - '0'));
493                         if (idx < len && Character.isDigit(pattern.charAt(idx)))
494                         {
495                             val = ((val << 3) + (pattern.charAt(idx++) - '0'));
496                         }
497                     }
498                     return val;
499                 }
500 
501                 // It's actually a backreference (\[1-9]), not an escape
502                 return ESC_BACKREF;
503 
504             default:
505 
506                 // Simple quoting of a character
507                 return escapeChar;
508         }
509     }
510 
511     /**
512      * Compile a character class
513      * @return Index of class node
514      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
515      */
516     int characterClass() throws RESyntaxException
517     {
518         // Check for bad calling or empty class
519         if (pattern.charAt(idx) != '[')
520         {
521             internalError();
522         }
523 
524         // Check for unterminated or empty class
525         if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
526         {
527             syntaxError("Empty or unterminated class");
528         }
529 
530         // Check for POSIX character class
531         if (idx < len && pattern.charAt(idx) == ':')
532         {
533             // Skip colon
534             idx++;
535 
536             // POSIX character classes are denoted with lowercase ASCII strings
537             int idxStart = idx;
538             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
539             {
540                 idx++;
541             }
542 
543             // Should be a ":]" to terminate the POSIX character class
544             if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
545             {
546                 // Get character class
547                 String charClass = pattern.substring(idxStart, idx);
548 
549                 // Select the POSIX class id
550                 Character i = (Character)hashPOSIX.get(charClass);
551                 if (i != null)
552                 {
553                     // Move past colon and right bracket
554                     idx += 2;
555 
556                     // Return new POSIX character class node
557                     return node(RE.OP_POSIXCLASS, i.charValue());
558                 }
559                 syntaxError("Invalid POSIX character class '" + charClass + "'");
560             }
561             syntaxError("Invalid POSIX character class syntax");
562         }
563 
564         // Try to build a class.  Create OP_ANYOF node
565         int ret = node(RE.OP_ANYOF, 0);
566 
567         // Parse class declaration
568         char CHAR_INVALID = Character.MAX_VALUE;
569         char last = CHAR_INVALID;
570         char simpleChar = 0;
571         boolean include = true;
572         boolean definingRange = false;
573         int idxFirst = idx;
574         char rangeStart = Character.MIN_VALUE;
575         char rangeEnd;
576         RERange range = new RERange();
577         while (idx < len && pattern.charAt(idx) != ']')
578         {
579 
580             switchOnCharacter:
581 
582             // Switch on character
583             switch (pattern.charAt(idx))
584             {
585                 case '^':
586                     include = !include;
587                     if (idx == idxFirst)
588                     {
589                         range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
590                     }
591                     idx++;
592                     continue;
593 
594                 case '\\':
595                 {
596                     // Escape always advances the stream
597                     int c;
598                     switch (c = escape ())
599                     {
600                         case ESC_COMPLEX:
601                         case ESC_BACKREF:
602 
603                             // Word boundaries and backrefs not allowed in a character class!
604                             syntaxError("Bad character class");
605 
606                         case ESC_CLASS:
607 
608                             // Classes can't be an endpoint of a range
609                             if (definingRange)
610                             {
611                                 syntaxError("Bad character class");
612                             }
613 
614                             // Handle specific type of class (some are ok)
615                             switch (pattern.charAt(idx - 1))
616                             {
617                                 case RE.E_NSPACE:
618                                 case RE.E_NDIGIT:
619                                 case RE.E_NALNUM:
620                                     syntaxError("Bad character class");
621 
622                                 case RE.E_SPACE:
623                                     range.include('\t', include);
624                                     range.include('\r', include);
625                                     range.include('\f', include);
626                                     range.include('\n', include);
627                                     range.include('\b', include);
628                                     range.include(' ', include);
629                                     break;
630 
631                                 case RE.E_ALNUM:
632                                     range.include('a', 'z', include);
633                                     range.include('A', 'Z', include);
634                                     range.include('_', include);
635 
636                                     // Fall through!
637 
638                                 case RE.E_DIGIT:
639                                     range.include('0', '9', include);
640                                     break;
641                             }
642 
643                             // Make last char invalid (can't be a range start)
644                             last = CHAR_INVALID;
645                             break;
646 
647                         default:
648 
649                             // Escape is simple so treat as a simple char
650                             simpleChar = (char) c;
651                             break switchOnCharacter;
652                     }
653                 }
654                 continue;
655 
656                 case '-':
657 
658                     // Start a range if one isn't already started
659                     if (definingRange)
660                     {
661                         syntaxError("Bad class range");
662                     }
663                     definingRange = true;
664 
665                     // If no last character, start of range is 0
666                     rangeStart = (last == CHAR_INVALID ? 0 : last);
667 
668                     // Premature end of range. define up to Character.MAX_VALUE
669                     if ((idx + 1) < len && pattern.charAt(++idx) == ']')
670                     {
671                         simpleChar = Character.MAX_VALUE;
672                         break;
673                     }
674                     continue;
675 
676                 default:
677                     simpleChar = pattern.charAt(idx++);
678                     break;
679             }
680 
681             // Handle simple character simpleChar
682             if (definingRange)
683             {
684                 // if we are defining a range make it now
685                 rangeEnd = simpleChar;
686 
687                 // Actually create a range if the range is ok
688                 if (rangeStart >= rangeEnd)
689                 {
690                     syntaxError("Bad character class");
691                 }
692                 range.include(rangeStart, rangeEnd, include);
693 
694                 // We are done defining the range
695                 last = CHAR_INVALID;
696                 definingRange = false;
697             }
698             else
699             {
700                 // If simple character and not start of range, include it
701                 if (idx >= len || pattern.charAt(idx) != '-')
702                 {
703                     range.include(simpleChar, include);
704                 }
705                 last = simpleChar;
706             }
707         }
708 
709         // Shouldn't be out of input
710         if (idx == len)
711         {
712             syntaxError("Unterminated character class");
713         }
714 
715         // Absorb the ']' end of class marker
716         idx++;
717 
718         // Emit character class definition
719         instruction[ret + RE.offsetOpdata] = (char)range.num;
720         for (int i = 0; i < range.num; i++)
721         {
722             emit((char)range.minRange[i]);
723             emit((char)range.maxRange[i]);
724         }
725         return ret;
726     }
727 
728     /**
729      * Absorb an atomic character string.  This method is a little tricky because
730      * it can un-include the last character of string if a closure operator follows.
731      * This is correct because *+? have higher precedence than concatentation (thus
732      * ABC* means AB(C*) and NOT (ABC)*).
733      * @return Index of new atom node
734      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
735      */
736     int atom() throws RESyntaxException
737     {
738         // Create a string node
739         int ret = node(RE.OP_ATOM, 0);
740 
741         // Length of atom
742         int lenAtom = 0;
743 
744         // Loop while we've got input
745 
746         atomLoop:
747 
748         while (idx < len)
749         {
750             // Is there a next char?
751             if ((idx + 1) < len)
752             {
753                 char c = pattern.charAt(idx + 1);
754 
755                 // If the next 'char' is an escape, look past the whole escape
756                 if (pattern.charAt(idx) == '\\')
757                 {
758                     int idxEscape = idx;
759                     escape();
760                     if (idx < len)
761                     {
762                         c = pattern.charAt(idx);
763                     }
764                     idx = idxEscape;
765                 }
766 
767                 // Switch on next char
768                 switch (c)
769                 {
770                     case '{':
771                     case '?':
772                     case '*':
773                     case '+':
774 
775                         // If the next character is a closure operator and our atom is non-empty, the
776                         // current character should bind to the closure operator rather than the atom
777                         if (lenAtom != 0)
778                         {
779                             break atomLoop;
780                         }
781                 }
782             }
783 
784             // Switch on current char
785             switch (pattern.charAt(idx))
786             {
787                 case ']':
788                 case '^':
789                 case '$':
790                 case '.':
791                 case '[':
792                 case '(':
793                 case ')':
794                 case '|':
795                     break atomLoop;
796 
797                 case '{':
798                 case '?':
799                 case '*':
800                 case '+':
801 
802                     // We should have an atom by now
803                     if (lenAtom == 0)
804                     {
805                         // No atom before closure
806                         syntaxError("Missing operand to closure");
807                     }
808                     break atomLoop;
809 
810                 case '\\':
811 
812                     {
813                         // Get the escaped character (advances input automatically)
814                         int idxBeforeEscape = idx;
815                         int c = escape();
816 
817                         // Check if it's a simple escape (as opposed to, say, a backreference)
818                         if ((c & ESC_MASK) == ESC_MASK)
819                         {
820                             // Not a simple escape, so backup to where we were before the escape.
821                             idx = idxBeforeEscape;
822                             break atomLoop;
823                         }
824 
825                         // Add escaped char to atom
826                         emit((char) c);
827                         lenAtom++;
828                     }
829                     break;
830 
831                 default:
832 
833                     // Add normal character to atom
834                     emit(pattern.charAt(idx++));
835                     lenAtom++;
836                     break;
837             }
838         }
839 
840         // This "shouldn't" happen
841         if (lenAtom == 0)
842         {
843             internalError();
844         }
845 
846         // Emit the atom length into the program
847         instruction[ret + RE.offsetOpdata] = (char)lenAtom;
848         return ret;
849     }
850 
851     /**
852      * Match a terminal node.
853      * @param flags Flags
854      * @return Index of terminal node (closeable)
855      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
856      */
857     int terminal(int[] flags) throws RESyntaxException
858     {
859         switch (pattern.charAt(idx))
860         {
861         case RE.OP_EOL:
862         case RE.OP_BOL:
863         case RE.OP_ANY:
864             return node(pattern.charAt(idx++), 0);
865 
866         case '[':
867             return characterClass();
868 
869         case '(':
870             return expr(flags);
871 
872         case ')':
873             syntaxError("Unexpected close paren");
874 
875         case '|':
876             internalError();
877 
878         case ']':
879             syntaxError("Mismatched class");
880 
881         case 0:
882             syntaxError("Unexpected end of input");
883 
884         case '?':
885         case '+':
886         case '{':
887         case '*':
888             syntaxError("Missing operand to closure");
889 
890         case '\\':
891             {
892                 // Don't forget, escape() advances the input stream!
893                 int idxBeforeEscape = idx;
894 
895                 // Switch on escaped character
896                 switch (escape())
897                 {
898                     case ESC_CLASS:
899                     case ESC_COMPLEX:
900                         flags[0] &= ~NODE_NULLABLE;
901                         return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
902 
903                     case ESC_BACKREF:
904                         {
905                             char backreference = (char)(pattern.charAt(idx - 1) - '0');
906                             if (parens <= backreference)
907                             {
908                                 syntaxError("Bad backreference");
909                             }
910                             flags[0] |= NODE_NULLABLE;
911                             return node(RE.OP_BACKREF, backreference);
912                         }
913 
914                     default:
915 
916                         // We had a simple escape and we want to have it end up in
917                         // an atom, so we back up and fall though to the default handling
918                         idx = idxBeforeEscape;
919                         flags[0] &= ~NODE_NULLABLE;
920                         break;
921                 }
922             }
923         }
924 
925         // Everything above either fails or returns.
926         // If it wasn't one of the above, it must be the start of an atom.
927         flags[0] &= ~NODE_NULLABLE;
928         return atom();
929     }
930 
931     /**
932      * Compile a possibly closured terminal
933      * @param flags Flags passed by reference
934      * @return Index of closured node
935      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
936      */
937     int closure(int[] flags) throws RESyntaxException
938     {
939         // Before terminal
940         int idxBeforeTerminal = idx;
941 
942         // Values to pass by reference to terminal()
943         int[] terminalFlags = { NODE_NORMAL };
944 
945         // Get terminal symbol
946         int ret = terminal(terminalFlags);
947 
948         // Or in flags from terminal symbol
949         flags[0] |= terminalFlags[0];
950 
951         // Advance input, set NODE_NULLABLE flag and do sanity checks
952         if (idx >= len)
953         {
954             return ret;
955         }
956         boolean greedy = true;
957         char closureType = pattern.charAt(idx);
958         switch (closureType)
959         {
960             case '?':
961             case '*':
962 
963                 // The current node can be null
964                 flags[0] |= NODE_NULLABLE;
965 
966             case '+':
967 
968                 // Eat closure character
969                 idx++;
970 
971             case '{':
972 
973                 // Don't allow blantant stupidity
974                 int opcode = instruction[ret + RE.offsetOpcode];
975                 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
976                 {
977                     syntaxError("Bad closure operand");
978                 }
979                 if ((terminalFlags[0] & NODE_NULLABLE) != 0)
980                 {
981                     syntaxError("Closure operand can't be nullable");
982                 }
983                 break;
984         }
985 
986         // If the next character is a '?', make the closure non-greedy (reluctant)
987         if (idx < len && pattern.charAt(idx) == '?')
988         {
989             idx++;
990             greedy = false;
991         }
992 
993         if (greedy)
994         {
995             // Actually do the closure now
996             switch (closureType)
997             {
998                 case '{':
999                 {
1000                     // We look for our bracket in the list
1001                     boolean found = false;
1002                     int i;
1003                     allocBrackets();
1004                     for (i = 0; i < brackets; i++)
1005                     {
1006                         if (bracketStart[i] == idx)
1007                         {
1008                             found = true;
1009                             break;
1010                         }
1011                     }
1012 
1013                     // If its not in the list we parse the {m,n}
1014                     if (!found)
1015                     {
1016                         if (brackets >= maxBrackets)
1017                         {
1018                             reallocBrackets();
1019                         }
1020                         bracketStart[brackets] = idx;
1021                         bracket();
1022                         bracketEnd[brackets] = idx;
1023                         i = brackets++;
1024                     }
1025 
1026                     // Process min first
1027                     if (bracketMin[i]-- > 0)
1028                     {
1029                         if (bracketMin[i] > 0 || bracketOpt[i] != 0) {
1030                             // Rewind stream and run it through again - more matchers coming
1031                             for (int j = 0; j < brackets; j++) {
1032                                 if (j != i && bracketStart[j] < idx
1033                                     && bracketStart[j] >= idxBeforeTerminal)
1034                                 {
1035                                     brackets--;
1036                                     bracketStart[j] = bracketStart[brackets];
1037                                     bracketEnd[j] = bracketEnd[brackets];
1038                                     bracketMin[j] = bracketMin[brackets];
1039                                     bracketOpt[j] = bracketOpt[brackets];
1040                                 }
1041                             }
1042 
1043                             idx = idxBeforeTerminal;
1044                         } else {
1045                             // Bug #1030: No optinal matches - no need to rewind
1046                             idx = bracketEnd[i];
1047                         }
1048                         break;
1049                     }
1050 
1051                     // Do the right thing for maximum ({m,})
1052                     if (bracketOpt[i] == bracketUnbounded)
1053                     {
1054                         // Drop through now and closure expression.
1055                         // We are done with the {m,} expr, so skip rest
1056                         closureType = '*';
1057                         bracketOpt[i] = 0;
1058                         idx = bracketEnd[i];
1059                     }
1060                     else
1061                         if (bracketOpt[i]-- > 0)
1062                         {
1063                             if (bracketOpt[i] > 0)
1064                             {
1065                                 // More optional matchers - 'play it again sam!'
1066                                 idx = idxBeforeTerminal;
1067                             } else {
1068                                 // Bug #1030: We are done - this one is last and optional
1069                                 idx = bracketEnd[i];
1070                             }
1071                             // Drop through to optionally close
1072                             closureType = '?';
1073                         }
1074                         else
1075                         {
1076                             // Rollback terminal - neither min nor opt matchers present
1077                             lenInstruction = ret;
1078                             node(RE.OP_NOTHING, 0);
1079 
1080                             // We are done. skip the rest of {m,n} expr
1081                             idx = bracketEnd[i];
1082                             break;
1083                         }
1084                 }
1085 
1086                 // Fall through!
1087 
1088                 case '?':
1089                 case '*':
1090 
1091                     if (!greedy)
1092                     {
1093                         break;
1094                     }
1095 
1096                     if (closureType == '?')
1097                     {
1098                         // X? is compiled as (X|)
1099                         nodeInsert(RE.OP_BRANCH, 0, ret);                 // branch before X
1100                         setNextOfEnd(ret, node (RE.OP_BRANCH, 0));        // inserted branch to option
1101                         int nothing = node (RE.OP_NOTHING, 0);            // which is OP_NOTHING
1102                         setNextOfEnd(ret, nothing);                       // point (second) branch to OP_NOTHING
1103                         setNextOfEnd(ret + RE.nodeSize, nothing);         // point the end of X to OP_NOTHING node
1104                     }
1105 
1106                     if (closureType == '*')
1107                     {
1108                         // X* is compiled as (X{gotoX}|)
1109                         nodeInsert(RE.OP_BRANCH, 0, ret);                         // branch before X
1110                         setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0));   // end of X points to an option
1111                         setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0));     // to goto
1112                         setNextOfEnd(ret + RE.nodeSize, ret);                     // the start again
1113                         setNextOfEnd(ret, node(RE.OP_BRANCH, 0));                 // the other option is
1114                         setNextOfEnd(ret, node(RE.OP_NOTHING, 0));                // OP_NOTHING
1115                     }
1116                     break;
1117 
1118                 case '+':
1119                 {
1120                     // X+ is compiled as X({gotoX}|)
1121                     int branch;
1122                     branch = node(RE.OP_BRANCH, 0);                   // a new branch
1123                     setNextOfEnd(ret, branch);                        // is added to the end of X
1124                     setNextOfEnd(node(RE.OP_GOTO, 0), ret);           // one option is to go back to the start
1125                     setNextOfEnd(branch, node(RE.OP_BRANCH, 0));      // the other option
1126                     setNextOfEnd(ret, node(RE.OP_NOTHING, 0));        // is OP_NOTHING
1127                 }
1128                 break;
1129             }
1130         }
1131         else
1132         {
1133             // Add end after closured subexpr
1134             setNextOfEnd(ret, node(RE.OP_END, 0));
1135 
1136             // Actually do the closure now
1137             switch (closureType)
1138             {
1139                 case '?':
1140                     nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1141                     break;
1142 
1143                 case '*':
1144                     nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1145                     break;
1146 
1147                 case '+':
1148                     nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1149                     break;
1150             }
1151 
1152             // Point to the expr after the closure
1153             setNextOfEnd(ret, lenInstruction);
1154         }
1155         return ret;
1156     }
1157 
1158     /**
1159      * Compile one branch of an or operator (implements concatenation)
1160      * @param flags Flags passed by reference
1161      * @return Pointer to branch node
1162      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1163      */
1164     int branch(int[] flags) throws RESyntaxException
1165     {
1166         // Get each possibly closured piece and concat
1167         int node;
1168         int ret = node(RE.OP_BRANCH, 0);
1169         int chain = -1;
1170         int[] closureFlags = new int[1];
1171         boolean nullable = true;
1172         while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
1173         {
1174             // Get new node
1175             closureFlags[0] = NODE_NORMAL;
1176             node = closure(closureFlags);
1177             if (closureFlags[0] == NODE_NORMAL)
1178             {
1179                 nullable = false;
1180             }
1181 
1182             // If there's a chain, append to the end
1183             if (chain != -1)
1184             {
1185                 setNextOfEnd(chain, node);
1186             }
1187 
1188             // Chain starts at current
1189             chain = node;
1190         }
1191 
1192         // If we don't run loop, make a nothing node
1193         if (chain == -1)
1194         {
1195             node(RE.OP_NOTHING, 0);
1196         }
1197 
1198         // Set nullable flag for this branch
1199         if (nullable)
1200         {
1201             flags[0] |= NODE_NULLABLE;
1202         }
1203         return ret;
1204     }
1205 
1206     /**
1207      * Compile an expression with possible parens around it.  Paren matching
1208      * is done at this level so we can tie the branch tails together.
1209      * @param flags Flag value passed by reference
1210      * @return Node index of expression in instruction array
1211      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1212      */
1213     int expr(int[] flags) throws RESyntaxException
1214     {
1215         // Create open paren node unless we were called from the top level (which has no parens)
1216         int paren = -1;
1217         int ret = -1;
1218         int closeParens = parens;
1219         if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
1220         {
1221             // if its a cluster ( rather than a proper subexpression ie with backrefs )
1222             if ( idx + 2 < len && pattern.charAt( idx + 1 ) == '?' && pattern.charAt( idx + 2 ) == ':' )
1223             {
1224                 paren = 2;
1225                 idx += 3;
1226                 ret = node( RE.OP_OPEN_CLUSTER, 0 );
1227             }
1228             else
1229             {
1230                 paren = 1;
1231                 idx++;
1232                 ret = node(RE.OP_OPEN, parens++);
1233             }
1234         }
1235         flags[0] &= ~NODE_TOPLEVEL;
1236 
1237         // Create a branch node
1238         int branch = branch(flags);
1239         if (ret == -1)
1240         {
1241             ret = branch;
1242         }
1243         else
1244         {
1245             setNextOfEnd(ret, branch);
1246         }
1247 
1248         // Loop through branches
1249         while (idx < len && pattern.charAt(idx) == '|')
1250         {
1251             idx++;
1252             branch = branch(flags);
1253             setNextOfEnd(ret, branch);
1254         }
1255 
1256         // Create an ending node (either a close paren or an OP_END)
1257         int end;
1258         if ( paren > 0 )
1259         {
1260             if (idx < len && pattern.charAt(idx) == ')')
1261             {
1262                 idx++;
1263             }
1264             else
1265             {
1266                 syntaxError("Missing close paren");
1267             }
1268             if ( paren == 1 )
1269             {
1270                 end = node(RE.OP_CLOSE, closeParens);
1271             }
1272             else
1273             {
1274                 end = node( RE.OP_CLOSE_CLUSTER, 0 );
1275             }
1276         }
1277         else
1278         {
1279             end = node(RE.OP_END, 0);
1280         }
1281 
1282         // Append the ending node to the ret nodelist
1283         setNextOfEnd(ret, end);
1284 
1285         // Hook the ends of each branch to the end node
1286         int currentNode = ret;
1287         int nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
1288         // while the next node o
1289         while ( nextNodeOffset != 0 && currentNode < lenInstruction )
1290         {
1291             // If branch, make the end of the branch's operand chain point to the end node.
1292             if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH )
1293             {
1294                 setNextOfEnd( currentNode + RE.nodeSize, end );
1295             }
1296             nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
1297             currentNode += nextNodeOffset;
1298         }
1299 
1300         // Return the node list
1301         return ret;
1302     }
1303 
1304     /**
1305      * Compiles a regular expression pattern into a program runnable by the pattern
1306      * matcher class 'RE'.
1307      * @param pattern Regular expression pattern to compile (see RECompiler class
1308      * for details).
1309      * @return A compiled regular expression program.
1310      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1311      * @see RECompiler
1312      * @see RE
1313      */
1314     public REProgram compile(String pattern) throws RESyntaxException
1315     {
1316         // Initialize variables for compilation
1317         this.pattern = pattern;                         // Save pattern in instance variable
1318         len = pattern.length();                         // Precompute pattern length for speed
1319         idx = 0;                                        // Set parsing index to the first character
1320         lenInstruction = 0;                             // Set emitted instruction count to zero
1321         parens = 1;                                     // Set paren level to 1 (the implicit outer parens)
1322         brackets = 0;                                   // No bracketed closures yet
1323 
1324         // Initialize pass by reference flags value
1325         int[] flags = { NODE_TOPLEVEL };
1326 
1327         // Parse expression
1328         expr(flags);
1329 
1330         // Should be at end of input
1331         if (idx != len)
1332         {
1333             if (pattern.charAt(idx) == ')')
1334             {
1335                 syntaxError("Unmatched close paren");
1336             }
1337             syntaxError("Unexpected input remains");
1338         }
1339 
1340         // Return the result
1341         char[] ins = new char[lenInstruction];
1342         System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1343         return new REProgram(parens, ins);
1344     }
1345 
1346     /**
1347      * Local, nested class for maintaining character ranges for character classes.
1348      */
1349     class RERange
1350     {
1351         int size = 16;                      // Capacity of current range arrays
1352         int[] minRange = new int[size];     // Range minima
1353         int[] maxRange = new int[size];     // Range maxima
1354         int num = 0;                        // Number of range array elements in use
1355 
1356         /**
1357          * Deletes the range at a given index from the range lists
1358          * @param index Index of range to delete from minRange and maxRange arrays.
1359          */
1360         void delete(int index)
1361         {
1362             // Return if no elements left or index is out of range
1363             if (num == 0 || index >= num)
1364             {
1365                 return;
1366             }
1367 
1368             // Move elements down
1369             while (++index < num)
1370             {
1371                 if (index - 1 >= 0)
1372                 {
1373                     minRange[index-1] = minRange[index];
1374                     maxRange[index-1] = maxRange[index];
1375                 }
1376             }
1377 
1378             // One less element now
1379             num--;
1380         }
1381 
1382         /**
1383          * Merges a range into the range list, coalescing ranges if possible.
1384          * @param min Minimum end of range
1385          * @param max Maximum end of range
1386          */
1387         void merge(int min, int max)
1388         {
1389             // Loop through ranges
1390             for (int i = 0; i < num; i++)
1391             {
1392                 // Min-max is subsumed by minRange[i]-maxRange[i]
1393                 if (min >= minRange[i] && max <= maxRange[i])
1394                 {
1395                     return;
1396                 }
1397 
1398                 // Min-max subsumes minRange[i]-maxRange[i]
1399                 else if (min <= minRange[i] && max >= maxRange[i])
1400                 {
1401                     delete(i);
1402                     merge(min, max);
1403                     return;
1404                 }
1405 
1406                 // Min is in the range, but max is outside
1407                 else if (min >= minRange[i] && min <= maxRange[i])
1408                 {
1409                     delete(i);
1410                     min = minRange[i];
1411                     merge(min, max);
1412                     return;
1413                 }
1414 
1415                 // Max is in the range, but min is outside
1416                 else if (max >= minRange[i] && max <= maxRange[i])
1417                 {
1418                     delete(i);
1419                     max = maxRange[i];
1420                     merge(min, max);
1421                     return;
1422                 }
1423             }
1424 
1425             // Must not overlap any other ranges
1426             if (num >= size)
1427             {
1428                 size *= 2;
1429                 int[] newMin = new int[size];
1430                 int[] newMax = new int[size];
1431                 System.arraycopy(minRange, 0, newMin, 0, num);
1432                 System.arraycopy(maxRange, 0, newMax, 0, num);
1433                 minRange = newMin;
1434                 maxRange = newMax;
1435             }
1436             minRange[num] = min;
1437             maxRange[num] = max;
1438             num++;
1439         }
1440 
1441         /**
1442          * Removes a range by deleting or shrinking all other ranges
1443          * @param min Minimum end of range
1444          * @param max Maximum end of range
1445          */
1446         void remove(int min, int max)
1447         {
1448             // Loop through ranges
1449             for (int i = 0; i < num; i++)
1450             {
1451                 // minRange[i]-maxRange[i] is subsumed by min-max
1452                 if (minRange[i] >= min && maxRange[i] <= max)
1453                 {
1454                     delete(i);
1455                     i--;
1456                     return;
1457                 }
1458 
1459                 // min-max is subsumed by minRange[i]-maxRange[i]
1460                 else if (min >= minRange[i] && max <= maxRange[i])
1461                 {
1462                     int minr = minRange[i];
1463                     int maxr = maxRange[i];
1464                     delete(i);
1465                     if (minr < min)
1466                     {
1467                         merge(minr, min - 1);
1468                     }
1469                     if (max < maxr)
1470                     {
1471                         merge(max + 1, maxr);
1472                     }
1473                     return;
1474                 }
1475 
1476                 // minRange is in the range, but maxRange is outside
1477                 else if (minRange[i] >= min && minRange[i] <= max)
1478                 {
1479                     minRange[i] = max + 1;
1480                     return;
1481                 }
1482 
1483                 // maxRange is in the range, but minRange is outside
1484                 else if (maxRange[i] >= min && maxRange[i] <= max)
1485                 {
1486                     maxRange[i] = min - 1;
1487                     return;
1488                 }
1489             }
1490         }
1491 
1492         /**
1493          * Includes (or excludes) the range from min to max, inclusive.
1494          * @param min Minimum end of range
1495          * @param max Maximum end of range
1496          * @param include True if range should be included.  False otherwise.
1497          */
1498         void include(int min, int max, boolean include)
1499         {
1500             if (include)
1501             {
1502                 merge(min, max);
1503             }
1504             else
1505             {
1506                 remove(min, max);
1507             }
1508         }
1509 
1510         /**
1511          * Includes a range with the same min and max
1512          * @param minmax Minimum and maximum end of range (inclusive)
1513          * @param include True if range should be included.  False otherwise.
1514          */
1515         void include(char minmax, boolean include)
1516         {
1517             include(minmax, minmax, include);
1518         }
1519     }
1520 }