View Javadoc
1   /*
2    * reserved comment block
3    * DO NOT REMOVE OR ALTER!
4    */
5   /*
6    * Copyright 1999-2004 The Apache Software Foundation.
7    *
8    * Licensed under the Apache License, Version 2.0 (the "License");
9    * you may not use this file except in compliance with the License.
10   * You may obtain a copy of the License at
11   *
12   *      http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22  
23  import com.sun.org.apache.xerces.internal.utils.SecuritySupport;
24  import java.util.Locale;
25  import java.util.MissingResourceException;
26  import java.util.ResourceBundle;
27  import java.util.Vector;
28  
29  /**
30   * A Regular Expression Parser.
31   *
32   * @xerces.internal
33   *
34   * @version $Id: RegexParser.java,v 1.8 2010-11-01 04:39:54 joehw Exp $
35   */
36  class RegexParser {
37      static final int T_CHAR = 0;
38      static final int T_EOF = 1;
39      static final int T_OR = 2;                  // '|'
40      static final int T_STAR = 3;                // '*'
41      static final int T_PLUS = 4;                // '+'
42      static final int T_QUESTION = 5;            // '?'
43      static final int T_LPAREN = 6;              // '('
44      static final int T_RPAREN = 7;              // ')'
45      static final int T_DOT = 8;                 // '.'
46      static final int T_LBRACKET = 9;            // '['
47      static final int T_BACKSOLIDUS = 10;        // '\'
48      static final int T_CARET = 11;              // '^'
49      static final int T_DOLLAR = 12;             // '$'
50      static final int T_LPAREN2 = 13;            // '(?:'
51      static final int T_LOOKAHEAD = 14;          // '(?='
52      static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
53      static final int T_LOOKBEHIND = 16;         // '(?<='
54      static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
55      static final int T_INDEPENDENT = 18;        // '(?>'
56      static final int T_SET_OPERATIONS = 19;     // '(?['
57      static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
58      static final int T_COMMENT = 21;            // '(?#'
59      static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
60      static final int T_CONDITION = 23;          // '(?('
61      static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
62  
63      static class ReferencePosition {
64          int refNumber;
65          int position;
66          ReferencePosition(int n, int pos) {
67              this.refNumber = n;
68              this.position = pos;
69          }
70      }
71  
72      int offset;
73      String regex;
74      int regexlen;
75      int options;
76      ResourceBundle resources;
77      int chardata;
78      int nexttoken;
79      static protected final int S_NORMAL = 0;
80      static protected final int S_INBRACKETS = 1;
81      static protected final int S_INXBRACKETS = 2;
82      int context = S_NORMAL;
83      int parenOpened = 1;
84      int parennumber = 1;
85      boolean hasBackReferences;
86      Vector references = null;
87      int parenCount = 0;
88  
89      public RegexParser() {
90          this.setLocale(Locale.getDefault());
91      }
92      public RegexParser(Locale locale) {
93          this.setLocale(locale);
94      }
95  
96      public void setLocale(Locale locale) {
97          try {
98              if (locale != null) {
99                  this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
100             }
101             else {
102                 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message");
103             }
104         }
105         catch (MissingResourceException mre) {
106             throw new RuntimeException("Installation Problem???  Couldn't load messages: "
107                                        + mre.getMessage());
108         }
109     }
110 
111     final ParseException ex(String key, int loc) {
112         return new ParseException(this.resources.getString(key), loc);
113     }
114 
115     protected final boolean isSet(int flag) {
116         return (this.options & flag) == flag;
117     }
118 
119     synchronized Token parse(String regex, int options) throws ParseException {
120         this.options = options;
121         this.offset = 0;
122         this.setContext(S_NORMAL);
123         this.parennumber = 1;
124         this.parenOpened = 1;
125         this.hasBackReferences = false;
126         this.regex = regex;
127         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
128             this.regex = REUtil.stripExtendedComment(this.regex);
129         this.regexlen = this.regex.length();
130 
131 
132         this.next();
133         Token ret = this.parseRegex();
134         if (this.offset != this.regexlen)
135             throw ex("parser.parse.1", this.offset);
136         if (parenCount < 0)
137             throw ex("parser.factor.0", this.offset);
138         if (this.references != null) {
139             for (int i = 0;  i < this.references.size();  i ++) {
140                 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
141                 if (this.parennumber <= position.refNumber)
142                     throw ex("parser.parse.2", position.position);
143             }
144             this.references.removeAllElements();
145         }
146         return ret;
147     }
148 
149     /*
150     public RegularExpression createRegex(String regex, int options) throws ParseException {
151         Token tok = this.parse(regex, options);
152         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
153     }
154     */
155 
156     protected final void setContext(int con) {
157         this.context = con;
158     }
159 
160     final int read() {
161         return this.nexttoken;
162     }
163 
164     final void next() {
165         if (this.offset >= this.regexlen) {
166             this.chardata = -1;
167             this.nexttoken = T_EOF;
168             return;
169         }
170 
171         int ret;
172         int ch = this.regex.charAt(this.offset++);
173         this.chardata = ch;
174 
175         if (this.context == S_INBRACKETS) {
176             // In a character class, this.chardata has one character, that is to say,
177             // a pair of surrogates is composed and stored to this.chardata.
178             switch (ch) {
179               case '\\':
180                 ret = T_BACKSOLIDUS;
181                 if (this.offset >= this.regexlen)
182                     throw ex("parser.next.1", this.offset-1);
183                 this.chardata = this.regex.charAt(this.offset++);
184                 break;
185 
186               case '-':
187                 // Allow character class subtraction (regardless of whether we are in
188                 // XML Schema mode or not)
189                 if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
190                     this.offset++;
191                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
192                 } else
193                     ret = T_CHAR;
194                 break;
195 
196               case '[':
197                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
198                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
199                     this.offset++;
200                     ret = T_POSIX_CHARCLASS_START;
201                     break;
202                 } // Through down
203               default:
204                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
205                     int low = this.regex.charAt(this.offset);
206                     if (REUtil.isLowSurrogate(low)) {
207                         this.chardata = REUtil.composeFromSurrogates(ch, low);
208                         this.offset ++;
209                     }
210                 }
211                 ret = T_CHAR;
212             }
213             this.nexttoken = ret;
214             return;
215         }
216 
217         switch (ch) {
218           case '|': ret = T_OR;             break;
219           case '*': ret = T_STAR;           break;
220           case '+': ret = T_PLUS;           break;
221           case '?': ret = T_QUESTION;       break;
222           case ')': ret = T_RPAREN;         break;
223           case '.': ret = T_DOT;            break;
224           case '[': ret = T_LBRACKET;       break;
225           case '^':
226               if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
227                   ret = T_CHAR;
228               }
229               else {
230                   ret = T_CARET;
231               }
232               break;
233           case '$':
234               if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
235                   ret = T_CHAR;
236               }
237               else {
238                   ret = T_DOLLAR;
239               }
240               break;
241           case '(':
242             ret = T_LPAREN;
243             parenCount++;
244             if (this.offset >= this.regexlen)
245                 break;
246             if (this.regex.charAt(this.offset) != '?')
247                 break;
248             if (++this.offset >= this.regexlen)
249                 throw ex("parser.next.2", this.offset-1);
250             ch = this.regex.charAt(this.offset++);
251             switch (ch) {
252               case ':':  ret = T_LPAREN2;            break;
253               case '=':  ret = T_LOOKAHEAD;          break;
254               case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
255               case '[':  ret = T_SET_OPERATIONS;     break;
256               case '>':  ret = T_INDEPENDENT;        break;
257               case '<':
258                 if (this.offset >= this.regexlen)
259                     throw ex("parser.next.2", this.offset-3);
260                 ch = this.regex.charAt(this.offset++);
261                 if (ch == '=') {
262                     ret = T_LOOKBEHIND;
263                 } else if (ch == '!') {
264                     ret = T_NEGATIVELOOKBEHIND;
265                 } else
266                     throw ex("parser.next.3", this.offset-3);
267                 break;
268               case '#':
269                 while (this.offset < this.regexlen) {
270                     ch = this.regex.charAt(this.offset++);
271                     if (ch == ')')  break;
272                 }
273                 if (ch != ')')
274                     throw ex("parser.next.4", this.offset-1);
275                 ret = T_COMMENT;
276                 break;
277               default:
278                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
279                     this.offset --;
280                     ret = T_MODIFIERS;
281                     break;
282                 } else if (ch == '(') {         // conditional
283                     ret = T_CONDITION;          // this.offsets points the next of '('.
284                     break;
285                 }
286                 throw ex("parser.next.2", this.offset-2);
287             }
288             break;
289 
290           case '\\':
291             ret = T_BACKSOLIDUS;
292             if (this.offset >= this.regexlen)
293                 throw ex("parser.next.1", this.offset-1);
294             this.chardata = this.regex.charAt(this.offset++);
295             break;
296 
297           default:
298             ret = T_CHAR;
299         }
300         this.nexttoken = ret;
301     }
302 
303     /**
304      * regex ::= term (`|` term)*
305      * term ::= factor+
306      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
307      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
308      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
309      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
310      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
311      */
312     Token parseRegex() throws ParseException {
313         Token tok = this.parseTerm();
314         Token parent = null;
315         while (this.read() == T_OR) {
316             this.next();                    // '|'
317             if (parent == null) {
318                 parent = Token.createUnion();
319                 parent.addChild(tok);
320                 tok = parent;
321             }
322             tok.addChild(this.parseTerm());
323         }
324         return tok;
325     }
326 
327     /**
328      * term ::= factor+
329      */
330     Token parseTerm() throws ParseException {
331         int ch = this.read();
332         Token tok = null;
333         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
334             tok = Token.createEmpty();
335         } else {
336             tok = this.parseFactor();
337             Token concat = null;
338             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
339                 if (concat == null) {
340                     concat = Token.createConcat();
341                     concat.addChild(tok);
342                     tok = concat;
343                 }
344                 concat.addChild(this.parseFactor());
345                 //tok = Token.createConcat(tok, this.parseFactor());
346             }
347         }
348         if (ch == T_RPAREN) {
349             parenCount--;
350         }
351         return tok;
352     }
353 
354     // ----------------------------------------------------------------
355 
356     Token processCaret() throws ParseException {
357         this.next();
358         return Token.token_linebeginning;
359     }
360     Token processDollar() throws ParseException {
361         this.next();
362         return Token.token_lineend;
363     }
364     Token processLookahead() throws ParseException {
365         this.next();
366         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
367         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
368         this.next();                            // ')'
369         return tok;
370     }
371     Token processNegativelookahead() throws ParseException {
372         this.next();
373         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
374         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
375         this.next();                            // ')'
376         return tok;
377     }
378     Token processLookbehind() throws ParseException {
379         this.next();
380         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
381         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
382         this.next();                            // ')'
383         return tok;
384     }
385     Token processNegativelookbehind() throws ParseException {
386         this.next();
387         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
388         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
389         this.next();                    // ')'
390         return tok;
391     }
392     Token processBacksolidus_A() throws ParseException {
393         this.next();
394         return Token.token_stringbeginning;
395     }
396     Token processBacksolidus_Z() throws ParseException {
397         this.next();
398         return Token.token_stringend2;
399     }
400     Token processBacksolidus_z() throws ParseException {
401         this.next();
402         return Token.token_stringend;
403     }
404     Token processBacksolidus_b() throws ParseException {
405         this.next();
406         return Token.token_wordedge;
407     }
408     Token processBacksolidus_B() throws ParseException {
409         this.next();
410         return Token.token_not_wordedge;
411     }
412     Token processBacksolidus_lt() throws ParseException {
413         this.next();
414         return Token.token_wordbeginning;
415     }
416     Token processBacksolidus_gt() throws ParseException {
417         this.next();
418         return Token.token_wordend;
419     }
420     Token processStar(Token tok) throws ParseException {
421         this.next();
422         if (this.read() == T_QUESTION) {
423             this.next();
424             return Token.createNGClosure(tok);
425         } else
426             return Token.createClosure(tok);
427     }
428     Token processPlus(Token tok) throws ParseException {
429         // X+ -> XX*
430         this.next();
431         if (this.read() == T_QUESTION) {
432             this.next();
433             return Token.createConcat(tok, Token.createNGClosure(tok));
434         } else
435             return Token.createConcat(tok, Token.createClosure(tok));
436     }
437     Token processQuestion(Token tok) throws ParseException {
438         // X? -> X|
439         this.next();
440         Token par = Token.createUnion();
441         if (this.read() == T_QUESTION) {
442             this.next();
443             par.addChild(Token.createEmpty());
444             par.addChild(tok);
445         } else {
446             par.addChild(tok);
447             par.addChild(Token.createEmpty());
448         }
449         return par;
450     }
451     boolean checkQuestion(int off) {
452         return off < this.regexlen && this.regex.charAt(off) == '?';
453     }
454     Token processParen() throws ParseException {
455         this.next();
456         int p = this.parenOpened++;
457         Token tok = Token.createParen(this.parseRegex(), p);
458         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
459         this.parennumber++;
460         this.next();                            // Skips ')'
461         return tok;
462     }
463     Token processParen2() throws ParseException {
464         this.next();
465         Token tok = Token.createParen(this.parseRegex(), 0);
466         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
467         this.next();                            // Skips ')'
468         return tok;
469     }
470     Token processCondition() throws ParseException {
471                                                 // this.offset points the next of '('
472         if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
473                                                 // Parses a condition.
474         int refno = -1;
475         Token condition = null;
476         int ch = this.regex.charAt(this.offset);
477         if ('1' <= ch && ch <= '9') {
478             refno = ch-'0';
479             int finalRefno = refno;
480 
481             if (this.parennumber <= refno)
482                 throw ex("parser.parse.2", this.offset);
483 
484             while (this.offset + 1 < this.regexlen) {
485                 ch = this.regex.charAt(this.offset + 1);
486                 if ('1' <= ch && ch <= '9') {
487                     refno = (refno * 10) + (ch - '0');
488                     if (refno < this.parennumber) {
489                         finalRefno= refno;
490                         ++this.offset;
491                     }
492                     else {
493                         break;
494                     }
495                 }
496                 else {
497                     break;
498                 }
499             }
500 
501             this.hasBackReferences = true;
502             if (this.references == null)  this.references = new Vector();
503             this.references.addElement(new ReferencePosition(finalRefno, this.offset));
504             this.offset ++;
505             if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
506             this.offset ++;
507         } else {
508             if (ch == '?')  this.offset --; // Points '('.
509             this.next();
510             condition = this.parseFactor();
511             switch (condition.type) {
512               case Token.LOOKAHEAD:
513               case Token.NEGATIVELOOKAHEAD:
514               case Token.LOOKBEHIND:
515               case Token.NEGATIVELOOKBEHIND:
516                 break;
517               case Token.ANCHOR:
518                 if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
519                 break;
520               default:
521                 throw ex("parser.factor.5", this.offset);
522             }
523         }
524                                                 // Parses yes/no-patterns.
525         this.next();
526         Token yesPattern = this.parseRegex();
527         Token noPattern = null;
528         if (yesPattern.type == Token.UNION) {
529             if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
530             noPattern = yesPattern.getChild(1);
531             yesPattern = yesPattern.getChild(0);
532         }
533         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
534         this.next();
535         return Token.createCondition(refno, condition, yesPattern, noPattern);
536     }
537     Token processModifiers() throws ParseException {
538                                                 // this.offset points the next of '?'.
539                                                 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
540         int add = 0, mask = 0, ch = -1;
541         while (this.offset < this.regexlen) {
542             ch = this.regex.charAt(this.offset);
543             int v = REUtil.getOptionValue(ch);
544             if (v == 0)  break;                 // '-' or ':'?
545             add |= v;
546             this.offset ++;
547         }
548         if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
549         if (ch == '-') {
550             this.offset ++;
551             while (this.offset < this.regexlen) {
552                 ch = this.regex.charAt(this.offset);
553                 int v = REUtil.getOptionValue(ch);
554                 if (v == 0)  break;             // ':'?
555                 mask |= v;
556                 this.offset ++;
557             }
558             if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
559         }
560         Token tok;
561         if (ch == ':') {
562             this.offset ++;
563             this.next();
564             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
565             if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
566             this.next();
567         } else if (ch == ')') {                 // such as (?-i)
568             this.offset ++;
569             this.next();
570             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
571         } else
572             throw ex("parser.factor.3", this.offset);
573 
574         return tok;
575     }
576     Token processIndependent() throws ParseException {
577         this.next();
578         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
579         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
580         this.next();                            // Skips ')'
581         return tok;
582     }
583     Token processBacksolidus_c() throws ParseException {
584         int ch2;                                // Must be in 0x0040-0x005f
585         if (this.offset >= this.regexlen
586             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
587             throw ex("parser.atom.1", this.offset-1);
588         this.next();
589         return Token.createChar(ch2-0x40);
590     }
591     Token processBacksolidus_C() throws ParseException {
592         throw ex("parser.process.1", this.offset);
593     }
594     Token processBacksolidus_i() throws ParseException {
595         Token tok = Token.createChar('i');
596         this.next();
597         return tok;
598     }
599     Token processBacksolidus_I() throws ParseException {
600         throw ex("parser.process.1", this.offset);
601     }
602     Token processBacksolidus_g() throws ParseException {
603         this.next();
604         return Token.getGraphemePattern();
605     }
606     Token processBacksolidus_X() throws ParseException {
607         this.next();
608         return Token.getCombiningCharacterSequence();
609     }
610     Token processBackreference() throws ParseException {
611         int refnum = this.chardata-'0';
612         int finalRefnum = refnum;
613 
614         if (this.parennumber <= refnum)
615             throw ex("parser.parse.2", this.offset-2);
616 
617         while  (this.offset < this.regexlen) {
618             final int ch = this.regex.charAt(this.offset);
619             if ('1' <= ch && ch <= '9') {
620                 refnum = (refnum * 10) + (ch - '0');
621                 if (refnum < this.parennumber) {
622                     ++this.offset;
623                     finalRefnum = refnum;
624                     this.chardata = ch;
625                 }
626                 else {
627                     break;
628                 }
629             }
630             else {
631                 break;
632             }
633         }
634 
635         Token tok = Token.createBackReference(finalRefnum);
636         this.hasBackReferences = true;
637         if (this.references == null)  this.references = new Vector();
638         this.references.addElement(new ReferencePosition(finalRefnum, this.offset-2));
639         this.next();
640         return tok;
641     }
642 
643     // ----------------------------------------------------------------
644 
645     /**
646      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
647      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
648      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
649      *            | '(?#' [^)]* ')'
650      * minmax ::= '{' min (',' max?)? '}'
651      * min ::= [0-9]+
652      * max ::= [0-9]+
653      */
654     Token parseFactor() throws ParseException {
655         int ch = this.read();
656         Token tok;
657         switch (ch) {
658           case T_CARET:         return this.processCaret();
659           case T_DOLLAR:        return this.processDollar();
660           case T_LOOKAHEAD:     return this.processLookahead();
661           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
662           case T_LOOKBEHIND:    return this.processLookbehind();
663           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
664 
665           case T_COMMENT:
666             this.next();
667             return Token.createEmpty();
668 
669           case T_BACKSOLIDUS:
670             switch (this.chardata) {
671               case 'A': return this.processBacksolidus_A();
672               case 'Z': return this.processBacksolidus_Z();
673               case 'z': return this.processBacksolidus_z();
674               case 'b': return this.processBacksolidus_b();
675               case 'B': return this.processBacksolidus_B();
676               case '<': return this.processBacksolidus_lt();
677               case '>': return this.processBacksolidus_gt();
678             }
679                                                 // through down
680         }
681         tok = this.parseAtom();
682         ch = this.read();
683         switch (ch) {
684           case T_STAR:  return this.processStar(tok);
685           case T_PLUS:  return this.processPlus(tok);
686           case T_QUESTION: return this.processQuestion(tok);
687           case T_CHAR:
688             if (this.chardata == '{' && this.offset < this.regexlen) {
689 
690                 int off = this.offset;          // this.offset -> next of '{'
691                 int min = 0, max = -1;
692 
693                 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
694 
695                     min = ch -'0';
696                     while (off < this.regexlen
697                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
698                         min = min*10 +ch-'0';
699                         if (min < 0)
700                             throw ex("parser.quantifier.5", this.offset);
701                     }
702                 }
703                 else {
704                     throw ex("parser.quantifier.1", this.offset);
705                 }
706 
707                 max = min;
708                 if (ch == ',') {
709 
710                    if (off >= this.regexlen) {
711                        throw ex("parser.quantifier.3", this.offset);
712                    }
713                    else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
714 
715                         max = ch -'0';       // {min,max}
716                         while (off < this.regexlen
717                                && (ch = this.regex.charAt(off++)) >= '0'
718                                && ch <= '9') {
719                             max = max*10 +ch-'0';
720                             if (max < 0)
721                                 throw ex("parser.quantifier.5", this.offset);
722                         }
723 
724                         if (min > max)
725                             throw ex("parser.quantifier.4", this.offset);
726                    }
727                    else { // assume {min,}
728                         max = -1;
729                     }
730                 }
731 
732                if (ch != '}')
733                    throw ex("parser.quantifier.2", this.offset);
734 
735                if (this.checkQuestion(off)) {  // off -> next of '}'
736                     tok = Token.createNGClosure(tok);
737                     this.offset = off+1;
738                 } else {
739                     tok = Token.createClosure(tok);
740                     this.offset = off;
741                 }
742 
743                 tok.setMin(min);
744                 tok.setMax(max);
745                 //System.err.println("CLOSURE: "+min+", "+max);
746                 this.next();
747             }
748         }
749         return tok;
750     }
751 
752     /**
753      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
754      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
755      *          | '(?>' regex ')'
756      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
757      */
758     Token parseAtom() throws ParseException {
759         int ch = this.read();
760         Token tok = null;
761         switch (ch) {
762           case T_LPAREN:        return this.processParen();
763           case T_LPAREN2:       return this.processParen2(); // '(?:'
764           case T_CONDITION:     return this.processCondition(); // '(?('
765           case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
766           case T_INDEPENDENT:   return this.processIndependent();
767           case T_DOT:
768             this.next();                    // Skips '.'
769             tok = Token.token_dot;
770             break;
771 
772             /**
773              * char-class ::= '[' ( '^'? range ','?)+ ']'
774              * range ::= '\d' | '\w' | '\s' | category-block | range-char
775              *           | range-char '-' range-char
776              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
777              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
778              */
779           case T_LBRACKET:      return this.parseCharacterClass(true);
780           case T_SET_OPERATIONS: return this.parseSetOperations();
781 
782           case T_BACKSOLIDUS:
783             switch (this.chardata) {
784               case 'd':  case 'D':
785               case 'w':  case 'W':
786               case 's':  case 'S':
787                 tok = this.getTokenForShorthand(this.chardata);
788                 this.next();
789                 return tok;
790 
791               case 'e':  case 'f':  case 'n':  case 'r':
792               case 't':  case 'u':  case 'v':  case 'x':
793                 {
794                     int ch2 = this.decodeEscaped();
795                     if (ch2 < 0x10000) {
796                         tok = Token.createChar(ch2);
797                     } else {
798                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
799                     }
800                 }
801                 break;
802 
803               case 'c': return this.processBacksolidus_c();
804               case 'C': return this.processBacksolidus_C();
805               case 'i': return this.processBacksolidus_i();
806               case 'I': return this.processBacksolidus_I();
807               case 'g': return this.processBacksolidus_g();
808               case 'X': return this.processBacksolidus_X();
809               case '1':  case '2':  case '3':  case '4':
810               case '5':  case '6':  case '7':  case '8':  case '9':
811                 return this.processBackreference();
812 
813               case 'P':
814               case 'p':
815                 int pstart = this.offset;
816                 tok = processBacksolidus_pP(this.chardata);
817                 if (tok == null)  throw this.ex("parser.atom.5", pstart);
818                 break;
819 
820               default:
821                 tok = Token.createChar(this.chardata);
822             }
823             this.next();
824             break;
825 
826           case T_CHAR:
827             if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
828                 throw this.ex("parser.atom.4", this.offset-1);
829             tok = Token.createChar(this.chardata);
830             int high = this.chardata;
831             this.next();
832             if (REUtil.isHighSurrogate(high)
833                 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
834                 char[] sur = new char[2];
835                 sur[0] = (char)high;
836                 sur[1] = (char)this.chardata;
837                 tok = Token.createParen(Token.createString(new String(sur)), 0);
838                 this.next();
839             }
840             break;
841 
842           default:
843             throw this.ex("parser.atom.4", this.offset-1);
844         }
845         return tok;
846     }
847 
848     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
849 
850         this.next();
851         if (this.read() != T_CHAR || this.chardata != '{')
852             throw this.ex("parser.atom.2", this.offset-1);
853 
854         // handle category escape
855         boolean positive = c == 'p';
856         int namestart = this.offset;
857         int nameend = this.regex.indexOf('}', namestart);
858 
859         if (nameend < 0)
860             throw this.ex("parser.atom.3", this.offset);
861 
862         String pname = this.regex.substring(namestart, nameend);
863         this.offset = nameend+1;
864 
865         return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
866     }
867 
868     int processCIinCharacterClass(RangeToken tok, int c) {
869         return this.decodeEscaped();
870     }
871 
    /**
     * Parses a bracketed character class starting at the current '[' token
     * and returns the resulting range token, sorted and compacted. On return
     * the context is S_NORMAL and the closing ']' has been consumed.
     *
     * char-class ::= '[' ( '^'? range ','?)+ ']'
     * range ::= '\d' | '\w' | '\s' | category-block | range-char
     *           | range-char '-' range-char
     * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
     * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
     *
     * @param useNrange if true, a leading '^' yields a token created with
     *        <code>Token.createNRange()</code>; if false, the negation is
     *        computed by subtracting the parsed ranges from the full range
     *        0..<code>Token.UTF16_MAX</code>
     * @return the range token for the class
     * @throws ParseException on an unterminated class, an unknown category or
     *         POSIX class name, or a range whose start exceeds its end
     */
    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
        this.setContext(S_INBRACKETS);
        this.next();                            // '['
        boolean nrange = false;
        // Only used for a negated class when useNrange is false: holds the full range.
        RangeToken base = null;
        RangeToken tok;
        if (this.read() == T_CHAR && this.chardata == '^') {
            nrange = true;
            this.next();                        // '^'
            if (useNrange) {
                tok = Token.createNRange();
            } else {
                base = Token.createRange();
                base.addRange(0, Token.UTF16_MAX);
                tok = Token.createRange();
            }
        } else {
            tok = Token.createRange();
        }
        int type;
        // While true, a ']' is taken as a literal member instead of the terminator.
        boolean firstloop = true;
        while ((type = this.read()) != T_EOF) {
            if (type == T_CHAR && this.chardata == ']' && !firstloop)
                break;
            int c = this.chardata;
            // Set when the item has already been merged into tok and therefore
            // cannot be the start of a 'c-d' range.
            boolean end = false;
            if (type == T_BACKSOLIDUS) {
                switch (c) {
                  case 'd':  case 'D':
                  case 'w':  case 'W':
                  case 's':  case 'S':
                    tok.mergeRanges(this.getTokenForShorthand(c));
                    end = true;
                    break;

                  case 'i':  case 'I':
                  case 'c':  case 'C':
                    c = this.processCIinCharacterClass(tok, c);
                    // NOTE(review): a negative result presumably means an override
                    // already merged the class into tok - confirm against subclasses.
                    if (c < 0)  end = true;
                    break;

                  case 'p':
                  case 'P':
                    int pstart = this.offset;
                    RangeToken tok2 = this.processBacksolidus_pP(c);
                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
                    tok.mergeRanges(tok2);
                    end = true;
                    break;

                  default:
                    c = this.decodeEscaped();
                } // \ + c
            } // backsolidus
                                                // POSIX Character class such as [:alnum:]
            else if (type == T_POSIX_CHARCLASS_START) {
                int nameend = this.regex.indexOf(':', this.offset);
                if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
                boolean positive = true;
                // A '^' right at the current offset negates the named class.
                if (this.regex.charAt(this.offset) == '^') {
                    this.offset ++;
                    positive = false;
                }
                String name = this.regex.substring(this.offset, nameend);
                RangeToken range = Token.getRange(name, positive,
                                                  this.isSet(RegularExpression.XMLSCHEMA_MODE));
                if (range == null)  throw this.ex("parser.cc.3", this.offset);
                tok.mergeRanges(range);
                end = true;
                // The name's closing ':' must be followed directly by ']'.
                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
                    throw this.ex("parser.cc.1", nameend);
                this.offset = nameend+2;
            }
            else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
                // Resolve a pending negation first so the subtraction below is
                // applied to the already-complemented set.
                if (nrange) {
                    nrange = false;
                    if (useNrange) {
                        tok = (RangeToken) Token.complementRanges(tok);
                    }
                    else {
                        base.subtractRanges(tok);
                        tok = base;
                    }
                }
                // The subtrahend is itself a character class, parsed recursively.
                RangeToken range2 = this.parseCharacterClass(false);
                tok.subtractRanges(range2);
                // After the nested class, the outer class must close immediately.
                if (this.read() != T_CHAR || this.chardata != ']') {
                    throw this.ex("parser.cc.5", this.offset);
                }
                break;                          // Exit this loop
            }
            this.next();
            if (!end) {                         // if not shorthands...
                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
                    // Single character; the case-insensitive expansion is only
                    // applied to code points up to 0xffff.
                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
                        tok.addRange(c, c);
                    }
                    else {
                        addCaseInsensitiveChar(tok, c);
                    }
                }
                else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
                    // Reached only when the subtraction token was the first item
                    // of the class and is followed by '-'.
                    throw this.ex("parser.cc.8", this.offset-1);
                }
                else {
                    this.next(); // Skips '-'
                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                    if (type == T_CHAR && this.chardata == ']') {
                        // "c-]" : the '-' is a literal member, not a range operator.
                        // The ']' is left as the current token to end the loop.
                        if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
                            tok.addRange(c, c);
                        }
                        else {
                            addCaseInsensitiveChar(tok, c);
                        }
                        tok.addRange('-', '-');
                    } else {
                        // "c-rangeend" : a real range.
                        int rangeend = this.chardata;
                        if (type == T_BACKSOLIDUS) {
                            rangeend = this.decodeEscaped();
                        }
                        this.next();
                        if (c > rangeend) {
                            throw this.ex("parser.ope.3", this.offset-1);
                        }
                        if (!this.isSet(RegularExpression.IGNORE_CASE) ||
                                (c > 0xffff && rangeend > 0xffff)) {
                            tok.addRange(c, rangeend);
                        }
                        else {
                            addCaseInsensitiveCharRange(tok, c, rangeend);
                        }
                    }
                }
            }
            // With SPECIAL_COMMA set, a ',' after a member is skipped as a separator.
            if (this.isSet(RegularExpression.SPECIAL_COMMA)
                && this.read() == T_CHAR && this.chardata == ',') {
                this.next();
            }
            firstloop = false;
        }
        // The loop ended on end-of-input rather than on ']': unterminated class.
        if (this.read() == T_EOF) {
            throw this.ex("parser.cc.2", this.offset);
        }

        // Negated class without an NRange token: result is (full range - parsed ranges).
        if (!useNrange && nrange) {
            base.subtractRanges(tok);
            tok = base;
        }
        tok.sortRanges();
        tok.compactRanges();
        this.setContext(S_NORMAL);
        this.next();                    // Skips ']'

        return tok;
    }
1034 
1035     /**
1036      * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1037      */
1038     protected RangeToken parseSetOperations() throws ParseException {
1039         RangeToken tok = this.parseCharacterClass(false);
1040         int type;
1041         while ((type = this.read()) != T_RPAREN) {
1042             int ch = this.chardata;
1043             if (type == T_CHAR && (ch == '-' || ch == '&')
1044                 || type == T_PLUS) {
1045                 this.next();
1046                 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
1047                 RangeToken t2 = this.parseCharacterClass(false);
1048                 if (type == T_PLUS)
1049                     tok.mergeRanges(t2);
1050                 else if (ch == '-')
1051                     tok.subtractRanges(t2);
1052                 else if (ch == '&')
1053                     tok.intersectRanges(t2);
1054                 else
1055                     throw new RuntimeException("ASSERT");
1056             } else {
1057                 throw ex("parser.ope.2", this.offset-1);
1058             }
1059         }
1060         this.next();
1061         return tok;
1062     }
1063 
1064     Token getTokenForShorthand(int ch) {
1065         Token tok;
1066         switch (ch) {
1067           case 'd':
1068             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1069                 ? Token.getRange("Nd", true) : Token.token_0to9;
1070             break;
1071           case 'D':
1072             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1073                 ? Token.getRange("Nd", false) : Token.token_not_0to9;
1074             break;
1075           case 'w':
1076             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1077                 ? Token.getRange("IsWord", true) : Token.token_wordchars;
1078             break;
1079           case 'W':
1080             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1081                 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
1082             break;
1083           case 's':
1084             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1085                 ? Token.getRange("IsSpace", true) : Token.token_spaces;
1086             break;
1087           case 'S':
1088             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1089                 ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1090             break;
1091 
1092           default:
1093             throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1094         }
1095         return tok;
1096     }
1097 
1098     /**
1099      */
1100     int decodeEscaped() throws ParseException {
1101         if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
1102         int c = this.chardata;
1103         switch (c) {
1104           case 'e':  c = 0x1b;  break; // ESCAPE U+001B
1105           case 'f':  c = '\f';  break; // FORM FEED U+000C
1106           case 'n':  c = '\n';  break; // LINE FEED U+000A
1107           case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
1108           case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
1109           //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
1110           case 'x':
1111             this.next();
1112             if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1113             if (this.chardata == '{') {
1114                 int v1 = 0;
1115                 int uv = 0;
1116                 do {
1117                     this.next();
1118                     if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1119                     if ((v1 = hexChar(this.chardata)) < 0)
1120                         break;
1121                     if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1122                     uv = uv*16+v1;
1123                 } while (true);
1124                 if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
1125                 if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
1126                 c = uv;
1127             } else {
1128                 int v1 = 0;
1129                 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1130                     throw ex("parser.descape.1", this.offset-1);
1131                 int uv = v1;
1132                 this.next();
1133                 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1134                     throw ex("parser.descape.1", this.offset-1);
1135                 uv = uv*16+v1;
1136                 c = uv;
1137             }
1138             break;
1139 
1140           case 'u':
1141             int v1 = 0;
1142             this.next();
1143             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1144                 throw ex("parser.descape.1", this.offset-1);
1145             int uv = v1;
1146             this.next();
1147             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1148                 throw ex("parser.descape.1", this.offset-1);
1149             uv = uv*16+v1;
1150             this.next();
1151             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1152                 throw ex("parser.descape.1", this.offset-1);
1153             uv = uv*16+v1;
1154             this.next();
1155             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1156                 throw ex("parser.descape.1", this.offset-1);
1157             uv = uv*16+v1;
1158             c = uv;
1159             break;
1160 
1161           case 'v':
1162             this.next();
1163             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1164                 throw ex("parser.descape.1", this.offset-1);
1165             uv = v1;
1166             this.next();
1167             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1168                 throw ex("parser.descape.1", this.offset-1);
1169             uv = uv*16+v1;
1170             this.next();
1171             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1172                 throw ex("parser.descape.1", this.offset-1);
1173             uv = uv*16+v1;
1174             this.next();
1175             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1176                 throw ex("parser.descape.1", this.offset-1);
1177             uv = uv*16+v1;
1178             this.next();
1179             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1180                 throw ex("parser.descape.1", this.offset-1);
1181             uv = uv*16+v1;
1182             this.next();
1183             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1184                 throw ex("parser.descape.1", this.offset-1);
1185             uv = uv*16+v1;
1186             if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
1187             c = uv;
1188             break;
1189           case 'A':
1190           case 'Z':
1191           case 'z':
1192             throw ex("parser.descape.5", this.offset-2);
1193           default:
1194         }
1195         return c;
1196     }
1197 
1198     static private final int hexChar(int ch) {
1199         if (ch < '0')  return -1;
1200         if (ch > 'f')  return -1;
1201         if (ch <= '9')  return ch-'0';
1202         if (ch < 'A')  return -1;
1203         if (ch <= 'F')  return ch-'A'+10;
1204         if (ch < 'a')  return -1;
1205         return ch-'a'+10;
1206     }
1207 
1208     static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
1209         final int[] caseMap = CaseInsensitiveMap.get(c);
1210         tok.addRange(c, c);
1211 
1212         if (caseMap != null) {
1213             for (int i=0; i<caseMap.length; i+=2) {
1214                 tok.addRange(caseMap[i], caseMap[i]);
1215             }
1216         }
1217 
1218     }
1219 
1220     static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
1221         int[] caseMap;
1222         int r1, r2;
1223         if (start <= end) {
1224             r1 = start;
1225             r2 = end;
1226         } else {
1227             r1 = end;
1228             r2 = start;
1229         }
1230 
1231         tok.addRange(r1, r2);
1232         for (int ch = r1;  ch <= r2;  ch++) {
1233             caseMap = CaseInsensitiveMap.get(ch);
1234             if (caseMap != null) {
1235                 for (int i=0; i<caseMap.length; i+=2) {
1236                     tok.addRange(caseMap[i], caseMap[i]);
1237                 }
1238             }
1239         }
1240     }
1241 }