View Javadoc
1   /*
2    * reserved comment block
3    * DO NOT REMOVE OR ALTER!
4    */
5   /*
6    * Copyright 1999-2002,2004,2005 The Apache Software Foundation.
7    *
8    * Licensed under the Apache License, Version 2.0 (the "License");
9    * you may not use this file except in compliance with the License.
10   * You may obtain a copy of the License at
11   *
12   *      http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22  
23  import java.util.Vector;
24  import java.util.Hashtable;
25  
26  /**
27   * This class represents a node in parse tree.
28   *
29   * @xerces.internal
30   *
31   * @version $Id: Token.java,v 1.7 2010/07/27 05:02:34 joehw Exp $
32   */
33  class Token implements java.io.Serializable {
34  
35      private static final long serialVersionUID = 8484976002585487481L;
36  
37      static final boolean COUNTTOKENS = true;
38      static int tokens = 0;
39  
40      static final int CHAR = 0;                  // Literal char
41      static final int DOT = 11;                  // .
42      static final int CONCAT = 1;                // XY
43      static final int UNION = 2;                 // X|Y|Z
44      static final int CLOSURE = 3;               // X*
45      static final int RANGE = 4;                 // [a-zA-Z] etc.
46      static final int NRANGE = 5;                // [^a-zA-Z] etc.
47      static final int PAREN = 6;                 // (X) or (?:X)
48      static final int EMPTY = 7;                 //
49      static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
50      static final int NONGREEDYCLOSURE = 9;      // *? +?
51      static final int STRING = 10;               // strings
52      static final int BACKREFERENCE = 12;        // back references
53      static final int LOOKAHEAD = 20;            // (?=...)
54      static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
55      static final int LOOKBEHIND = 22;           // (?<=...)
56      static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
57      static final int INDEPENDENT = 24;          // (?>...)
58      static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
59      static final int CONDITION = 26;            // (?(...)yes|no)
60  
61      static final int UTF16_MAX = 0x10ffff;
62  
63      final int type;
64  
65      static Token token_dot;
66      static Token token_0to9;
67      static Token token_wordchars;
68      static Token token_not_0to9;
69      static Token token_not_wordchars;
70      static Token token_spaces;
71      static Token token_not_spaces;
72      static Token token_empty;
73      static Token token_linebeginning;
74      static Token token_linebeginning2;
75      static Token token_lineend;
76      static Token token_stringbeginning;
77      static Token token_stringend;
78      static Token token_stringend2;
79      static Token token_wordedge;
80      static Token token_not_wordedge;
81      static Token token_wordbeginning;
82      static Token token_wordend;
83      static {
84          Token.token_empty = new Token(Token.EMPTY);
85  
86          Token.token_linebeginning = Token.createAnchor('^');
87          Token.token_linebeginning2 = Token.createAnchor('@');
88          Token.token_lineend = Token.createAnchor('$');
89          Token.token_stringbeginning = Token.createAnchor('A');
90          Token.token_stringend = Token.createAnchor('z');
91          Token.token_stringend2 = Token.createAnchor('Z');
92          Token.token_wordedge = Token.createAnchor('b');
93          Token.token_not_wordedge = Token.createAnchor('B');
94          Token.token_wordbeginning = Token.createAnchor('<');
95          Token.token_wordend = Token.createAnchor('>');
96  
97          Token.token_dot = new Token(Token.DOT);
98  
99          Token.token_0to9 = Token.createRange();
100         Token.token_0to9.addRange('0', '9');
101         Token.token_wordchars = Token.createRange();
102         Token.token_wordchars.addRange('0', '9');
103         Token.token_wordchars.addRange('A', 'Z');
104         Token.token_wordchars.addRange('_', '_');
105         Token.token_wordchars.addRange('a', 'z');
106         Token.token_spaces = Token.createRange();
107         Token.token_spaces.addRange('\t', '\t');
108         Token.token_spaces.addRange('\n', '\n');
109         Token.token_spaces.addRange('\f', '\f');
110         Token.token_spaces.addRange('\r', '\r');
111         Token.token_spaces.addRange(' ', ' ');
112 
113         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
114         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
115         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
116     }
117 
118     static Token.ParenToken createLook(int type, Token child) {
119         if (COUNTTOKENS)  Token.tokens ++;
120         return new Token.ParenToken(type, child, 0);
121     }
122     static Token.ParenToken createParen(Token child, int pnumber) {
123         if (COUNTTOKENS)  Token.tokens ++;
124         return new Token.ParenToken(Token.PAREN, child, pnumber);
125     }
126     static Token.ClosureToken createClosure(Token tok) {
127         if (COUNTTOKENS)  Token.tokens ++;
128         return new Token.ClosureToken(Token.CLOSURE, tok);
129     }
130     static Token.ClosureToken createNGClosure(Token tok) {
131         if (COUNTTOKENS)  Token.tokens ++;
132         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
133     }
134     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
135         if (COUNTTOKENS)  Token.tokens ++;
136         return new Token.ConcatToken(tok1, tok2);
137     }
138     static Token.UnionToken createConcat() {
139         if (COUNTTOKENS)  Token.tokens ++;
140         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
141     }
142     static Token.UnionToken createUnion() {
143         if (COUNTTOKENS)  Token.tokens ++;
144         return new Token.UnionToken(Token.UNION);
145     }
146     static Token createEmpty() {
147         return Token.token_empty;
148     }
149     static RangeToken createRange() {
150         if (COUNTTOKENS)  Token.tokens ++;
151         return new RangeToken(Token.RANGE);
152     }
153     static RangeToken createNRange() {
154         if (COUNTTOKENS)  Token.tokens ++;
155         return new RangeToken(Token.NRANGE);
156     }
157     static Token.CharToken createChar(int ch) {
158         if (COUNTTOKENS)  Token.tokens ++;
159         return new Token.CharToken(Token.CHAR, ch);
160     }
161     static private Token.CharToken createAnchor(int ch) {
162         if (COUNTTOKENS)  Token.tokens ++;
163         return new Token.CharToken(Token.ANCHOR, ch);
164     }
165     static Token.StringToken createBackReference(int refno) {
166         if (COUNTTOKENS)  Token.tokens ++;
167         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
168     }
169     static Token.StringToken createString(String str) {
170         if (COUNTTOKENS)  Token.tokens ++;
171         return new Token.StringToken(Token.STRING, str, 0);
172     }
173     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
174         if (COUNTTOKENS)  Token.tokens ++;
175         return new Token.ModifierToken(child, add, mask);
176     }
177     static Token.ConditionToken createCondition(int refno, Token condition,
178                                                 Token yespat, Token nopat) {
179         if (COUNTTOKENS)  Token.tokens ++;
180         return new Token.ConditionToken(refno, condition, yespat, nopat);
181     }
182 
183     protected Token(int type) {
184         this.type = type;
185     }
186 
187     /**
188      * A number of children.
189      */
190     int size() {
191         return 0;
192     }
193     Token getChild(int index) {
194         return null;
195     }
196     void addChild(Token tok) {
197         throw new RuntimeException("Not supported.");
198     }
199 
200                                                 // for RANGE or NRANGE
201     protected void addRange(int start, int end) {
202         throw new RuntimeException("Not supported.");
203     }
204     protected void sortRanges() {
205         throw new RuntimeException("Not supported.");
206     }
207     protected void compactRanges() {
208         throw new RuntimeException("Not supported.");
209     }
210     protected void mergeRanges(Token tok) {
211         throw new RuntimeException("Not supported.");
212     }
213     protected void subtractRanges(Token tok) {
214         throw new RuntimeException("Not supported.");
215     }
216     protected void intersectRanges(Token tok) {
217         throw new RuntimeException("Not supported.");
218     }
219     static Token complementRanges(Token tok) {
220         return RangeToken.complementRanges(tok);
221     }
222 
223 
224     void setMin(int min) {                      // for CLOSURE
225     }
226     void setMax(int max) {                      // for CLOSURE
227     }
228     int getMin() {                              // for CLOSURE
229         return -1;
230     }
231     int getMax() {                              // for CLOSURE
232         return -1;
233     }
234     int getReferenceNumber() {                  // for STRING
235         return 0;
236     }
237     String getString() {                        // for STRING
238         return null;
239     }
240 
241     int getParenNumber() {
242         return 0;
243     }
244     int getChar() {
245         return -1;
246     }
247 
248     public String toString() {
249         return this.toString(0);
250     }
251     public String toString(int options) {
252         return this.type == Token.DOT ? "." : "";
253     }
254 
255     /**
256      * How many characters are needed?
257      */
258     final int getMinLength() {
259         switch (this.type) {
260           case CONCAT:
261             int sum = 0;
262             for (int i = 0;  i < this.size();  i ++)
263                 sum += this.getChild(i).getMinLength();
264             return sum;
265 
266           case CONDITION:
267           case UNION:
268             if (this.size() == 0)
269                 return 0;
270             int ret = this.getChild(0).getMinLength();
271             for (int i = 1;  i < this.size();  i ++) {
272                 int min = this.getChild(i).getMinLength();
273                 if (min < ret)  ret = min;
274             }
275             return ret;
276 
277           case CLOSURE:
278           case NONGREEDYCLOSURE:
279             if (this.getMin() >= 0)
280                 return this.getMin() * this.getChild(0).getMinLength();
281             return 0;
282 
283           case EMPTY:
284           case ANCHOR:
285             return 0;
286 
287           case DOT:
288           case CHAR:
289           case RANGE:
290           case NRANGE:
291             return 1;
292 
293           case INDEPENDENT:
294           case PAREN:
295           case MODIFIERGROUP:
296             return this.getChild(0).getMinLength();
297 
298           case BACKREFERENCE:
299             return 0;                           // *******
300 
301           case STRING:
302             return this.getString().length();
303 
304           case LOOKAHEAD:
305           case NEGATIVELOOKAHEAD:
306           case LOOKBEHIND:
307           case NEGATIVELOOKBEHIND:
308             return 0;                           // ***** Really?
309 
310           default:
311             throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
312         }
313     }
314 
315     final int getMaxLength() {
316         switch (this.type) {
317           case CONCAT:
318             int sum = 0;
319             for (int i = 0;  i < this.size();  i ++) {
320                 int d = this.getChild(i).getMaxLength();
321                 if (d < 0)  return -1;
322                 sum += d;
323             }
324             return sum;
325 
326           case CONDITION:
327           case UNION:
328             if (this.size() == 0)
329                 return 0;
330             int ret = this.getChild(0).getMaxLength();
331             for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
332                 int max = this.getChild(i).getMaxLength();
333                 if (max < 0) {                  // infinity
334                     ret = -1;
335                     break;
336                 }
337                 if (max > ret)  ret = max;
338             }
339             return ret;
340 
341           case CLOSURE:
342           case NONGREEDYCLOSURE:
343             if (this.getMax() >= 0)
344                                                 // When this.child.getMaxLength() < 0,
345                                                 // this returns minus value
346                 return this.getMax() * this.getChild(0).getMaxLength();
347             return -1;
348 
349           case EMPTY:
350           case ANCHOR:
351             return 0;
352 
353           case CHAR:
354             return 1;
355           case DOT:
356           case RANGE:
357           case NRANGE:
358             return 2;
359 
360           case INDEPENDENT:
361           case PAREN:
362           case MODIFIERGROUP:
363             return this.getChild(0).getMaxLength();
364 
365           case BACKREFERENCE:
366             return -1;                          // ******
367 
368           case STRING:
369             return this.getString().length();
370 
371           case LOOKAHEAD:
372           case NEGATIVELOOKAHEAD:
373           case LOOKBEHIND:
374           case NEGATIVELOOKBEHIND:
375             return 0;                           // ***** Really?
376 
377           default:
378             throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
379         }
380     }
381 
382     static final int FC_CONTINUE = 0;
383     static final int FC_TERMINAL = 1;
384     static final int FC_ANY = 2;
385     private static final boolean isSet(int options, int flag) {
386         return (options & flag) == flag;
387     }
388     final int analyzeFirstCharacter(RangeToken result, int options) {
389         switch (this.type) {
390           case CONCAT:
391             int ret = FC_CONTINUE;
392             for (int i = 0;  i < this.size();  i ++)
393                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
394                     break;
395             return ret;
396 
397           case UNION:
398             if (this.size() == 0)
399                 return FC_CONTINUE;
400             /*
401              *  a|b|c -> FC_TERMINAL
402              *  a|.|c -> FC_ANY
403              *  a|b|  -> FC_CONTINUE
404              */
405             int ret2 = FC_CONTINUE;
406             boolean hasEmpty = false;
407             for (int i = 0;  i < this.size();  i ++) {
408                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
409                 if (ret2 == FC_ANY)
410                     break;
411                 else if (ret2 == FC_CONTINUE)
412                     hasEmpty = true;
413             }
414             return hasEmpty ? FC_CONTINUE : ret2;
415 
416           case CONDITION:
417             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
418             if (this.size() == 1)  return FC_CONTINUE;
419             if (ret3 == FC_ANY)  return ret3;
420             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
421             if (ret4 == FC_ANY)  return ret4;
422             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
423 
424           case CLOSURE:
425           case NONGREEDYCLOSURE:
426             this.getChild(0).analyzeFirstCharacter(result, options);
427             return FC_CONTINUE;
428 
429           case EMPTY:
430           case ANCHOR:
431             return FC_CONTINUE;
432 
433           case CHAR:
434             int ch = this.getChar();
435             result.addRange(ch, ch);
436             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
437                 ch = Character.toUpperCase((char)ch);
438                 result.addRange(ch, ch);
439                 ch = Character.toLowerCase((char)ch);
440                 result.addRange(ch, ch);
441             }
442             return FC_TERMINAL;
443 
444           case DOT:
445               return FC_ANY;
446 
447           case RANGE:
448             result.mergeRanges(this);
449             return FC_TERMINAL;
450 
451           case NRANGE:                          // ****
452             result.mergeRanges(Token.complementRanges(this));
453             return FC_TERMINAL;
454 
455           case INDEPENDENT:
456           case PAREN:
457             return this.getChild(0).analyzeFirstCharacter(result, options);
458 
459           case MODIFIERGROUP:
460             options |= ((ModifierToken)this).getOptions();
461             options &= ~((ModifierToken)this).getOptionsMask();
462             return this.getChild(0).analyzeFirstCharacter(result, options);
463 
464           case BACKREFERENCE:
465             result.addRange(0, UTF16_MAX);  // **** We can not optimize.
466             return FC_ANY;
467 
468           case STRING:
469             int cha = this.getString().charAt(0);
470             int ch2;
471             if (REUtil.isHighSurrogate(cha)
472                 && this.getString().length() >= 2
473                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
474                 cha = REUtil.composeFromSurrogates(cha, ch2);
475             result.addRange(cha, cha);
476             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
477                 cha = Character.toUpperCase((char)cha);
478                 result.addRange(cha, cha);
479                 cha = Character.toLowerCase((char)cha);
480                 result.addRange(cha, cha);
481             }
482             return FC_TERMINAL;
483 
484           case LOOKAHEAD:
485           case NEGATIVELOOKAHEAD:
486           case LOOKBEHIND:
487           case NEGATIVELOOKBEHIND:
488             return FC_CONTINUE;
489 
490           default:
491             throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
492         }
493     }
494 
495     private final boolean isShorterThan(Token tok) {
496         if (tok == null)  return false;
497         /*
498         int mylength;
499         if (this.type == STRING)  mylength = this.getString().length();
500         else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
501         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
502         int otherlength;
503         if (tok.type == STRING)  otherlength = tok.getString().length();
504         else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
505         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
506         */
507         int mylength;
508         if (this.type == STRING)  mylength = this.getString().length();
509         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
510         int otherlength;
511         if (tok.type == STRING)  otherlength = tok.getString().length();
512         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
513         return mylength < otherlength;
514     }
515 
516     static class FixedStringContainer {
517         Token token = null;
518         int options = 0;
519         FixedStringContainer() {
520         }
521     }
522 
523     final void findFixedString(FixedStringContainer container, int options) {
524         switch (this.type) {
525           case CONCAT:
526             Token prevToken = null;
527             int prevOptions = 0;
528             for (int i = 0;  i < this.size();  i ++) {
529                 this.getChild(i).findFixedString(container, options);
530                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
531                     prevToken = container.token;
532                     prevOptions = container.options;
533                 }
534             }
535             container.token = prevToken;
536             container.options = prevOptions;
537             return;
538 
539           case UNION:
540           case CLOSURE:
541           case NONGREEDYCLOSURE:
542           case EMPTY:
543           case ANCHOR:
544           case RANGE:
545           case DOT:
546           case NRANGE:
547           case BACKREFERENCE:
548           case LOOKAHEAD:
549           case NEGATIVELOOKAHEAD:
550           case LOOKBEHIND:
551           case NEGATIVELOOKBEHIND:
552           case CONDITION:
553             container.token = null;
554             return;
555 
556           case CHAR:                            // Ignore CHAR tokens.
557             container.token = null;             // **
558             return;                             // **
559 
560           case STRING:
561             container.token = this;
562             container.options = options;
563             return;
564 
565           case INDEPENDENT:
566           case PAREN:
567             this.getChild(0).findFixedString(container, options);
568             return;
569 
570           case MODIFIERGROUP:
571             options |= ((ModifierToken)this).getOptions();
572             options &= ~((ModifierToken)this).getOptionsMask();
573             this.getChild(0).findFixedString(container, options);
574             return;
575 
576           default:
577             throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
578         }
579     }
580 
581     boolean match(int ch) {
582         throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
583     }
584 
585     // ------------------------------------------------------
586     private final static Hashtable categories = new Hashtable();
587     private final static Hashtable categories2 = new Hashtable();
588     private static final String[] categoryNames = {
589         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
590         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
591         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
592         "Pi", "Pf",  // 29, 30
593         "L", "M", "N", "Z", "C", "P", "S",      // 31-37
594     };
595 
596     // Schema Rec. {Datatypes} - Punctuation
597     static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
598     static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
599     static final int CHAR_LETTER = 31;
600     static final int CHAR_MARK = 32;
601     static final int CHAR_NUMBER = 33;
602     static final int CHAR_SEPARATOR = 34;
603     static final int CHAR_OTHER = 35;
604     static final int CHAR_PUNCTUATION = 36;
605     static final int CHAR_SYMBOL = 37;
606 
607     //blockNames in UNICODE 3.1 that supported by XML Schema REC
608     private static final String[] blockNames = {
609         /*0000..007F;*/ "Basic Latin",
610         /*0080..00FF;*/ "Latin-1 Supplement",
611         /*0100..017F;*/ "Latin Extended-A",
612         /*0180..024F;*/ "Latin Extended-B",
613         /*0250..02AF;*/ "IPA Extensions",
614         /*02B0..02FF;*/ "Spacing Modifier Letters",
615         /*0300..036F;*/ "Combining Diacritical Marks",
616         /*0370..03FF;*/ "Greek",
617         /*0400..04FF;*/ "Cyrillic",
618         /*0530..058F;*/ "Armenian",
619         /*0590..05FF;*/ "Hebrew",
620         /*0600..06FF;*/ "Arabic",
621         /*0700..074F;*/ "Syriac",
622         /*0780..07BF;*/ "Thaana",
623         /*0900..097F;*/ "Devanagari",
624         /*0980..09FF;*/ "Bengali",
625         /*0A00..0A7F;*/ "Gurmukhi",
626         /*0A80..0AFF;*/ "Gujarati",
627         /*0B00..0B7F;*/ "Oriya",
628         /*0B80..0BFF;*/ "Tamil",
629         /*0C00..0C7F;*/ "Telugu",
630         /*0C80..0CFF;*/ "Kannada",
631         /*0D00..0D7F;*/ "Malayalam",
632         /*0D80..0DFF;*/ "Sinhala",
633         /*0E00..0E7F;*/ "Thai",
634         /*0E80..0EFF;*/ "Lao",
635         /*0F00..0FFF;*/ "Tibetan",
636         /*1000..109F;*/ "Myanmar",
637         /*10A0..10FF;*/ "Georgian",
638         /*1100..11FF;*/ "Hangul Jamo",
639         /*1200..137F;*/ "Ethiopic",
640         /*13A0..13FF;*/ "Cherokee",
641         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
642         /*1680..169F;*/ "Ogham",
643         /*16A0..16FF;*/ "Runic",
644         /*1780..17FF;*/ "Khmer",
645         /*1800..18AF;*/ "Mongolian",
646         /*1E00..1EFF;*/ "Latin Extended Additional",
647         /*1F00..1FFF;*/ "Greek Extended",
648         /*2000..206F;*/ "General Punctuation",
649         /*2070..209F;*/ "Superscripts and Subscripts",
650         /*20A0..20CF;*/ "Currency Symbols",
651         /*20D0..20FF;*/ "Combining Marks for Symbols",
652         /*2100..214F;*/ "Letterlike Symbols",
653         /*2150..218F;*/ "Number Forms",
654         /*2190..21FF;*/ "Arrows",
655         /*2200..22FF;*/ "Mathematical Operators",
656         /*2300..23FF;*/ "Miscellaneous Technical",
657         /*2400..243F;*/ "Control Pictures",
658         /*2440..245F;*/ "Optical Character Recognition",
659         /*2460..24FF;*/ "Enclosed Alphanumerics",
660         /*2500..257F;*/ "Box Drawing",
661         /*2580..259F;*/ "Block Elements",
662         /*25A0..25FF;*/ "Geometric Shapes",
663         /*2600..26FF;*/ "Miscellaneous Symbols",
664         /*2700..27BF;*/ "Dingbats",
665         /*2800..28FF;*/ "Braille Patterns",
666         /*2E80..2EFF;*/ "CJK Radicals Supplement",
667         /*2F00..2FDF;*/ "Kangxi Radicals",
668         /*2FF0..2FFF;*/ "Ideographic Description Characters",
669         /*3000..303F;*/ "CJK Symbols and Punctuation",
670         /*3040..309F;*/ "Hiragana",
671         /*30A0..30FF;*/ "Katakana",
672         /*3100..312F;*/ "Bopomofo",
673         /*3130..318F;*/ "Hangul Compatibility Jamo",
674         /*3190..319F;*/ "Kanbun",
675         /*31A0..31BF;*/ "Bopomofo Extended",
676         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
677         /*3300..33FF;*/ "CJK Compatibility",
678         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
679         /*4E00..9FFF;*/ "CJK Unified Ideographs",
680         /*A000..A48F;*/ "Yi Syllables",
681         /*A490..A4CF;*/ "Yi Radicals",
682         /*AC00..D7A3;*/ "Hangul Syllables",
683         /*E000..F8FF;*/ "Private Use",
684         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
685         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
686         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
687         /*FE20..FE2F;*/ "Combining Half Marks",
688         /*FE30..FE4F;*/ "CJK Compatibility Forms",
689         /*FE50..FE6F;*/ "Small Form Variants",
690         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
691         /*FEFF..FEFF;*/ "Specials",
692         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
693          //missing Specials add manually
694         /*10300..1032F;*/ "Old Italic",         // 84
695         /*10330..1034F;*/ "Gothic",
696         /*10400..1044F;*/ "Deseret",
697         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
698         /*1D100..1D1FF;*/ "Musical Symbols",
699         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
700         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
701         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
702         /*E0000..E007F;*/ "Tags",
703         //missing 2 private use add manually
704 
705     };
706     //ADD THOSE MANUALLY
707     //F0000..FFFFD; "Private Use",
708     //100000..10FFFD; "Private Use"
709     //FFF0..FFFD; "Specials",
710     static final String blockRanges =
711        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
712         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
713         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
714         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
715         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
716         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
717         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
718         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
719         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
720         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
721         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
722     static final int[] nonBMPBlockRanges = {
723         0x10300, 0x1032F,       // 84
724         0x10330, 0x1034F,
725         0x10400, 0x1044F,
726         0x1D000, 0x1D0FF,
727         0x1D100, 0x1D1FF,
728         0x1D400, 0x1D7FF,
729         0x20000, 0x2A6D6,
730         0x2F800, 0x2FA1F,
731         0xE0000, 0xE007F
732     };
733     private static final int NONBMP_BLOCK_START = 84;
734 
735     static protected RangeToken getRange(String name, boolean positive) {
736         if (Token.categories.size() == 0) {
737             synchronized (Token.categories) {
738                 Token[] ranges = new Token[Token.categoryNames.length];
739                 for (int i = 0;  i < ranges.length;  i ++) {
740                     ranges[i] = Token.createRange();
741                 }
742                 int type;
743                 for (int i = 0;  i < 0x10000;  i ++) {
744                     type = Character.getType((char)i);
745                     if (type == Character.START_PUNCTUATION ||
746                         type == Character.END_PUNCTUATION) {
747                         //build table of Pi values
748                         if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
749                             i == 0x201F || i == 0x2039) {
750                             type = CHAR_INIT_QUOTE;
751                         }
752                         //build table of Pf values
753                         if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
754                             type = CHAR_FINAL_QUOTE;
755                         }
756                     }
757                     ranges[type].addRange(i, i);
758                     switch (type) {
759                       case Character.UPPERCASE_LETTER:
760                       case Character.LOWERCASE_LETTER:
761                       case Character.TITLECASE_LETTER:
762                       case Character.MODIFIER_LETTER:
763                       case Character.OTHER_LETTER:
764                         type = CHAR_LETTER;
765                         break;
766                       case Character.NON_SPACING_MARK:
767                       case Character.COMBINING_SPACING_MARK:
768                       case Character.ENCLOSING_MARK:
769                         type = CHAR_MARK;
770                         break;
771                       case Character.DECIMAL_DIGIT_NUMBER:
772                       case Character.LETTER_NUMBER:
773                       case Character.OTHER_NUMBER:
774                         type = CHAR_NUMBER;
775                         break;
776                       case Character.SPACE_SEPARATOR:
777                       case Character.LINE_SEPARATOR:
778                       case Character.PARAGRAPH_SEPARATOR:
779                         type = CHAR_SEPARATOR;
780                         break;
781                       case Character.CONTROL:
782                       case Character.FORMAT:
783                       case Character.SURROGATE:
784                       case Character.PRIVATE_USE:
785                       case Character.UNASSIGNED:
786                         type = CHAR_OTHER;
787                         break;
788                       case Character.CONNECTOR_PUNCTUATION:
789                       case Character.DASH_PUNCTUATION:
790                       case Character.START_PUNCTUATION:
791                       case Character.END_PUNCTUATION:
792                       case CHAR_INIT_QUOTE:
793                       case CHAR_FINAL_QUOTE:
794                       case Character.OTHER_PUNCTUATION:
795                         type = CHAR_PUNCTUATION;
796                         break;
797                       case Character.MATH_SYMBOL:
798                       case Character.CURRENCY_SYMBOL:
799                       case Character.MODIFIER_SYMBOL:
800                       case Character.OTHER_SYMBOL:
801                         type = CHAR_SYMBOL;
802                         break;
803                       default:
804                         throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
805                     }
806                     ranges[type].addRange(i, i);
807                 } // for all characters
808                 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
809 
810                 for (int i = 0;  i < ranges.length;  i ++) {
811                     if (Token.categoryNames[i] != null) {
812                         if (i == Character.UNASSIGNED) { // Unassigned
813                             ranges[i].addRange(0x10000, Token.UTF16_MAX);
814                         }
815                         Token.categories.put(Token.categoryNames[i], ranges[i]);
816                         Token.categories2.put(Token.categoryNames[i],
817                                               Token.complementRanges(ranges[i]));
818                     }
819                 }
820                 //REVISIT: do we really need to support block names as in Unicode 3.1
821                 //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
822                 //
823                 StringBuffer buffer = new StringBuffer(50);
824                 for (int i = 0;  i < Token.blockNames.length;  i ++) {
825                     Token r1 = Token.createRange();
826                     int location;
827                     if (i < NONBMP_BLOCK_START) {
828                         location = i*2;
829                         int rstart = Token.blockRanges.charAt(location);
830                         int rend = Token.blockRanges.charAt(location+1);
831                         //DEBUGING
832                         //System.out.println(n+" " +Integer.toHexString(rstart)
833                         //                     +"-"+ Integer.toHexString(rend));
834                         r1.addRange(rstart, rend);
835                     } else {
836                         location = (i - NONBMP_BLOCK_START) * 2;
837                         r1.addRange(Token.nonBMPBlockRanges[location],
838                                     Token.nonBMPBlockRanges[location + 1]);
839                     }
840                     String n = Token.blockNames[i];
841                     if (n.equals("Specials"))
842                         r1.addRange(0xfff0, 0xfffd);
843                     if (n.equals("Private Use")) {
844                         r1.addRange(0xF0000,0xFFFFD);
845                         r1.addRange(0x100000,0x10FFFD);
846                     }
847                     Token.categories.put(n, r1);
848                     Token.categories2.put(n, Token.complementRanges(r1));
849                     buffer.setLength(0);
850                     buffer.append("Is");
851                     if (n.indexOf(' ') >= 0) {
852                         for (int ci = 0;  ci < n.length();  ci ++)
853                             if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
854                     }
855                     else {
856                         buffer.append(n);
857                     }
858                     Token.setAlias(buffer.toString(), n, true);
859                 }
860 
861                 // TR#18 1.2
862                 Token.setAlias("ASSIGNED", "Cn", false);
863                 Token.setAlias("UNASSIGNED", "Cn", true);
864                 Token all = Token.createRange();
865                 all.addRange(0, Token.UTF16_MAX);
866                 Token.categories.put("ALL", all);
867                 Token.categories2.put("ALL", Token.complementRanges(all));
868                 Token.registerNonXS("ASSIGNED");
869                 Token.registerNonXS("UNASSIGNED");
870                 Token.registerNonXS("ALL");
871 
872                 Token isalpha = Token.createRange();
873                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
874                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
875                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
876                 Token.categories.put("IsAlpha", isalpha);
877                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
878                 Token.registerNonXS("IsAlpha");
879 
880                 Token isalnum = Token.createRange();
881                 isalnum.mergeRanges(isalpha);   // Lu Ll Lo
882                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
883                 Token.categories.put("IsAlnum", isalnum);
884                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
885                 Token.registerNonXS("IsAlnum");
886 
887                 Token isspace = Token.createRange();
888                 isspace.mergeRanges(Token.token_spaces);
889                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
890                 Token.categories.put("IsSpace", isspace);
891                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
892                 Token.registerNonXS("IsSpace");
893 
894                 Token isword = Token.createRange();
895                 isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
896                 isword.addRange('_', '_');
897                 Token.categories.put("IsWord", isword);
898                 Token.categories2.put("IsWord", Token.complementRanges(isword));
899                 Token.registerNonXS("IsWord");
900 
901                 Token isascii = Token.createRange();
902                 isascii.addRange(0, 127);
903                 Token.categories.put("IsASCII", isascii);
904                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
905                 Token.registerNonXS("IsASCII");
906 
907                 Token isnotgraph = Token.createRange();
908                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
909                 isnotgraph.addRange(' ', ' ');
910                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
911                 Token.categories2.put("IsGraph", isnotgraph);
912                 Token.registerNonXS("IsGraph");
913 
914                 Token isxdigit = Token.createRange();
915                 isxdigit.addRange('0', '9');
916                 isxdigit.addRange('A', 'F');
917                 isxdigit.addRange('a', 'f');
918                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
919                 Token.categories2.put("IsXDigit", isxdigit);
920                 Token.registerNonXS("IsXDigit");
921 
922                 Token.setAlias("IsDigit", "Nd", true);
923                 Token.setAlias("IsUpper", "Lu", true);
924                 Token.setAlias("IsLower", "Ll", true);
925                 Token.setAlias("IsCntrl", "C", true);
926                 Token.setAlias("IsPrint", "C", false);
927                 Token.setAlias("IsPunct", "P", true);
928                 Token.registerNonXS("IsDigit");
929                 Token.registerNonXS("IsUpper");
930                 Token.registerNonXS("IsLower");
931                 Token.registerNonXS("IsCntrl");
932                 Token.registerNonXS("IsPrint");
933                 Token.registerNonXS("IsPunct");
934 
935                 Token.setAlias("alpha", "IsAlpha", true);
936                 Token.setAlias("alnum", "IsAlnum", true);
937                 Token.setAlias("ascii", "IsASCII", true);
938                 Token.setAlias("cntrl", "IsCntrl", true);
939                 Token.setAlias("digit", "IsDigit", true);
940                 Token.setAlias("graph", "IsGraph", true);
941                 Token.setAlias("lower", "IsLower", true);
942                 Token.setAlias("print", "IsPrint", true);
943                 Token.setAlias("punct", "IsPunct", true);
944                 Token.setAlias("space", "IsSpace", true);
945                 Token.setAlias("upper", "IsUpper", true);
946                 Token.setAlias("word", "IsWord", true); // Perl extension
947                 Token.setAlias("xdigit", "IsXDigit", true);
948                 Token.registerNonXS("alpha");
949                 Token.registerNonXS("alnum");
950                 Token.registerNonXS("ascii");
951                 Token.registerNonXS("cntrl");
952                 Token.registerNonXS("digit");
953                 Token.registerNonXS("graph");
954                 Token.registerNonXS("lower");
955                 Token.registerNonXS("print");
956                 Token.registerNonXS("punct");
957                 Token.registerNonXS("space");
958                 Token.registerNonXS("upper");
959                 Token.registerNonXS("word");
960                 Token.registerNonXS("xdigit");
961             } // synchronized
962         } // if null
963         RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
964             : (RangeToken)Token.categories2.get(name);
965         //if (tok == null) System.out.println(name);
966         return tok;
967     }
968     static protected RangeToken getRange(String name, boolean positive, boolean xs) {
969         RangeToken range = Token.getRange(name, positive);
970         if (xs && range != null && Token.isRegisterNonXS(name))
971             range = null;
972         return range;
973     }
974 
975     static Hashtable nonxs = null;
976     /**
977      * This method is called by only getRange().
978      * So this method need not MT-safe.
979      */
980     static protected void registerNonXS(String name) {
981         if (Token.nonxs == null)
982             Token.nonxs = new Hashtable();
983         Token.nonxs.put(name, name);
984     }
985     static protected boolean isRegisterNonXS(String name) {
986         if (Token.nonxs == null)
987             return false;
988         //DEBUG
989         //System.err.println("isRegisterNonXS: "+name);
990         return Token.nonxs.containsKey(name);
991     }
992 
993     private static void setAlias(String newName, String name, boolean positive) {
994         Token t1 = (Token)Token.categories.get(name);
995         Token t2 = (Token)Token.categories2.get(name);
996         if (positive) {
997             Token.categories.put(newName, t1);
998             Token.categories2.put(newName, t2);
999         } else {
1000             Token.categories2.put(newName, t1);
1001             Token.categories.put(newName, t2);
1002         }
1003     }
1004 
1005     // ------------------------------------------------------
1006 
1007     static final String viramaString =
1008     "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1009     +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1010     +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1011     +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1012     +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1013     +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1014     +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1015     +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1016     +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1017     +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1018     +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1019 
1020     static private Token token_grapheme = null;
1021     static synchronized Token getGraphemePattern() {
1022         if (Token.token_grapheme != null)
1023             return Token.token_grapheme;
1024 
1025         Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
1026         base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1027         base_char.subtractRanges(Token.getRange("M", true));
1028         base_char.subtractRanges(Token.getRange("C", true));
1029 
1030         Token virama = Token.createRange();
1031         for (int i = 0;  i < Token.viramaString.length(); i++) {
1032             virama.addRange(i, i);
1033         }
1034 
1035         Token combiner_wo_virama = Token.createRange();
1036         combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1037         combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1038         combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1039 
1040         Token left = Token.createUnion();       // base_char?
1041         left.addChild(base_char);
1042         left.addChild(Token.token_empty);
1043 
1044         Token foo = Token.createUnion();
1045         foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1046         foo.addChild(combiner_wo_virama);
1047 
1048         foo = Token.createClosure(foo);
1049 
1050         foo = Token.createConcat(left, foo);
1051 
1052         Token.token_grapheme = foo;
1053         return Token.token_grapheme;
1054     }
1055 
1056     /**
1057      * Combing Character Sequence in Perl 5.6.
1058      */
1059     static private Token token_ccs = null;
1060     static synchronized Token getCombiningCharacterSequence() {
1061         if (Token.token_ccs != null)
1062             return Token.token_ccs;
1063 
1064         Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1065         foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1066         Token.token_ccs = foo;
1067         return Token.token_ccs;
1068     }
1069 
1070     // ------------------------------------------------------
1071 
1072     // ------------------------------------------------------
1073     /**
1074      * This class represents a node in parse tree.
1075      */
1076     static class StringToken extends Token implements java.io.Serializable {
1077 
1078         private static final long serialVersionUID = -4614366944218504172L;
1079 
1080         String string;
1081         final int refNumber;
1082 
1083         StringToken(int type, String str, int n) {
1084             super(type);
1085             this.string = str;
1086             this.refNumber = n;
1087         }
1088 
1089         int getReferenceNumber() {              // for STRING
1090             return this.refNumber;
1091         }
1092         String getString() {                    // for STRING
1093             return this.string;
1094         }
1095 
1096         public String toString(int options) {
1097             if (this.type == BACKREFERENCE)
1098                 return "\\"+this.refNumber;
1099             else
1100                 return REUtil.quoteMeta(this.string);
1101         }
1102     }
1103 
1104     /**
1105      * This class represents a node in parse tree.
1106      */
1107     static class ConcatToken extends Token implements java.io.Serializable {
1108 
1109         private static final long serialVersionUID = 8717321425541346381L;
1110 
1111         final Token child;
1112         final Token child2;
1113 
1114         ConcatToken(Token t1, Token t2) {
1115             super(Token.CONCAT);
1116             this.child = t1;
1117             this.child2 = t2;
1118         }
1119 
1120         int size() {
1121             return 2;
1122         }
1123         Token getChild(int index) {
1124             return index == 0 ? this.child : this.child2;
1125         }
1126 
1127         public String toString(int options) {
1128             String ret;
1129             if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1130                 ret = this.child.toString(options)+"+";
1131             } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1132                 ret = this.child.toString(options)+"+?";
1133             } else
1134                 ret = this.child.toString(options)+this.child2.toString(options);
1135             return ret;
1136         }
1137     }
1138 
1139     /**
1140      * This class represents a node in parse tree.
1141      */
1142     static class CharToken extends Token implements java.io.Serializable {
1143 
1144         private static final long serialVersionUID = -4394272816279496989L;
1145 
1146         final int chardata;
1147 
1148         CharToken(int type, int ch) {
1149             super(type);
1150             this.chardata = ch;
1151         }
1152 
1153         int getChar() {
1154             return this.chardata;
1155         }
1156 
1157         public String toString(int options) {
1158             String ret;
1159             switch (this.type) {
1160               case CHAR:
1161                 switch (this.chardata) {
1162                   case '|':  case '*':  case '+':  case '?':
1163                   case '(':  case ')':  case '.':  case '[':
1164                   case '{':  case '\\':
1165                     ret = "\\"+(char)this.chardata;
1166                     break;
1167                   case '\f':  ret = "\\f";  break;
1168                   case '\n':  ret = "\\n";  break;
1169                   case '\r':  ret = "\\r";  break;
1170                   case '\t':  ret = "\\t";  break;
1171                   case 0x1b:  ret = "\\e";  break;
1172                     //case 0x0b:  ret = "\\v";  break;
1173                   default:
1174                     if (this.chardata >= 0x10000) {
1175                         String pre = "0"+Integer.toHexString(this.chardata);
1176                         ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1177                     } else
1178                         ret = ""+(char)this.chardata;
1179                 }
1180                 break;
1181 
1182               case ANCHOR:
1183                 if (this == Token.token_linebeginning || this == Token.token_lineend)
1184                     ret = ""+(char)this.chardata;
1185                 else
1186                     ret = "\\"+(char)this.chardata;
1187                 break;
1188 
1189               default:
1190                 ret = null;
1191             }
1192             return ret;
1193         }
1194 
1195         boolean match(int ch) {
1196             if (this.type == CHAR) {
1197                 return ch == this.chardata;
1198             } else
1199                 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1200         }
1201     }
1202 
1203     /**
1204      * This class represents a node in parse tree.
1205      */
1206     static class ClosureToken extends Token implements java.io.Serializable {
1207 
1208         private static final long serialVersionUID = 1308971930673997452L;
1209 
1210         int min;
1211         int max;
1212         final Token child;
1213 
1214         ClosureToken(int type, Token tok) {
1215             super(type);
1216             this.child = tok;
1217             this.setMin(-1);
1218             this.setMax(-1);
1219         }
1220 
1221         int size() {
1222             return 1;
1223         }
1224         Token getChild(int index) {
1225             return this.child;
1226         }
1227 
1228         final void setMin(int min) {
1229             this.min = min;
1230         }
1231         final void setMax(int max) {
1232             this.max = max;
1233         }
1234         final int getMin() {
1235             return this.min;
1236         }
1237         final int getMax() {
1238             return this.max;
1239         }
1240 
1241         public String toString(int options) {
1242             String ret;
1243             if (this.type == CLOSURE) {
1244                 if (this.getMin() < 0 && this.getMax() < 0) {
1245                     ret = this.child.toString(options)+"*";
1246                 } else if (this.getMin() == this.getMax()) {
1247                     ret = this.child.toString(options)+"{"+this.getMin()+"}";
1248                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1249                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1250                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1251                     ret = this.child.toString(options)+"{"+this.getMin()+",}";
1252                 } else
1253                     throw new RuntimeException("Token#toString(): CLOSURE "
1254                                                +this.getMin()+", "+this.getMax());
1255             } else {
1256                 if (this.getMin() < 0 && this.getMax() < 0) {
1257                     ret = this.child.toString(options)+"*?";
1258                 } else if (this.getMin() == this.getMax()) {
1259                     ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1260                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1261                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1262                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1263                     ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1264                 } else
1265                     throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1266                                                +this.getMin()+", "+this.getMax());
1267             }
1268             return ret;
1269         }
1270     }
1271 
1272     /**
1273      * This class represents a node in parse tree.
1274      */
1275     static class ParenToken extends Token implements java.io.Serializable {
1276 
1277         private static final long serialVersionUID = -5938014719827987704L;
1278 
1279         final Token child;
1280         final int parennumber;
1281 
1282         ParenToken(int type, Token tok, int paren) {
1283             super(type);
1284             this.child = tok;
1285             this.parennumber = paren;
1286         }
1287 
1288         int size() {
1289             return 1;
1290         }
1291         Token getChild(int index) {
1292             return this.child;
1293         }
1294 
1295         int getParenNumber() {
1296             return this.parennumber;
1297         }
1298 
1299         public String toString(int options) {
1300             String ret = null;
1301             switch (this.type) {
1302               case PAREN:
1303                 if (this.parennumber == 0) {
1304                     ret = "(?:"+this.child.toString(options)+")";
1305                 } else {
1306                     ret = "("+this.child.toString(options)+")";
1307                 }
1308                 break;
1309 
1310               case LOOKAHEAD:
1311                 ret = "(?="+this.child.toString(options)+")";
1312                 break;
1313               case NEGATIVELOOKAHEAD:
1314                 ret = "(?!"+this.child.toString(options)+")";
1315                 break;
1316               case LOOKBEHIND:
1317                 ret = "(?<="+this.child.toString(options)+")";
1318                 break;
1319               case NEGATIVELOOKBEHIND:
1320                 ret = "(?<!"+this.child.toString(options)+")";
1321                 break;
1322               case INDEPENDENT:
1323                 ret = "(?>"+this.child.toString(options)+")";
1324                 break;
1325             }
1326             return ret;
1327         }
1328     }
1329 
1330     /**
1331      * (?(condition)yes-pattern|no-pattern)
1332      */
1333     static class ConditionToken extends Token implements java.io.Serializable {
1334 
1335         private static final long serialVersionUID = 4353765277910594411L;
1336 
1337         final int refNumber;
1338         final Token condition;
1339         final Token yes;
1340         final Token no;
1341         ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1342             super(Token.CONDITION);
1343             this.refNumber = refno;
1344             this.condition = cond;
1345             this.yes = yespat;
1346             this.no = nopat;
1347         }
1348         int size() {
1349             return this.no == null ? 1 : 2;
1350         }
1351         Token getChild(int index) {
1352             if (index == 0)  return this.yes;
1353             if (index == 1)  return this.no;
1354             throw new RuntimeException("Internal Error: "+index);
1355         }
1356 
1357         public String toString(int options) {
1358             String ret;
1359             if (refNumber > 0) {
1360                 ret = "(?("+refNumber+")";
1361             } else if (this.condition.type == Token.ANCHOR) {
1362                 ret = "(?("+this.condition+")";
1363             } else {
1364                 ret = "(?"+this.condition;
1365             }
1366 
1367             if (this.no == null) {
1368                 ret += this.yes+")";
1369             } else {
1370                 ret += this.yes+"|"+this.no+")";
1371             }
1372             return ret;
1373         }
1374     }
1375 
1376     /**
1377      * (ims-ims: .... )
1378      */
1379     static class ModifierToken extends Token implements java.io.Serializable {
1380 
1381         private static final long serialVersionUID = -9114536559696480356L;
1382 
1383         final Token child;
1384         final int add;
1385         final int mask;
1386 
1387         ModifierToken(Token tok, int add, int mask) {
1388             super(Token.MODIFIERGROUP);
1389             this.child = tok;
1390             this.add = add;
1391             this.mask = mask;
1392         }
1393 
1394         int size() {
1395             return 1;
1396         }
1397         Token getChild(int index) {
1398             return this.child;
1399         }
1400 
1401         int getOptions() {
1402             return this.add;
1403         }
1404         int getOptionsMask() {
1405             return this.mask;
1406         }
1407 
1408         public String toString(int options) {
1409             return "(?"
1410                 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1411                 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1412                 +":"
1413                 +this.child.toString(options)
1414                 +")";
1415         }
1416     }
1417 
1418     /**
1419      * This class represents a node in parse tree.
1420      * for UNION or CONCAT.
1421      */
1422     static class UnionToken extends Token implements java.io.Serializable {
1423 
1424         private static final long serialVersionUID = -2568843945989489861L;
1425 
1426         Vector children;
1427 
1428         UnionToken(int type) {
1429             super(type);
1430         }
1431 
1432         void addChild(Token tok) {
1433             if (tok == null)  return;
1434             if (this.children == null)  this.children = new Vector();
1435             if (this.type == UNION) {
1436                 this.children.addElement(tok);
1437                 return;
1438             }
1439                                                 // This is CONCAT, and new child is CONCAT.
1440             if (tok.type == CONCAT) {
1441                 for (int i = 0;  i < tok.size();  i ++)
1442                     this.addChild(tok.getChild(i)); // Recursion
1443                 return;
1444             }
1445             int size = this.children.size();
1446             if (size == 0) {
1447                 this.children.addElement(tok);
1448                 return;
1449             }
1450             Token previous = (Token)this.children.elementAt(size-1);
1451             if (!((previous.type == CHAR || previous.type == STRING)
1452                   && (tok.type == CHAR || tok.type == STRING))) {
1453                 this.children.addElement(tok);
1454                 return;
1455             }
1456 
1457             //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1458 
1459             StringBuffer buffer;
1460             int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1461             if (previous.type == CHAR) {        // Replace previous token by STRING
1462                 buffer = new StringBuffer(2 + nextMaxLength);
1463                 int ch = previous.getChar();
1464                 if (ch >= 0x10000)
1465                     buffer.append(REUtil.decomposeToSurrogates(ch));
1466                 else
1467                     buffer.append((char)ch);
1468                 previous = Token.createString(null);
1469                 this.children.setElementAt(previous, size-1);
1470             } else {                            // STRING
1471                 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1472                 buffer.append(previous.getString());
1473             }
1474 
1475             if (tok.type == CHAR) {
1476                 int ch = tok.getChar();
1477                 if (ch >= 0x10000)
1478                     buffer.append(REUtil.decomposeToSurrogates(ch));
1479                 else
1480                     buffer.append((char)ch);
1481             } else {
1482                 buffer.append(tok.getString());
1483             }
1484 
1485             ((StringToken)previous).string = new String(buffer);
1486         }
1487 
1488         int size() {
1489             return this.children == null ? 0 : this.children.size();
1490         }
1491         Token getChild(int index) {
1492             return (Token)this.children.elementAt(index);
1493         }
1494 
1495         public String toString(int options) {
1496             String ret;
1497             if (this.type == CONCAT) {
1498                 if (this.children.size() == 2) {
1499                     Token ch = this.getChild(0);
1500                     Token ch2 = this.getChild(1);
1501                     if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1502                         ret = ch.toString(options)+"+";
1503                     } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1504                         ret = ch.toString(options)+"+?";
1505                     } else
1506                         ret = ch.toString(options)+ch2.toString(options);
1507                 } else {
1508                     StringBuffer sb = new StringBuffer();
1509                     for (int i = 0;  i < this.children.size();  i ++) {
1510                         sb.append(((Token)this.children.elementAt(i)).toString(options));
1511                     }
1512                     ret = new String(sb);
1513                 }
1514                 return ret;
1515             }
1516             if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1517                 ret = this.getChild(0).toString(options)+"?";
1518             } else if (this.children.size() == 2
1519                        && this.getChild(0).type == EMPTY) {
1520                 ret = this.getChild(1).toString(options)+"??";
1521             } else {
1522                 StringBuffer sb = new StringBuffer();
1523                 sb.append(((Token)this.children.elementAt(0)).toString(options));
1524                 for (int i = 1;  i < this.children.size();  i ++) {
1525                     sb.append((char)'|');
1526                     sb.append(((Token)this.children.elementAt(i)).toString(options));
1527                 }
1528                 ret = new String(sb);
1529             }
1530             return ret;
1531         }
1532     }
1533 }