View Javadoc
1   /*
2    * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  /*
27   * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
28   * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
29   *
30   * The original version of this source code and documentation
31   * is copyrighted and owned by Taligent, Inc., a wholly-owned
32   * subsidiary of IBM. These materials are provided under terms
33   * of a License Agreement between Taligent and Sun. This technology
34   * is protected by multiple US and International patents.
35   *
36   * This notice and attribution to Taligent may not be removed.
37   * Taligent is a registered trademark of Taligent, Inc.
38   */
39  
40  package build.tools.generatebreakiteratordata;
41  
42  import java.util.Arrays;
43  import java.util.Hashtable;
44  
45  /**
46   * An object representing a set of characters.  (This is a "set" in the
47   * mathematical sense: an unduplicated list of characters on which set
48   * operations such as union and intersection can be performed.)  The
49   * set information is stored in compressed, optimized form: The object
50   * contains an integer array with an even number of characters.  Each
51   * pair of characters represents a range of characters contained in the set
52   * (a pair of the same character represents a single character).  The
53   * characters are sorted in increasing order.
54   */
55  class CharSet {
56      /**
57       * The structure containing the set information.  The characters
58       * in this array are organized into pairs, each pair representing
59       * a range of characters contained in the set
60       */
61      private int[] chars;
62  
63      //==========================================================================
64      // parseString() and associated routines
65      //==========================================================================
66      /**
67       * A cache which is used to speed up parseString() whenever it is
68       * used to parse a description that has been parsed before
69       */
70      private static Hashtable<String, CharSet> expressionCache = null;
71  
72      /**
73       * Builds a CharSet based on a textual description.  For the syntax of
74       * the description, see the documentation of RuleBasedBreakIterator.
75       * @see java.text.RuleBasedBreakIterator
76       */
77      public static CharSet parseString(String s) {
78          CharSet result = null;
79  
80          // if "s" is in the expression cache, pull the result out
81          // of the expresison cache
82          if (expressionCache != null) {
83              result = expressionCache.get(s);
84          }
85  
86          // otherwise, use doParseString() to actually parse the string,
87          // and then add a corresponding entry to the expression cache
88          if (result == null) {
89              result = doParseString(s);
90              if (expressionCache == null) {
91                  expressionCache = new Hashtable<>();
92              }
93              expressionCache.put(s, result);
94          }
95          result = (CharSet)(result.clone());
96          return result;
97      }
98  
99      /**
100      * This function is used by parseString() to actually parse the string
101      */
102     private static CharSet doParseString(String s) {
103         CharSet result = new CharSet();
104         int p = 0;
105 
106         boolean haveDash = false;
107         boolean haveTilde = false;
108         boolean wIsReal = false;
109         int w = 0x0000;
110 
111         // for each character in the description...
112         while (p < s.length()) {
113             int c = s.codePointAt(p);
114 
115             // if it's an opening bracket...
116             if (c == '[') {
117                 // flush the single-character cache
118                 if (wIsReal) {
119                     result.internalUnion(new CharSet(w));
120                 }
121 
122                 // locate the matching closing bracket
123                 int bracketLevel = 1;
124                 int q = p + 1;
125                 while (bracketLevel != 0) {
126                     // if no matching bracket by end of string then...
127                     if (q >= s.length()) {
128                         throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
129                     }
130                     int ch = s.codePointAt(q);
131                     switch (ch) {
132                     case '\\': // need to step over next character
133                         ch = s.codePointAt(++q);
134                         break;
135                     case '[':
136                         ++bracketLevel;
137                         break;
138                     case ']':
139                         --bracketLevel;
140                         break;
141                     }
142                     q += Character.charCount(ch);
143                 }
144                 --q;
145 
146                 // call parseString() recursively to parse the text inside
147                 // the brackets, then either add or subtract the result from
148                 // our running result depending on whether or not the []
149                 // expresison was preceded by a ^
150                 if (!haveTilde) {
151                     result.internalUnion(CharSet.parseString(s.substring(p + 1, q)));
152                 }
153                 else {
154                     result.internalDifference(CharSet.parseString(s.substring(p + 1, q)));
155                 }
156                 haveTilde = false;
157                 haveDash = false;
158                 wIsReal = false;
159                 p = q + 1;
160             }
161 
162             // if the character is a colon...
163             else if (c == ':') {
164                 // flush the single-character cache
165                 if (wIsReal) {
166                     result.internalUnion(new CharSet(w));
167                 }
168 
169                 // locate the matching colon (and throw an error if there
170                 // isn't one)
171                 int q = s.indexOf(':', p + 1);
172                 if (q == -1) {
173                     throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
174                 }
175 
176                 // use charSetForCategory() to parse the text in the colons,
177                 // and either add or substract the result from our running
178                 // result depending on whether the :: expression was
179                 // preceded by a ^
180                 if (!haveTilde) {
181                     result.internalUnion(charSetForCategory(s.substring(p + 1, q)));
182                 }
183                 else {
184                     result.internalDifference(charSetForCategory(s.substring(p + 1, q)));
185                 }
186 
187                 // reset everything and advance to the next character
188                 haveTilde = false;
189                 haveDash = false;
190                 wIsReal = false;
191                 p = q + 1;
192             }
193 
194             // if the character is a dash, set an appropriate flag
195             else if (c == '-') {
196                 if (wIsReal) {
197                     haveDash = true;
198                 }
199                 ++p;
200             }
201 
202             // if the character is a caret, flush the single-character
203             // cache and set an appropriate flag.  If the set is empty
204             // (i.e., if the expression begins with ^), invert the set
205             // (i.e., set it to include everything).  The idea here is
206             // that a set that includes nothing but ^ expressions
207             // means "everything but these things".
208             else if (c == '^') {
209                 if (wIsReal) {
210                     result.internalUnion(new CharSet(w));
211                     wIsReal = false;
212                 }
213                 haveTilde = true;
214                 ++p;
215                 if (result.empty()) {
216                     result.internalComplement();
217                 }
218             }
219 
220             // throw an exception on an illegal character
221             else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c)
222                      && !Character.isDigit((char)c) && c != '\\') {
223                 throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
224             }
225 
226             // otherwise, we end up here...
227             else {
228                 // on a backslash, advance to the next character
229                 if (c == '\\') {
230                     ++p;
231                 }
232 
233                 // if the preceding character was a dash, this character
234                 // defines the end of a range.  Add or subtract that range
235                 // from the running result depending on whether or not it
236                 // was preceded by a ^
237                 if (haveDash) {
238                     if (s.codePointAt(p) < w) {
239                         throw new IllegalArgumentException("U+" +
240                             Integer.toHexString(s.codePointAt(p))
241                             + " is less than U+" + Integer.toHexString(w) + ".  Dash expressions "
242                             + "can't have their endpoints in reverse order.");
243                     }
244 
245                     int ch = s.codePointAt(p);
246                     if (!haveTilde) {
247                         result.internalUnion(new CharSet(w, ch));
248                     }
249                     else {
250                         result.internalDifference(new CharSet(w, ch));
251                     }
252                     p += Character.charCount(ch);
253                     haveDash = false;
254                     haveTilde = false;
255                     wIsReal = false;
256                 }
257 
258                 // if the preceding character was a caret, remove this character
259                 // from the running result
260                 else if (haveTilde) {
261                     w = s.codePointAt(p);
262                     result.internalDifference(new CharSet(w));
263                     p += Character.charCount(w);
264                     haveTilde = false;
265                     wIsReal = false;
266                 }
267 
268                 // otherwise, flush the single-character cache and then
269                 // put this character into the cache
270                 else if (wIsReal) {
271                     result.internalUnion(new CharSet(w));
272                     w = s.codePointAt(p);
273                     p += Character.charCount(w);
274                     wIsReal = true;
275                 } else {
276                     w = s.codePointAt(p);
277                     p += Character.charCount(w);
278                     wIsReal = true;
279                 }
280             }
281         }
282 
283         // finally, flush the single-character cache one last time
284         if (wIsReal) {
285             result.internalUnion(new CharSet(w));
286         }
287 
288         return result;
289     }
290 
291     /**
292      * Creates a CharSet containing all the characters in a particular
293      * Unicode category.  The text is either a two-character code from
294      * the Unicode database or a single character that begins one or more
295      * two-character codes.
296      */
297     private static CharSet charSetForCategory(String category) {
298         // throw an exception if we have anything other than one or two
299         // characters inside the colons
300         if (category.length() == 0 || category.length() >= 3) {
301             throw new IllegalArgumentException("Invalid character category: " + category);
302         }
303 
304         // if we have two characters, search the category map for that code
305         // and either construct and return a CharSet from the data in the
306         // category map or throw an exception
307         if (category.length() == 2) {
308             for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {
309                 if (CharacterCategory.categoryNames[i].equals(category)) {
310                     return new CharSet(CharacterCategory.getCategoryMap(i));
311                 }
312             }
313             throw new IllegalArgumentException("Invalid character category: " + category);
314         }
315 
316         // if we have one character, search the category map for codes beginning
317         // with that letter, and union together all of the matching sets that
318         // we find (or throw an exception if there are no matches)
319         else if (category.length() == 1) {
320             CharSet result = new CharSet();
321             for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {
322                 if (CharacterCategory.categoryNames[i].startsWith(category)) {
323                     result = result.union(new CharSet(CharacterCategory.getCategoryMap(i)));
324                 }
325             }
326             if (result.empty()) {
327                 throw new IllegalArgumentException("Invalid character category: " + category);
328             }
329             else {
330                 return result;
331             }
332         }
333         return new CharSet(); // should never get here, but to make the compiler happy...
334     }
335 
336     /**
337      * Returns a copy of CharSet's expression cache and sets CharSet's
338      * expression cache to empty.
339      */
340     public static Hashtable<String, CharSet> releaseExpressionCache() {
341         Hashtable<String, CharSet> result = expressionCache;
342         expressionCache = null;
343         return result;
344     }
345 
346     //==========================================================================
347     // CharSet manipulation
348     //==========================================================================
349     /**
350      * Creates an empty CharSet.
351      */
352     public CharSet() {
353         chars = new int[0];
354     }
355 
356     /**
357      * Creates a CharSet containing a single character.
358      * @param c The character to put into the CharSet
359      */
360     public CharSet(int c) {
361         chars = new int[2];
362         chars[0] = c;
363         chars[1] = c;
364     }
365 
366     /**
367      * Creates a CharSet containing a range of characters.
368      * @param lo The lowest-numbered character to include in the range
369      * @param hi The highest-numbered character to include in the range
370      */
371     public CharSet(int lo, int hi) {
372         chars = new int[2];
373         if (lo <= hi) {
374             chars[0] = lo;
375             chars[1] = hi;
376         }
377         else {
378             chars[0] = hi;
379             chars[1] = lo;
380         }
381     }
382 
383     /**
384      * Creates a CharSet, initializing it from the internal storage
385      * of another CharSet (this function performs no error checking
386      * on "chars", so if it's malformed, undefined behavior will result)
387      */
388     private CharSet(int[] chars) {
389         this.chars = chars;
390     }
391 
392     /**
393      * Returns a CharSet representing the union of two CharSets.
394      */
395     public CharSet union(CharSet that) {
396         return new CharSet(doUnion(that.chars));
397     }
398 
399     /**
400      * Adds the characters in "that" to this CharSet
401      */
402     private void internalUnion(CharSet that) {
403         chars = doUnion(that.chars);
404     }
405 
406     /**
407      * The actual implementation of the union functions
408      */
409     private int[] doUnion(int[] c2) {
410         int[] result = new int[chars.length+c2.length];
411 
412         int i = 0;
413         int j = 0;
414         int index = 0;
415 
416         // consider all the characters in both strings
417         while (i < chars.length && j < c2.length) {
418             int ub;
419 
420             // the first character in the result is the lower of the
421             // starting characters of the two strings, and "ub" gets
422             // set to the upper bound of that range
423             if (chars[i] < c2[j]) {
424                 result[index++] = chars[i];
425                 ub = chars[++i];
426             }
427             else {
428                 result[index++] = c2[j];
429                 ub = c2[++j];
430             }
431 
432             // for as long as one of our two pointers is pointing to a range's
433             // end point, or i is pointing to a character that is less than
434             // "ub" plus one (the "plus one" stitches touching ranges together)...
435             while (i % 2 == 1 ||
436                    j % 2 == 1 ||
437                    (i < chars.length && chars[i] <= ub + 1)) {
438 
439                 // advance i to the first character that is greater than
440                 // "ub" plus one
441                 while (i < chars.length && chars[i] <= ub + 1) {
442                     ++i;
443                 }
444 
445                 // if i points to the endpoint of a range, update "ub"
446                 // to that character, or if i points to the start of
447                 // a range and the endpoint of the preceding range is
448                 // greater than "ub", update "up" to _that_ character
449                 if (i % 2 == 1) {
450                     ub = chars[i];
451                 }
452                 else if (i > 0 && chars[i - 1] > ub) {
453                     ub = chars[i - 1];
454                 }
455 
456                 // now advance j to the first character that is greater
457                 // that "ub" plus one
458                 while (j < c2.length && c2[j] <= ub + 1) {
459                     ++j;
460                 }
461 
462                 // if j points to the endpoint of a range, update "ub"
463                 // to that character, or if j points to the start of
464                 // a range and the endpoint of the preceding range is
465                 // greater than "ub", update "up" to _that_ character
466                 if (j % 2 == 1) {
467                     ub = c2[j];
468                 }
469                 else if (j > 0 && c2[j - 1] > ub) {
470                     ub = c2[j - 1];
471                 }
472             }
473             // when we finally fall out of this loop, we will have stitched
474             // together a series of ranges that overlap or touch, i and j
475             // will both point to starting points of ranges, and "ub" will
476             // be the endpoint of the range we're working on.  Write "ub"
477             // to the result
478             result[index++] = ub;
479 
480         // loop back around to create the next range in the result
481         }
482 
483         // we fall out to here when we've exhausted all the characters in
484         // one of the operands.  We can append all of the remaining characters
485         // in the other operand without doing any extra work.
486         if (i < chars.length) {
487             for (int k = i; k < chars.length; k++) {
488                 result[index++] = chars[k];
489             }
490         }
491         if (j < c2.length) {
492             for (int k = j; k < c2.length; k++) {
493                 result[index++] = c2[k];
494             }
495         }
496 
497         if (result.length > index) {
498             int[] tmpbuf = new int[index];
499             System.arraycopy(result, 0, tmpbuf, 0, index);
500             return tmpbuf;
501         }
502 
503         return result;
504     }
505 
506     /**
507      * Returns the intersection of two CharSets.
508      */
509     public CharSet intersection(CharSet that) {
510         return new CharSet(doIntersection(that.chars));
511     }
512 
513     /**
514      * Removes from this CharSet any characters that aren't also in "that"
515      */
516     private void internalIntersection(CharSet that) {
517         chars = doIntersection(that.chars);
518     }
519 
520     /**
521      * The internal implementation of the two intersection functions
522      */
523     private int[] doIntersection(int[] c2) {
524         int[] result = new int[chars.length+c2.length];
525 
526         int i = 0;
527         int j = 0;
528         int oldI;
529         int oldJ;
530         int index = 0;
531 
532         // iterate until we've exhausted one of the operands
533         while (i < chars.length && j < c2.length) {
534 
535             // advance j until it points to a character that is larger than
536             // the one i points to.  If this is the beginning of a one-
537             // character range, advance j to point to the end
538             if (i < chars.length && i % 2 == 0) {
539                 while (j < c2.length && c2[j] < chars[i]) {
540                     ++j;
541                 }
542                 if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) {
543                     ++j;
544                 }
545             }
546 
547             // if j points to the endpoint of a range, save the current
548             // value of i, then advance i until it reaches a character
549             // which is larger than the character pointed at
550             // by j.  All of the characters we've advanced over (except
551             // the one currently pointed to by i) are added to the result
552             oldI = i;
553             while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) {
554                 ++i;
555             }
556             for (int k = oldI; k < i; k++) {
557                 result[index++] = chars[k];
558             }
559 
560             // if i points to the endpoint of a range, save the current
561             // value of j, then advance j until it reaches a character
562             // which is larger than the character pointed at
563             // by i.  All of the characters we've advanced over (except
564             // the one currently pointed to by i) are added to the result
565             oldJ = j;
566             while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) {
567                 ++j;
568             }
569             for (int k = oldJ; k < j; k++) {
570                 result[index++] = c2[k];
571             }
572 
573             // advance i until it points to a character larger than j
574             // If it points at the beginning of a one-character range,
575             // advance it to the end of that range
576             if (j < c2.length && j % 2 == 0) {
577                 while (i < chars.length && chars[i] < c2[j]) {
578                     ++i;
579                 }
580                 if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) {
581                     ++i;
582                 }
583             }
584         }
585 
586         if (result.length > index) {
587             int[] tmpbuf = new int[index];
588             System.arraycopy(result, 0, tmpbuf, 0, index);
589             return tmpbuf;
590         }
591 
592         return result;
593     }
594 
595     /**
596      * Returns a CharSet containing all the characters in "this" that
597      * aren't also in "that"
598      */
599     public CharSet difference(CharSet that) {
600         return new CharSet(doIntersection(that.doComplement()));
601     }
602 
603     /**
604      * Removes from "this" all the characters that are also in "that"
605      */
606     private void internalDifference(CharSet that) {
607         chars = doIntersection(that.doComplement());
608     }
609 
610     /**
611      * Returns a CharSet containing all the characters which are not
612      * in "this"
613      */
614     public CharSet complement() {
615         return new CharSet(doComplement());
616     }
617 
618     /**
619      * Complements "this".  All the characters it contains are removed,
620      * and all the characters it doesn't contain are added.
621      */
622     private void internalComplement() {
623         chars = doComplement();
624     }
625 
626     /**
627      * The internal implementation function for the complement routines
628      */
629     private int[] doComplement() {
630         // the complement of an empty CharSet is one containing everything
631         if (empty()) {
632             int[] result = new int[2];
633             result[0] = 0x0000;
634             result[1] = 0x10FFFF;
635             return result;
636         }
637 
638         int[] result = new int[chars.length+2];
639 
640         int i = 0;
641         int index = 0;
642 
643         // the result begins with \u0000 unless the original CharSet does
644         if (chars[0] != 0x0000) {
645             result[index++] = 0x0000;
646         }
647 
648         // walk through the characters in this CharSet.  Append a pair of
649         // characters the first of which is one less than the first
650         // character we see and the second of which is one plus the second
651         // character we see (don't write the first character if it's \u0000,
652         // and don't write the second character if it's \uffff.
653         while (i < chars.length) {
654             if (chars[i] != 0x0000) {
655                 result[index++] = chars[i] - 1;
656             }
657             if (chars[i + 1] != 0x10FFFF) {
658                 result[index++] = chars[i + 1] + 1;
659             }
660             i += 2;
661         }
662 
663         // add 0x10ffff to the end of the result, unless it was in
664         // the original set
665         if (chars[i-1] != 0x10FFFF) {
666             result[index++] = 0x10FFFF;
667         }
668 
669         if (result.length > index) {
670             int[] tmpbuf = new int[index];
671             System.arraycopy(result, 0, tmpbuf, 0, index);
672             return tmpbuf;
673         }
674 
675         return result;
676     }
677 
678     /**
679      * Returns true if this CharSet contains the specified character
680      * @param c The character we're testing for set membership
681      */
682     public boolean contains(int c) {
683         // search for the first range endpoint that is greater than or
684         // equal to c
685         int i = 1;
686         while (i < chars.length && chars[i] < c) {
687             i += 2;
688         }
689 
690         // if we've walked off the end, we don't contain c
691         if (i == chars.length) {
692             return false;
693         }
694 
695         // otherwise, we contain c if the beginning of the range is less
696         // than or equal to c
697         return chars[i - 1] <= c;
698     }
699 
700     /**
701      * Returns true if "that" is another instance of CharSet containing
702      * the exact same characters as this one
703      */
704     public boolean equals(Object that) {
705         return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars);
706     }
707 
708     /**
709      * Returns the hash code for this set of characters
710      */
711     public int hashCode() {
712        return Arrays.hashCode(chars);
713     }
714 
715     /**
716      * Creates a new CharSet that is equal to this one
717      */
718     public Object clone() {
719         return new CharSet(chars);
720     }
721 
722     /**
723      * Returns true if this CharSet contains no characters
724      */
725     public boolean empty() {
726         return chars.length == 0;
727     }
728 
729     /**
730      * Returns a textual representation of this CharSet.  If the result
731      * of calling this function is passed to CharSet.parseString(), it
732      * will produce another CharSet that is equal to this one.
733      */
734     public String toString() {
735         StringBuffer result = new StringBuffer();
736 
737         // the result begins with an opening bracket
738         result.append('[');
739 
740         // iterate through the ranges in the CharSet
741         for (int i = 0; i < chars.length; i += 2) {
742             // for a range with the same beginning and ending point,
743             // output that character
744             if (chars[i] == chars[i + 1]) {
745                 result.append("0x");
746                 result.append(Integer.toHexString(chars[i]));
747             }
748 
749             // otherwise, output the start and end points of the range
750             // separated by a dash
751             else {
752                 result.append("0x");
753                 result.append(Integer.toHexString(chars[i]));
754                 result.append("-0x");
755                 result.append(Integer.toHexString(chars[i + 1]));
756             }
757         }
758 
759         // the result ends with a closing bracket
760         result.append(']');
761         return result.toString();
762     }
763 
764     /**
765      * Returns an integer array representing the contents of this CharSet
766      * in the same form in which they're stored internally: as pairs
767      * of characters representing the start and end points of ranges
768      */
769     public int[] getRanges() {
770         return chars;
771     }
772 
773     /**
774      * Returns an Enumeration that will return the ranges of characters
775      * contained in this CharSet one at a time
776      */
777     public Enumeration getChars() {
778         return new Enumeration(this);
779     }
780 
781     //==========================================================================
782     // CharSet.Enumeration
783     //==========================================================================
784 
785     /**
786      * An Enumeration that can be used to extract the character ranges
787      * from a CharSet one at a time
788      */
789     public class Enumeration implements java.util.Enumeration<int[]> {
790         /**
791          * Initializes a CharSet.Enumeration
792          */
793         Enumeration(CharSet cs) {
794             this.chars = cs.chars;
795             p = 0;
796         }
797 
798         /**
799          * Returns true if the enumeration hasn't yet returned
800          * all the ranges in the CharSet
801          */
802         public boolean hasMoreElements() {
803             return p < chars.length;
804         }
805 
806         /**
807          * Returns the next range in the CarSet
808          */
809         public int[] nextElement() {
810             int[] result = new int[2];
811             result[0] = chars[p++];
812             result[1] = chars[p++];
813             return result;
814         }
815 
816         int p;
817         int[] chars;
818     }
819 }