View Javadoc
1   /*
2    * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  package build.tools.generatecharacter;
27  
28  import java.io.IOException;
29  import java.io.FileNotFoundException;
30  import java.io.BufferedReader;
31  import java.io.FileReader;
32  import java.io.PrintWriter;
33  import java.io.BufferedWriter;
34  import java.io.FileWriter;
35  import java.io.File;
36  import java.util.List;
37  
38  import build.tools.generatecharacter.CharacterName;
39  
40  /**
41   * This program generates the source code for the class java.lang.Character.
42   * It also generates native C code that can perform the same operations.
43   * It requires two external input data files:
44   * <ul>
45   * <li> Unicode specification file
46   * <li> Character class template file
47   * </ul>
48   * The Unicode specification file is available from the Unicode consortium.
49   * It has character specification lines that look like this:
50   * <listing>
51   * 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
52   * </listing>
53   * The Character class template file is filled in with additional
54   * information to produce the file Character.java, which can then be
55   * compiled by a Java compiler.  The template file contains certain
56   * markers consisting of an alphabetic name string preceded by "$$".
57   * Such markers are replaced with generated program text.  As a special
58   * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
59   * alphabetic characters constituting a variable name.  The character "_"
60   * is considered alphabetic for these purposes.
61   *
62   * @author  Guy Steele
63   * @author  Alan Liu
64   * @author  John O'Conner
65   */
66  
67  public class GenerateCharacter {
68  
69      final static boolean DEBUG = false;
70  
71      final static String commandMarker = "$$";
72      static String ROOT                        = "";
73      static String DefaultUnicodeSpecFileName  = ROOT + "UnicodeData.txt";
74      static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
75      static String DefaultPropListFileName     = ROOT + "PropList.txt";
76      static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
77      static String DefaultJavaOutputFileName   = ROOT + "Character.java";
78      static String DefaultCTemplateFileName    = ROOT + "Character.c.template";
79      static String DefaultCOutputFileName      = ROOT + "Character.c";
80  
81      static int plane = 0;
82  
83      /* The overall idea is that, in the generated Character class source code,
84      most character property data is stored in a special multi-level table whose
85      structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
86      The integers must sum to 16 (the number of bits in a character).
87      The first table is indexed by the k1 high-order bits of the character code.
88      The result is concatenated to the next k2 bits of the character code to index
89      the second table, and so on.  Eventually the kn low-order bits of the character
90      code are concatenated and used to index one of two tables A and B; A contains
91      32-bit integer entries and B contains 16-bit short entries.  The 48 bits that
92      can be thus obtained encode the properties for the character.
93  
94      The default specification is [9, 4, 3, 0].  This particular table format was
95      designed by conducting an exhaustive search of table formats to minimize the
96      space consumed by the tables: the first and third tables need have only byte
97      values (the second table must have short values).  Another good choice is
98      [10, 6, 0], which produces a larger table but allows particularly fast table
99      lookup code.
100 
101     In each case, where the word "concatenated" is used, this may imply
102     first a << and then a | operation, or perhaps just a | operation if
103     the values in the table can be preshifted (generally possible if the table
104     entries are short rather than byte).
105     */
106 
    /* The character properties are currently encoded in two parts:
       A (32 bits) and B (16 bits).
109 
110     A: the low 32 bits are defined  in the following manner:
111 
112     1 bit Mirrored property.
113     4 bits      Bidirectional category (see below) (unused if -nobidi switch specified)
114     9 bits      A signed offset used for converting case .
115     1 bit       If 1, adding the signed offset converts the character to lowercase.
116     1 bit       If 1, subtracting the signed offset converts the character to uppercase.
117         Note: for a titlecase character, both of the preceding bits will be 1
118         and the signed offset will be 1.
119     1 bit   If 1, this character has a titlecase equivalent (possibly itself);
120         in this case, the two bits before this bit can be used to decide
121         whether this character is in fact uppercase, lowercase, or titlecase.
122     3 bits      This field provides a quick way to lex identifiers.
123         The eight possible values for this field are as follows:
124         0  May not be part of an identifier
125         1  Ignorable control; may continue a Unicode identifier or Java identifier
126         2  May continue a Java identifier but not a Unicode identifier (unused)
127         3  May continue a Unicode identifier or Java identifier
128         4  Is a Java whitespace character
129         5  May start or continue a Java identifier;
130            may continue but not start a Unicode identifier
131            (this value is used for connector punctuation such as _)
132         6  May start or continue a Java identifier;
133            may not occur in a Unicode identifier
134            (this value is used for currency symbols such as $)
135         7  May start or continue a Unicode identifier or Java identifier
136         Thus:
137            5, 6, 7 may start a Java identifier
138            1, 2, 3, 5, 6, 7 may continue a Java identifier
139            7 may start a Unicode identifier
140            1, 3, 5, 7 may continue a Unicode identifier
141            1 is ignorable within an identifier
142            4 is Java whitespace
143     2 bits      This field indicates whether the character has a numeric property.
144         The four possible values for this field are as follows:
145         0  This character has no numeric property.
146         1  Adding the digit offset to the character code and then
147            masking with 0x1F will produce the desired numeric value.
148         2  This character has a "strange" numeric value.
149         3  A Java supradecimal digit: adding the digit offset to the
150            character code, then masking with 0x1F, then adding 10
151            will produce the desired numeric value.
152     5 bits  The digit offset (see description of previous field)
153     5 bits      Character type (see below)
154 
155     B: the high 16 bits are defined as:
156     1 bit Other_Lowercase property
157     1 bit Other_Uppercase property
158     1 bit Other_Alphabetic property
159     1 bit Other_Math property
160     1 bit Ideographic property
161     1 bit Noncharacter codepoint property
162     */
163 
164 
165     // bit masks identify each component of a 32-bit property field described
166     // above.
167     // shift* indicates how many shifts right must happen to get the
168     // indicated property value in the lowest bits of the 32-bit space.
169     private static final int
170         shiftType           = 0,        maskType            =       0x001F,
171         shiftDigitOffset    = 5,        maskDigitOffset     =       0x03E0,
172         shiftNumericType    = 10,       maskNumericType     =       0x0C00,
173         shiftIdentifierInfo = 12,       maskIdentifierInfo  =       0x7000,
174                                         maskUnicodePart     =       0x1000,
175         shiftCaseInfo       = 15,       maskCaseInfo        =      0x38000,
176                                         maskLowerCase       =      0x20000,
177                                         maskUpperCase       =      0x10000,
178                                         maskTitleCase       =      0x08000,
179         shiftCaseOffset     = 18,       maskCaseOffset      =   0x07FC0000,
180         shiftCaseOffsetSign = 5,
181                                         // used only when calculating and
182                                         // storing digit offsets from char values
183                                         maskDigit               =   0x001F,
184                                         // case offset are 9 bits
185                                         maskCase                =   0x01FF,
186         shiftBidi           = 27,       maskBidi              = 0x78000000,
187         shiftMirrored       = 31,       //maskMirrored          = 0x80000000,
188         shiftPlane          = 16,       maskPlane = 0xFF0000;
189 
190     // maskMirrored needs to be long, if up 16-bit
191     private static final long maskMirrored          = 0x80000000L;
192 
193     // bit masks identify the 16-bit priperty field described above, in B
194     // table
195     private static final long
196         maskOtherLowercase  = 0x100000000L,
197         maskOtherUppercase  = 0x200000000L,
198         maskOtherAlphabetic = 0x400000000L,
199         maskOtherMath       = 0x800000000L,
200         maskIdeographic     = 0x1000000000L,
201         maskNoncharacterCP  = 0x2000000000L;
202 
203     // Can compare masked values with these to determine
204     // numeric or lexical types.
205     public static int
206         valueNotNumeric             = 0x0000,
207         valueDigit                  = 0x0400,
208         valueStrangeNumeric         = 0x0800,
209         valueJavaSupradecimal       = 0x0C00,
210         valueIgnorable              = 0x1000,
211         valueJavaOnlyPart           = 0x2000,
212         valueJavaUnicodePart        = 0x3000,
213         valueJavaWhitespace         = 0x4000,
214         valueJavaStartUnicodePart   = 0x5000,
215         valueJavaOnlyStart          = 0x6000,
216         valueJavaUnicodeStart       = 0x7000,
217         lowJavaStart                = 0x5000,
218         nonzeroJavaPart             = 0x3000,
219         valueUnicodeStart           = 0x7000;
220 
221     // these values are used when only identifier properties are generated
222     // for use in verifier code. Shortens the property down to a single byte.
223     private static final int
224         bitJavaStart            = 0x02,
225         bitJavaPart             = 0x01,
226         maskIsJavaIdentifierPart = bitJavaPart,
227         maskIsJavaIdentifierStart = bitJavaStart;
228 
229     static int maxOffset = maskCase/2 ;
230     static int minOffset = -maxOffset;
231 
    /* The following routines provide simple, concise formatting of long integer values.
     The number in the name of the method indicates the desired number of characters
     to be produced.  If the number of digits required to represent the integer value
     is less than that number, then the output is padded on the left with zeros
     (for hex) or with spaces (for decimal).  hex2, hex4 and hex8 first mask the value
     to its low 8, 16 and 32 bits respectively, so they always produce exactly the
     stated number of digits.  For hex16, dec3 and dec5, if the number of digits required
     is greater than the desired number, then all the digits that are required are
     actually produced.  hex (no number in the name) applies no padding at all.
    */
240 
241     static String hex(long n) { return Long.toHexString(n).toUpperCase(); }
242 
243     static String hex2(long n) {
244         String q = Long.toHexString(n & 0xFF).toUpperCase();
245         return "00".substring(Math.min(2, q.length())) + q;
246     }
247 
248     static String hex4(long n) {
249         String q = Long.toHexString(n & 0xFFFF).toUpperCase();
250         return "0000".substring(Math.min(4, q.length())) + q;
251     }
252 
253     static String hex8(long n) {
254         String q = Long.toHexString(n & 0xFFFFFFFFL).toUpperCase();
255         return "00000000".substring(Math.min(8, q.length())) + q;
256     }
257 
258     static String hex16(long n) {
259         String q = Long.toHexString(n).toUpperCase();
260         return "0000000000000000".substring(Math.min(16, q.length())) + q;
261     }
262 
263     static String dec3(long n) {
264         String q = Long.toString(n);
265         return "   ".substring(Math.min(3, q.length())) + q;
266     }
267 
268     static String dec5(long n) {
269         String q = Long.toString(n);
270         return "     ".substring(Math.min(5, q.length())) + q;
271     }
272 
273     /* This routine is called when some failure occurs. */
274 
275     static void FAIL(String s) {
276         System.out.println("** " + s);
277     }
278 
279     /**
280     * Given the data from the Unicode specification file, this routine builds a map.
281     *
282     * The specification file is assumed to contain its data in sorted order by
283     * character code; as a result, the array passed as an argument to this method
284     * has its components in the same sorted order, with one entry for each defined
285     * Unicode character or character range.  (A range is indicated by two consecutive
286     * entries, such that the name of the first entry begins with "<" and ends with
287     * "First>" and the second entry begins with "<" and ends with "Last>".)  This is
288     * therefore a sparse representation of the character property data.
289     *
290     * The resulting map is dense representation of the character data.  It contains
291     * 2^16 = 65536 entries, each of which is a long integer.  (Right now only 32 bits
292     * of this long value are used, but type long is used rather than int to facilitate
293     * future extensions of this source code generator that might require more than
294     * 32 bits to encode relevant character properties.)  Entry k holds the encoded
295     * properties for character k.
296     *
297     * Method buildMap manages the transformation from the sparse representation to
298     * the dense representation.  It calls method buildOne to handle the encoding
299     * of character property data from a single UnicodeSpec object into 32 bits.
300     * For undefined characters, method buildOne is not called and the map entry for
301     * that character is set to UnicodeSpec.UNASSIGNED.
302     *
303     * @param data       character property data from the Unicode specification file
304     * @return   an array of length 65536 with one entry for every possible char value
305     *
306     * @see GenerateCharacter#buildOne
307     */
308 
309     static long[] buildMap(UnicodeSpec[] data, SpecialCaseMap[] specialMaps, PropList propList)
310     {
311         long[] result;
312         if (bLatin1 == true) {
313             result = new long[256];
314         } else {
315             result = new long[1<<16];
316         }
317         int k=0;
318         int codePoint = plane<<16;
319         UnicodeSpec nonCharSpec = new UnicodeSpec();
320         for (int j = 0; j < data.length && k < result.length; j++) {
321             if (data[j].codePoint == codePoint) {
322                 result[k] = buildOne(codePoint, data[j], specialMaps);
323                 ++k;
324                 ++codePoint;
325             }
326             else if(data[j].codePoint > codePoint) {
327                 if (data[j].name.endsWith("Last>")) {
328                     // build map data for all chars except last in range
329                     while (codePoint < data[j].codePoint && k < result.length) {
330                         result[k] = buildOne(codePoint, data[j], specialMaps);
331                         ++k;
332                         ++codePoint;
333                     }
334                 }
335                 else {
336                     // we have a few unassigned chars before data[j].codePoint
337                     while (codePoint < data[j].codePoint && k < result.length) {
338                         result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
339                         ++k;
340                         ++codePoint;
341                     }
342                 }
343                 k = data[j].codePoint & 0xFFFF;
344                 codePoint = data[j].codePoint;
345                 result[k] = buildOne(codePoint, data[j], specialMaps);
346                 ++k;
347                 ++codePoint;
348             }
349             else {
350                 System.out.println("An error has occured during spec mapping.");
351                 System.exit(0);
352             }
353         }
354         // if there are still unprocessed chars, process them
355         // as unassigned/undefined.
356         codePoint = (plane<<16) | k;
357         while (k < result.length) {
358             result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
359             ++k;
360             ++codePoint;
361         }
362         // now add all extra supported properties from PropList, to the
363         // upper 16-bit
364         addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
365         addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
366         addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
367         addExProp(result, propList, "Ideographic", maskIdeographic);
368         //addExProp(result, propList, "Other_Math", maskOtherMath);
369         //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
370 
371         return result;
372     }
373 
374     // The maximum and minimum offsets found while scanning the database
375     static int maxOffsetSeen = 0;
376     static int minOffsetSeen = 0;
377 
378     /**
379      * Some Unicode separator characters are not considered Java whitespace.
380      * @param c character to test
381      * @return true if c in an invalid Java whitespace character, false otherwise.
382      */
383     static boolean isInvalidJavaWhiteSpace(int c) {
384         int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
385         boolean retValue = false;
386         for(int x=0;x<exceptions.length;x++) {
387             if(c == exceptions[x]) {
388                 retValue = true;
389                 break;
390             }
391         }
392         return retValue;
393 
394     }
395 
396     /**
397     * Given the character property data for one Unicode character, encode the data
398     * of interest into a single long integer value.  (Right now only 32 bits
399     * of this long value are used, but type long is used rather than int to facilitate
400     * future extensions of this source code generator that might require more than
401     * 32 bits to encode relevant character properties.)
402     *
403     * @param c   the character code for which to encode property data
404     * @param us  property data record from the Unicode specification file
405     *            (its character code might not be equal to c if it specifies data
406     *            for a range of characters)
407     * @return   an encoded long value that contains the properties for a single char
408     *
409     * @see GenerateCharacter#buildMap
410     */
411 
412     static long buildOne(int c, UnicodeSpec us, SpecialCaseMap[] specialMaps) {
413         long resultA = 0;
414         // record the general category
415         resultA |= us.generalCategory;
416 
417         // record the numeric properties
418         NUMERIC: {
419         STRANGE: {
420             int val = 0;
421             // c is A-Z
422             if ((c >= 0x0041) && (c <= 0x005A)) {
423                 val = c - 0x0041;
424                 resultA |= valueJavaSupradecimal;
425             // c is a-z
426             } else if ((c >= 0x0061) && (c <= 0x007A)) {
427                 val = c - 0x0061;
428                 resultA |= valueJavaSupradecimal;
429             // c is a full-width A-Z
430             } else if ((c >= 0xFF21) && (c <= 0xFF3A)) {
431                 val = c - 0xFF21;
432                 resultA |= valueJavaSupradecimal;
433             // c is a full-width a-z
434             } else if ((c >= 0xFF41) && (c <= 0xFF5A)) {
435                 val = c - 0xFF41;
436                 resultA |= valueJavaSupradecimal;
437             } else if (us.isDecimalValue()) {
438                 val = us.decimalValue;
439                 resultA |= valueDigit;
440             } else if (us.isDigitValue()) {
441                 val = us.digitValue;
442                 resultA |= valueDigit;
443             } else {
444                 if (us.numericValue.length() == 0) {
445                     break NUMERIC;                      // no numeric value at all
446                 } else {
447                     try {
448                         val = Integer.parseInt(us.numericValue);
449                         if (val >= 32 || val < 0) break STRANGE;
450                         if (c == 0x215F) break STRANGE;
451                     } catch(NumberFormatException e) {
452                         break STRANGE;
453                     }
454                     resultA |= valueDigit;
455                 }
456             }
457             if (val >= 32 || val < 0) break STRANGE;
458             resultA |= ((val - c & maskDigit) << shiftDigitOffset);
459             break NUMERIC;
460         } // end STRANGE
461         resultA |= valueStrangeNumeric;
462         } // end NUMERIC
463 
464         // record case mapping
465         int offset = 0;
466         // might have a 1:M mapping
467         int specialMap = SpecialCaseMap.find(c, specialCaseMaps);
468         boolean bHasUpper = (us.hasUpperMap()) || (specialMap != -1);
469         if (bHasUpper) {
470             resultA |= maskUpperCase;
471         }
472         if (specialMap != -1) {
473             // has mapping, but cannot record the
474             // proper offset; can only flag it and provide special case
475             // code in Character.java
476             offset = -1;
477         }
478         else if (us.hasUpperMap())  {
479             offset = c - us.upperMap;
480         }
481 
482         if (us.hasLowerMap()) {
483             resultA |= maskLowerCase;
484             if (offset == 0)
485                 offset = us.lowerMap - c;
486             else if (offset != (us.lowerMap - c)) {
487                 if (DEBUG) {
488                 FAIL("Character " + hex(c) +
489                 " has incompatible lowercase and uppercase mappings");
490                 }
491             }
492         }
493         if ((us.hasTitleMap() && us.titleMap != us.upperMap) ||
494             (bHasUpper && us.hasLowerMap())) {
495             resultA |= maskTitleCase;
496         }
497         if (bHasUpper && !us.hasLowerMap() && !us.hasTitleMap() && verbose) {
498             System.out.println("Warning: Character " + hex4(c) + " has upper but " +
499                                "no title case; Java won't know this");
500         }
501         if (offset < minOffsetSeen) minOffsetSeen = offset;
502         if (offset > maxOffsetSeen) maxOffsetSeen = offset;
503         if (offset > maxOffset || offset < minOffset) {
504             if (DEBUG) {
505             FAIL("Case offset " + offset + " for character " + hex4(c) + " must be handled as a special case");
506             }
507             offset = maskCase;
508         }
509         resultA |= ((offset & maskCase) << shiftCaseOffset);
510 
511         // record lexical info about this character
512         if (us.generalCategory == UnicodeSpec.LOWERCASE_LETTER
513                 || us.generalCategory == UnicodeSpec.UPPERCASE_LETTER
514                 || us.generalCategory == UnicodeSpec.TITLECASE_LETTER
515                 || us.generalCategory == UnicodeSpec.MODIFIER_LETTER
516                 || us.generalCategory == UnicodeSpec.OTHER_LETTER
517                 || us.generalCategory == UnicodeSpec.LETTER_NUMBER) {
518             resultA |= valueJavaUnicodeStart;
519         }
520         else if (us.generalCategory == UnicodeSpec.COMBINING_SPACING_MARK
521                 || us.generalCategory == UnicodeSpec.NON_SPACING_MARK
522                 || us.generalCategory == UnicodeSpec.DECIMAL_DIGIT_NUMBER) {
523             resultA |= valueJavaUnicodePart;
524         }
525         else if (us.generalCategory == UnicodeSpec.CONNECTOR_PUNCTUATION) {
526             resultA |= valueJavaStartUnicodePart;
527         }
528         else if (us.generalCategory == UnicodeSpec.CURRENCY_SYMBOL) {
529             resultA |= valueJavaOnlyStart;
530         }
531         else if (((c >= 0x0000) && (c <= 0x0008))
532                 || ((c >= 0x000E) && (c <= 0x001B))
533                 || ((c >= 0x007F) && (c <= 0x009F))
534                 || us.generalCategory == UnicodeSpec.FORMAT) {
535             resultA |= valueIgnorable;
536         }
537         else if (us.generalCategory == UnicodeSpec.SPACE_SEPARATOR
538                 || us.generalCategory == UnicodeSpec.LINE_SEPARATOR
539                 || us.generalCategory == UnicodeSpec.PARAGRAPH_SEPARATOR) {
540             if (!isInvalidJavaWhiteSpace(c)) resultA |= valueJavaWhitespace;
541         }
542         else if (((c >= 0x0009) && (c <= 0x000D))
543                 || ((c >= 0x001C) && (c <= 0x001F))) {
544             resultA |= valueJavaWhitespace;
545         }
546 
547         // record bidi category
548         if (!nobidi) {
549             int tmpBidi =
550                 (us.bidiCategory > UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS ||
551                     us.bidiCategory == -1) ? maskBidi : (us.bidiCategory << shiftBidi);
552             resultA |= tmpBidi;
553         }
554 
555         // record mirrored property
556         if (!nomirror) {
557             resultA |= us.mirrored ? maskMirrored : 0;
558         }
559 
560         if (identifiers) {
561             long replacement = 0;
562             if ((resultA & maskIdentifierInfo) >= lowJavaStart) {
563                 replacement |= bitJavaStart;
564             }
565             if ( ((resultA & nonzeroJavaPart) != 0)
566                     && ((resultA & maskIdentifierInfo) != valueIgnorable)) {
567                 replacement |= bitJavaPart;
568             }
569             resultA = replacement;
570         }
571         return resultA;
572     }
573 
574     static void addExProp(long[] map, PropList propList, String prop, long mask) {
575         List<Integer> cps = propList.codepoints(prop);
576         if (cps != null) {
577             for (Integer cp : cps) {
578                 if (cp < map.length)
579                     map[cp] |= mask;
580             }
581         }
582     }
583 
584     /**
585     * This is the heart of the table compression strategy.  The inputs are a map
586     * and a number of bits (size).  The map is simply an array of long integer values;
587     * the number of bits indicates how index values for that map are to be split.
588     * The length of the given map must be a multiple of (1 << size).  The result is
589     * a new map z and a compressed table t such that for every valid index value k
590     * for the original map, t[(z[k>>size]<<size)|(k & ((1<<size)-1))] == map[k].
591     *
592     * In other words, the index k can be split into two parts, namely the "size"
593     * low-order bits and all the remaining high-order bits; the high-order bits are then
594     * remapped by map z to produce an index into table t.  In effect, the data of the
595     * original map m is broken up into blocks of size (1<<size); the compression relies
596     * on the expectation that many of these blocks will be identical and therefore need
597     * be represented only once in the compressed table t.
598     *
599     * This method is intended to be used iteratively.  The first map to be handed
600     * to it is the one constructed by method buildMap.  After that, the first of the
601     * two arrays returned by this method is fed back into it for further compression.
602     * At the end of the iteration, one has a starter map and a sequence of tables.
603     *
604     * The algorithm used to implement this computation is straightforward and not
605     * especially clever.  It uses brute-force linear search (the loop labeled MIDDLE)
606     * to locate identical blocks, so overall the time complexity of the algorithm
607     * is quadratic in the length of the input map.  Fortunately, speed is not crucial
608     * to this application.
609     *
610     * @param map                a map to be compressed
611     * @param size       the number of index bits to be split off by the compression
612     * @return   an array of length 2 containing two arrays; the first is a new map
613     *           and the second is a compressed data table
614     *
615     * @see GenerateCharacter#buildMap
616     */
617 
618     static long[][] buildTable(long[] map, int size) {
619         int n = map.length;
620         if (((n >> size) << size) != n) {
621             FAIL("Length " + n + " is not a multiple of " + (1 << size));
622         }
623         int m = 1 << size;
624         // We know the final length of the new map up front.
625         long[] newmap = new long[n >> size];
626         // The buffer is used temporarily to hold data for the compressed table
627         // because we don't know its final length yet.
628         long[] buffer = new long[n];
629         int ptr = 0;
630 OUTER:  for (int i = 0; i < n; i += m) {
631             // For every block of size m in the original map...
632     MIDDLE: for (int j = 0; j < ptr; j += m) {
633             // Find out whether there is already a block just like it in the buffer.
634                 for (int k = 0; k < m; k++) {
635                     if (buffer[j+k] != map[i+k])
636                         continue MIDDLE;
637                 }
638                 // There is a block just like it at position j, so just
639                 // put its index into the new map (thereby sharing it).
640                 newmap[i >> size] = (j >> size);
641                 continue OUTER;
642             } // end MIDDLE
643             // There is no block just like it already, so add it to
644             // the buffer and put its index into the new map.
645             for (int k = 0; k < m; k++) {
646                 buffer[ptr+k] = map[i+k];
647             }
648             newmap[i >> size] = (ptr >> size);
649             ptr += m;
650         } // end OUTER
651         // Now we know how long the compressed table should be,
652         // so create a new array and copy data from the temporary buffer.
653         long[] newdata = new long[ptr];
654         for (int j = 0; j < ptr; j++) {
655             newdata[j] = buffer[j];
656         }
657         // Return the new map and the new data table.
658         long[][] result = { newmap, newdata };
659         return result;
660     }
661 
662     /**
663     * Once the compressed tables have been computed, this method reads in a
664     * template file for the source code to be generated and writes out the final
665     * source code by acting as a sort of specialized macro processor.
666     *
667     * The first output line is a comment saying that the file was automatically
668     * generated; it includes a timestamp.  All other output is generated by
669     * reading a line from the template file, performing macro replacements,
670     * and then writing the resulting line or lines of code to the output file.
671     *
672     * This method handles the I/O, the timestamp comment, and the locating of
673     * macro calls within each input line.  The method replaceCommand is called
674     * to generate replacement text for each macro call.
675     *
676     * Macro calls to be replaced are indicated in the template file by
677     * occurrences of the commandMarker "$$".  The rest of the call may consist
678     * of Java letters (including the underscore "_") and also of balanced
679     * parentheses.
680     *
681     * @param theTemplateFileName
682     *           the file name for the template input file
683     * @param theOutputFileName
684     *           the file name for the source code output file
685     *
686     *     @see GenerateCharacter#replaceCommand
687     */
688 
689     static void generateCharacterClass(String theTemplateFileName,
690                                        String theOutputFileName)
691         throws FileNotFoundException, IOException {
692         BufferedReader in = new BufferedReader(new FileReader(theTemplateFileName));
693         PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(theOutputFileName)));
694         out.println(commentStart +
695             " This file was generated AUTOMATICALLY from a template file " +
696             new java.util.Date() + commentEnd);
697         int marklen = commandMarker.length();
698         LOOP: while(true) {
699             try {
700                 String line = in.readLine();
701                 if (line == null) break LOOP;
702                 int pos = 0;
703                 int depth = 0;
704                 while ((pos = line.indexOf(commandMarker, pos)) >= 0) {
705                     int newpos = pos + marklen;
706                     char ch = 'x';
707                     SCAN: while (newpos < line.length() &&
708                             (Character.isJavaIdentifierStart(ch = line.charAt(newpos))
709                             || ch == '(' || (ch == ')' && depth > 0))) {
710                         ++newpos;
711                         if (ch == '(') {
712                             ++depth;
713                         }
714                         else if (ch == ')') {
715                             --depth;
716                             if (depth == 0)
717                                 break SCAN;
718                         }
719                     }
720                     String replacement = replaceCommand(line.substring(pos + marklen, newpos));
721                     line = line.substring(0, pos) + replacement + line.substring(newpos);
722                     pos += replacement.length();
723                 }
724                 out.println(line);
725             }
726             catch (IOException e) {
727                 break LOOP;
728             }
729         }
730         in.close();
731         out.close();
732     }
733 
734     /**
735     * The replaceCommand method takes a command (a macro call without the
736     * leading marker "$$") and computes replacement text for it.
737     *
738     * Most of the commands are simply names of integer constants that are defined
739     * in the source code of this GenerateCharacter class.  The replacement text is
740     * simply the value of the constant as an appropriately formatted integer literal.
741     *
742     * A few cases are more complicated, however.  The command "Tables" causes the
743     * final map and compressed tables to be emitted, with elaborate comments
744     * describing their contents (this is handled by method genTables), and the
745     * command "Initializers" emits the accumulated initializer code (method
746     * genInitializers).  The commands "Lookup(xxx)" and "LookupEx(xxx)", where "xxx"
747     * is the name of a variable, generate an expression that looks up character
748     * property data for the character whose code is "xxx" (method genAccess).
749     *
750     * @param x  a command from the template file to be replaced
751     * @return   the replacement text, as a String
752     *
753     * @see GenerateCharacter#genTables
754     * @see GenerateCharacter#genAccess
755     * @see GenerateCharacter#generateCharacterClass
756     */
757 
758     static String replaceCommand(String x) {
759         if (x.equals("Tables")) return genTables();
760         if (x.equals("Initializers")) return genInitializers();
761         if (x.length() >= 9 && x.substring(0, 7).equals("Lookup(") &&
762                 x.substring(x.length()-1).equals(")") )
763             return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
764         if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
765                 x.substring(x.length()-1).equals(")") )
766             return genAccess("B", x.substring(9, x.length()-1), 16);
767         if (x.equals("shiftType")) return Long.toString(shiftType);
768         if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
769         if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
770         if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
771         if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
772         if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
773         if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
774         if (x.equals("maskCase")) return "0x" + hex8(maskCase);
775         if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
776         if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
777         if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
778         if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
779         if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
780         if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
781         if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
782         if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
783         if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
784         if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
785         if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
786         if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
787         if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
788         if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
789         if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
790         if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
791         if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
792         if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
793         if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
794         if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
795         if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
796         if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
797         if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
798         if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
799         if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
800         if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
801         if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
802         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
803         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
804         if (x.equals("valueJavaSupradecimal")) return "0x" + hex8(valueJavaSupradecimal);
805         if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
806         if (x.equals("valueStrangeNumeric")) return "0x" + hex8(valueStrangeNumeric);
807         if (x.equals("maskType")) return "0x" + hex(maskType);
808         if (x.equals("shiftBidi")) return Long.toString(shiftBidi);
809         if (x.equals("maskBidi")) return "0x" + hex(maskBidi);
810         if (x.equals("maskMirrored")) return "0x" + hex8(maskMirrored);
811         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UNASSIGNED][UnicodeSpec.LONG]))
812             return Integer.toString(UnicodeSpec.UNASSIGNED);
813         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.UPPERCASE_LETTER][UnicodeSpec.LONG]))
814             return Integer.toString(UnicodeSpec.UPPERCASE_LETTER);
815         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LOWERCASE_LETTER][UnicodeSpec.LONG]))
816             return Integer.toString(UnicodeSpec.LOWERCASE_LETTER);
817         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.TITLECASE_LETTER][UnicodeSpec.LONG]))
818             return Integer.toString(UnicodeSpec.TITLECASE_LETTER);
819         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_LETTER][UnicodeSpec.LONG]))
820              return Integer.toString(UnicodeSpec.MODIFIER_LETTER);
821         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_LETTER][UnicodeSpec.LONG]))
822              return Integer.toString(UnicodeSpec.OTHER_LETTER);
823         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.NON_SPACING_MARK][UnicodeSpec.LONG]))
824              return Integer.toString(UnicodeSpec.NON_SPACING_MARK);
825         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.ENCLOSING_MARK][UnicodeSpec.LONG]))
826              return Integer.toString(UnicodeSpec.ENCLOSING_MARK);
827         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.COMBINING_SPACING_MARK][UnicodeSpec.LONG]))
828              return Integer.toString(UnicodeSpec.COMBINING_SPACING_MARK);
829         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DECIMAL_DIGIT_NUMBER][UnicodeSpec.LONG]))
830              return Integer.toString(UnicodeSpec.DECIMAL_DIGIT_NUMBER);
831         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_NUMBER][UnicodeSpec.LONG]))
832              return Integer.toString(UnicodeSpec.OTHER_NUMBER);
833         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SPACE_SEPARATOR][UnicodeSpec.LONG]))
834              return Integer.toString(UnicodeSpec.SPACE_SEPARATOR);
835         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LINE_SEPARATOR][UnicodeSpec.LONG]))
836              return Integer.toString(UnicodeSpec.LINE_SEPARATOR);
837         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
838              return Integer.toString(UnicodeSpec.PARAGRAPH_SEPARATOR);
839         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONTROL][UnicodeSpec.LONG]))
840             return Integer.toString(UnicodeSpec.CONTROL);
841         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FORMAT][UnicodeSpec.LONG]))
842             return Integer.toString(UnicodeSpec.FORMAT);
843         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.PRIVATE_USE][UnicodeSpec.LONG]))
844             return Integer.toString(UnicodeSpec.PRIVATE_USE);
845         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.SURROGATE][UnicodeSpec.LONG]))
846             return Integer.toString(UnicodeSpec.SURROGATE);
847         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.DASH_PUNCTUATION][UnicodeSpec.LONG]))
848             return Integer.toString(UnicodeSpec.DASH_PUNCTUATION);
849         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.START_PUNCTUATION][UnicodeSpec.LONG]))
850             return Integer.toString(UnicodeSpec.START_PUNCTUATION);
851         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.END_PUNCTUATION][UnicodeSpec.LONG]))
852             return Integer.toString(UnicodeSpec.END_PUNCTUATION);
853         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.INITIAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
854             return Integer.toString(UnicodeSpec.INITIAL_QUOTE_PUNCTUATION);
855         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.FINAL_QUOTE_PUNCTUATION][UnicodeSpec.LONG]))
856             return Integer.toString(UnicodeSpec.FINAL_QUOTE_PUNCTUATION);
857         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CONNECTOR_PUNCTUATION][UnicodeSpec.LONG]))
858             return Integer.toString(UnicodeSpec.CONNECTOR_PUNCTUATION);
859         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_PUNCTUATION][UnicodeSpec.LONG]))
860             return Integer.toString(UnicodeSpec.OTHER_PUNCTUATION);
861         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.LETTER_NUMBER][UnicodeSpec.LONG]))
862             return Integer.toString(UnicodeSpec.LETTER_NUMBER);
863         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MATH_SYMBOL][UnicodeSpec.LONG]))
864             return Integer.toString(UnicodeSpec.MATH_SYMBOL);
865         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.CURRENCY_SYMBOL][UnicodeSpec.LONG]))
866             return Integer.toString(UnicodeSpec.CURRENCY_SYMBOL);
867         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.MODIFIER_SYMBOL][UnicodeSpec.LONG]))
868             return Integer.toString(UnicodeSpec.MODIFIER_SYMBOL);
869         if (x.equals(UnicodeSpec.generalCategoryList[UnicodeSpec.OTHER_SYMBOL][UnicodeSpec.LONG]))
870             return Integer.toString(UnicodeSpec.OTHER_SYMBOL);
871         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT][UnicodeSpec.LONG]))
872             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT);
873         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING][UnicodeSpec.LONG]))
874             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING);
875         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE][UnicodeSpec.LONG]))
876             return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE);
877         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT][UnicodeSpec.LONG]))
878             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT);
879         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC][UnicodeSpec.LONG]))
880             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC);
881         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING][UnicodeSpec.LONG]))
882             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING);
883         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE][UnicodeSpec.LONG]))
884             return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);
885         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT][UnicodeSpec.LONG]))
886             return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT);
887         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER][UnicodeSpec.LONG]))
888             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER);
889         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
890             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR);
891         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR][UnicodeSpec.LONG]))
892             return Integer.toString(UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR);
893         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER][UnicodeSpec.LONG]))
894             return Integer.toString(UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER);
895         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR][UnicodeSpec.LONG]))
896             return Integer.toString(UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR);
897         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK][UnicodeSpec.LONG]))
898             return Integer.toString(UnicodeSpec.DIRECTIONALITY_NONSPACING_MARK);
899          if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL][UnicodeSpec.LONG]))
900             return Integer.toString(UnicodeSpec.DIRECTIONALITY_BOUNDARY_NEUTRAL);
901         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR][UnicodeSpec.LONG]))
902             return Integer.toString(UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR);
903         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR][UnicodeSpec.LONG]))
904             return Integer.toString(UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR);
905         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_WHITESPACE][UnicodeSpec.LONG]))
906             return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
907         if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
908             return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
909         FAIL("Unknown text substitution marker " + commandMarker + x);
910         return commandMarker + x;
911     }
912 
913     /**
914     * The genTables method generates source code for all the lookup tables
915     * needed to represent the various Unicode character properties.
916     * It simply calls the method genTable once for each table to be generated
917     * and then generates a summary comment.
918     *
919     * @return   the replacement text for the "Tables" command, as a String
920     *
921     * @see GenerateCharacter#genTable
922     * @see GenerateCharacter#replaceCommand
923     */
924     static String genTables() {
925         int n = sizes.length;
926         StringBuffer result = new StringBuffer();
927         // liu : Add a comment showing the source of this table
928         result.append(commentStart + " The following tables and code generated using:" +
929                   commentEnd + "\n  ");
930         result.append(commentStart + ' ' + commandLineDescription + commentEnd + "\n  ");
931 
932                 if (plane == 0 && bLatin1 == false) {
933             genCaseMapTableDeclaration(result);
934             genCaseMapTable(initializers, specialCaseMaps);
935                 }
936         int totalBytes = 0;
937         for (int k = 0; k < n - 1; k++) {
938             genTable(result, tableNames[k], tables[k], 0, bytes[k]<<3, sizes[k], preshifted[k],
939                 sizes[k+1], false, false, k==0);
940             int s = bytes[k];
941             if (s == 1 && useCharForByte) {
942                 s = 2;
943             }
944             totalBytes += tables[k].length * s;
945         }
946         genTable(result, "A", tables[n - 1], 0, (identifiers ? 2 : 32),
947             sizes[n - 1], false, 0, true, !(identifiers), false);
948 
949         // If we ever need more than 32 bits to represent the character properties,
950         // then a table "B" may be needed as well.
951         genTable(result, "B", tables[n - 1], 32, 16, sizes[n - 1], false, 0, true, true, false);
952 
953         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32)) + 31) >> 5) << 2);
954         result.append(commentStart);
955         result.append(" In all, the character property tables require ");
956         result.append(totalBytes).append(" bytes.").append(commentEnd);
957         if (verbose) {
958             System.out.println("The character property tables require "
959                  + totalBytes + " bytes.");
960         }
961         return result.toString();
962     }
963 
964     /**
965      * The genInitializers method generates the body of the
966      * ensureInitted() method, which enables lazy initialization of
967      * the case map table and other tables.
968      */
969     static String genInitializers() {
970         return initializers.toString();
971     }
972 
973     /**
974      * Return the total number of bytes needed by all tables.  This is a stripped-
975      * down copy of genTables().
976      */
977     static int getTotalBytes() {
978         int n = sizes.length;
979         int totalBytes = 0;
980         for (int k = 0; k < n - 1; k++) {
981             totalBytes += tables[k].length * bytes[k];
982         }
983         totalBytes += ((((tables[n - 1].length * (identifiers ? 2 : 32))
984                          + 31) >> 5) << 2);
985         return totalBytes;
986     }
987 
988     static void appendEscapedStringFragment(StringBuffer result,
989                                             char[] line,
990                                             int length,
991                                             boolean lastFragment) {
992         result.append("    \"");
993         for (int k=0; k<length; ++k) {
994             result.append("\\u");
995             result.append(hex4(line[k]));
996         }
997         result.append("\"");
998         result.append(lastFragment ? ";" : "+");
999         result.append("\n");
1000     }
1001 
1002     static String SMALL_INITIALIZER =
1003         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1004         // "            $$name = new $$type[$$size];\n"+
1005         "            int len = $$name_DATA.length();\n"+
1006         "            int j=0;\n"+
1007         "            for (int i=0; i<len; ++i) {\n"+
1008         "                int c = $$name_DATA.charAt(i);\n"+
1009         "                for (int k=0; k<$$entriesPerChar; ++k) {\n"+
1010         "                    $$name[j++] = ($$type)c;\n"+
1011         "                    c >>= $$bits;\n"+
1012         "                }\n"+
1013         "            }\n"+
1014         "            assert (j == $$size);\n"+
1015         "        }\n";
1016 
1017     static String SAME_SIZE_INITIALIZER =
1018         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1019         "            assert ($$name_DATA.length() == $$size);\n"+
1020         // "            $$name = new $$type[$$size];\n"+
1021         "            for (int i=0; i<$$size; ++i)\n"+
1022         "                $$name[i] = ($$type)$$name_DATA.charAt(i);\n"+
1023         "        }\n";
1024 
1025     static String BIG_INITIALIZER =
1026         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1027         // "            $$name = new $$type[$$size];\n"+
1028         "            int len = $$name_DATA.length();\n"+
1029         "            int j=0;\n"+
1030         "            int charsInEntry=0;\n"+
1031         "            $$type entry=0;\n"+
1032         "            for (int i=0; i<len; ++i) {\n"+
1033         "                entry |= $$name_DATA.charAt(i);\n"+
1034         "                if (++charsInEntry == $$charsPerEntry) {\n"+
1035         "                    $$name[j++] = entry;\n"+
1036         "                    entry = 0;\n"+
1037         "                    charsInEntry = 0;\n"+
1038         "                }\n"+
1039         "                else {\n"+
1040         "                    entry <<= 16;\n"+
1041         "                }\n"+
1042         "            }\n"+
1043         "            assert (j == $$size);\n"+
1044         "        }\n";
1045 
1046     static String INT32_INITIALIZER =
1047         "        { // THIS CODE WAS AUTOMATICALLY CREATED BY GenerateCharacter:\n"+
1048         "            char[] data = $$name_DATA.toCharArray();\n"+
1049         "            assert (data.length == ($$size * 2));\n"+
1050         "            int i = 0, j = 0;\n"+
1051         "            while (i < ($$size * 2)) {\n"+
1052         "                int entry = data[i++] << 16;\n"+
1053         "                $$name[j++] = entry | data[i++];\n"+
1054         "            }\n"+
1055         "        }\n";
1056 
1057     static void addInitializer(String name, String type, int entriesPerChar,
1058                                int bits, int size) {
1059 
1060         String template = (entriesPerChar == 1) ? SAME_SIZE_INITIALIZER :
1061                           ((entriesPerChar > 0) ? SMALL_INITIALIZER : BIG_INITIALIZER);
1062         if (entriesPerChar == -2) {
1063             template = INT32_INITIALIZER;
1064         }
1065         int marklen = commandMarker.length();
1066         int pos = 0;
1067         while ((pos = template.indexOf(commandMarker, pos)) >= 0) {
1068             int newpos = pos + marklen;
1069             char ch = 'x';
1070             while (newpos < template.length() &&
1071                    Character.isJavaIdentifierStart(ch = template.charAt(newpos)) &&
1072                    ch != '_') // Don't allow this in token names
1073                 ++newpos;
1074             String token = template.substring(pos+marklen, newpos);
1075             String replacement = "ERROR";
1076 
1077             if (token.equals("name")) replacement = name;
1078             else if (token.equals("type")) replacement = type;
1079             else if (token.equals("bits")) replacement = ""+bits;
1080             else if (token.equals("size")) replacement = ""+size;
1081             else if (token.equals("entriesPerChar")) replacement = ""+entriesPerChar;
1082             else if (token.equals("charsPerEntry")) replacement = ""+(-entriesPerChar);
1083             else FAIL("Unrecognized token: " + token);
1084 
1085             template = template.substring(0, pos) + replacement + template.substring(newpos);
1086             pos += replacement.length();
1087         }
1088         initializers.append(template);
1089     }
1090 
1091     /**
1092     * The genTable method generates source code for one lookup table.
1093     * Most of the complexity stems from handling various options as to
1094     * the type of the array components, the precise representation of the
1095     * values, the format in which to render each value, the number of values
1096     * to emit on each line of source code, and the kinds of useful comments
1097     * to be generated.
1098     *
1099     * @param result     a StringBuffer, to which the generated source code
1100     *                   text is to be appended
1101     * @param name       the name of the table
1102     * @param table      the table data (an array of long values)
1103     * @param extract    a distance, in bits, by which each entry of the table
1104     *                   is to be right-shifted before it is processed
1105     * @param bits       the number of bits (not bytes) to be used to represent
1106     *                   each table entry
1107     * @param size       the table data is divided up into blocks of size (1<<size);
1108     *                   in this method, this information is used only to affect
1109     *                   how many table values are to be generated per line
1110     * @param preshifted if this flag is true, then the table entries are to be
1111     *                   emitted in a preshifted form; that is, each value should
1112     *                   be left-shifted by the amount "shift", so that this work
1113     *                   is built into the table and need not be performed by an
1114     *                   explicit shift operator at run time
1115     * @param shift      this is the shift amount for preshifting of table entries
1116     * @param hexFormat  if this flag is true, table entries should be emitted as
1117     *                   hexadecimal literals; otherwise decimal literals are used
1118     * @param properties if this flag is true, the table entries are encoded
1119     *                   character properties rather than indexes into yet other tables;
1120     *                   therefore comments describing the encoded properties should
1121     *                   be generated
1122     * @param hexComment if this flag is true, each line of output is labelled with
1123     *                   a hexadecimal comment indicating the character values to
1124     *                   which that line applies; otherwise, decimal values indicating
1125     *                   table indices are generated
1126     *
1127     * @see GenerateCharacter#genTables
1128     * @see GenerateCharacter#replaceCommand
1129     */
1130 
1131     static void genTable(StringBuffer result, String name,
1132                          long[] table, int extract, int bits, int size,
1133                          boolean preshifted, int shift, boolean hexFormat,
1134                          boolean properties, boolean hexComment) {
1135 
1136         String atype = bits == 1 ? (Csyntax ? "unsigned long" : "int") :
1137             bits == 2 ? (Csyntax ? "unsigned long" : "int") :
1138             bits == 4 ? (Csyntax ? "unsigned long" : "int") :
1139             bits == 8 ? (Csyntax ? "unsigned char" : "byte") :
1140             bits == 16 ? (Csyntax ? "unsigned short" : "char") :
1141             bits == 32 ? (Csyntax ? "unsigned long" : "int") :
1142             (Csyntax ? "int64" : "long");
1143         long maxPosEntry = bits == 1 ? Integer.MAX_VALUE : // liu
1144             bits == 2 ? Integer.MAX_VALUE :
1145             bits == 4 ? Integer.MAX_VALUE :
1146             bits == 8 ? Byte.MAX_VALUE :
1147             bits == 16 ? Short.MAX_VALUE :
1148             bits == 32 ? Integer.MAX_VALUE :
1149             Long.MAX_VALUE;
1150         int entriesPerChar = bits <= 16 ? (16 / bits) : -(bits / 16);
1151         boolean shiftEntries = preshifted && shift != 0;
1152         if (bits == 8 && tableAsString && useCharForByte) {
1153             atype = "char";
1154             maxPosEntry = Character.MAX_VALUE;
1155             entriesPerChar = 1;
1156         }
1157         boolean noConversion = atype.equals("char");
1158 
1159         result.append(commentStart);
1160         result.append(" The ").append(name).append(" table has ").append(table.length);
1161         result.append(" entries for a total of ");
1162         int sizeOfTable = ((table.length * bits + 31) >> 5) << 2;
1163         if (bits == 8 && useCharForByte) {
1164             sizeOfTable *= 2;
1165         }
1166         result.append(sizeOfTable);
1167         result.append(" bytes.").append(commentEnd).append("\n\n");
1168         if (Csyntax)
1169             result.append("  static ");
1170         else
1171             result.append("  static final ");
1172         result.append(atype);
1173         result.append(" ").append(name).append("[");
1174         if (Csyntax)
1175             result.append(table.length >> (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0));
1176         if (tableAsString) {
1177             if (noConversion) {
1178                 result.append("] = (\n");
1179             } else {
1180                 result.append("] = new ").append(atype).append("["+table.length+"];\n  ");
1181                 result.append("static final String ").append(name).append("_DATA =\n");
1182             }
1183             int CHARS_PER_LINE = 8;
1184             StringBuffer theString = new StringBuffer();
1185             int entriesInCharSoFar = 0;
1186             char ch = '\u0000';
1187             int charsPerEntry = -entriesPerChar;
1188             for (int j=0; j<table.length; ++j) {
1189                 //long entry = table[j] >> extract;
1190                 long entry;
1191                 if ("A".equals(name))
1192                     entry = (table[j] & 0xffffffffL) >> extract;
1193                 else
1194                     entry = (table[j] >> extract);
1195                 if (shiftEntries) entry <<= shift;
1196                 if (entry >= (1L << bits)) {
1197                     FAIL("Entry too big");
1198                 }
1199                 if (entriesPerChar > 0) {
1200                     // Pack multiple entries into a character
1201                     ch = (char)(((int)ch >> bits) | (entry << (entriesPerChar-1)*bits));
1202                     ++entriesInCharSoFar;
1203                     if (entriesInCharSoFar == entriesPerChar) {
1204                         // Character is full
1205                         theString.append(ch);
1206                         entriesInCharSoFar = 0;
1207                         ch = '\u0000';
1208                     }
1209                 }
1210                 else {
1211                     // Use multiple characters per entry
1212                     for (int k=0; k<charsPerEntry; ++k) {
1213                         ch = (char)(entry >> ((charsPerEntry-1)*16));
1214                         entry <<= 16;
1215                         theString.append(ch);
1216                     }
1217                 }
1218             }
1219             if (entriesInCharSoFar > 0) {
1220                 while (entriesInCharSoFar < entriesPerChar) {
1221                     ch = (char)((int)ch >> bits);
1222                     ++entriesInCharSoFar;
1223                 }
1224                 theString.append(ch);
1225                 entriesInCharSoFar = 0;
1226             }
1227             result.append(Utility.formatForSource(theString.toString(), "    "));
1228             if (noConversion) {
1229                 result.append(").toCharArray()");
1230             }
1231             result.append(";\n\n  ");
1232 
1233             if (!noConversion) {
1234                 addInitializer(name, atype, entriesPerChar, bits, table.length);
1235             }
1236         }
1237         else {
1238             result.append("] = {");
1239             boolean castEntries = shiftEntries && (bits < 32);
1240             int printPerLine = hexFormat ? (bits == 1 ? 32*4 :
1241                 bits == 2 ? 16*4 :
1242                 bits == 4 ? 8*4 :
1243                 bits == 8 ? 8 :
1244                 bits == 16 ? 8 :
1245                 bits == 32 ? 4 : 2) :
1246                 (bits == 8 ? 8 :
1247                 bits == 16 ? 8 : 4);
1248             int printMask = properties ? 0 :
1249             Math.min(1 << size,
1250                 printPerLine >> (castEntries ? (Csyntax ? 2 : 1) : 0)) - 1;
1251             int commentShift = ((1 << size) == table.length) ? 0 : size;
1252             int commentMask = ((1 << size) == table.length) ? printMask : (1 << size) - 1;
1253             long val = 0;
1254             for (int j = 0; j < table.length; j++) {
1255                 if ((j & printMask) == 0) {
1256                     while (result.charAt(result.length() - 1) == ' ')
1257                         result.setLength(result.length() - 1);
1258                     result.append("\n    ");
1259                 }
1260         PRINT:  {
1261                 if (castEntries)
1262                     result.append("(").append(atype).append(")(");
1263                 long entry = table[j] >> extract;
1264                 int packMask = ((1 << (bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 2)) - 1);
1265                 int k = j & packMask;
1266                 if (bits >= 8)
1267                     val = entry;
1268                 else if (k == 0) {
1269                     val = entry;
1270                     break PRINT;
1271                 }
1272                 else {
1273                     val |= (entry << (k*bits));
1274                     if (k != packMask)
1275                         break PRINT;
1276                 }
1277                 if (val > maxPosEntry && !Csyntax) { // liu
1278                 // For values that are out of range, convert them to in-range negative values.
1279                 // Actually, output the '-' and convert them to the negative of the corresponding
1280                 // in-range negative values.  E.g., convert 130 == -126 (in 8 bits) -> 126.
1281                     result.append('-');
1282                     val = maxPosEntry + maxPosEntry + 2 - val;
1283                 }
1284                 if (hexFormat) {
1285                     result.append("0x");
1286                     if (bits == 8)
1287                         result.append(hex2((byte)val));
1288                     else if (bits == 16)
1289                         result.append(hex4((short)val));
1290                     else if (bits == 32 || bits < 8)
1291                         result.append(hex8((int)val));
1292                     else {
1293                         result.append(hex16(val));
1294                         if (!Csyntax)
1295                             result.append("L");
1296                     }
1297                 }
1298                 else {
1299                     if (bits == 8)
1300                         result.append(dec3(val));
1301                     else if (bits == 64) {
1302                         result.append(dec5(val));
1303                         if (!Csyntax)
1304                             result.append("L");
1305                     }
1306                     else
1307                         result.append(dec5(val));
1308                 }
1309                 if (shiftEntries)
1310                     result.append("<<").append(shift);
1311                 if (castEntries) result.append(")");
1312                 if (j < (table.length - 1))
1313                     result.append(", ");
1314                 else
1315                     result.append("  ");
1316                 if ((j & printMask) == printMask) {
1317                     result.append(" ").append(commentStart).append(" ");
1318                     if (hexComment)
1319                         result.append("0x").append(hex4((j & ~commentMask) << (16 - size)));
1320                     else
1321                         result.append(dec3((j & ~commentMask) >> commentShift));
1322                     if (properties) propertiesComments(result, val);
1323                     result.append(commentEnd);
1324                 }
1325                 } // end PRINT
1326             }
1327             result.append("\n  };\n\n  ");
1328         }
1329     }
1330 
1331     static void genCaseMapTableDeclaration(StringBuffer result) {
1332         String myTab = "    ";
1333         result.append(myTab + "static final char[][][] charMap;\n");
1334     }
1335 
1336     static void genCaseMapTable(StringBuffer result, SpecialCaseMap[] specialCaseMaps){
1337         String myTab = "    ";
1338         int ch;
1339         char[] map;
1340         result.append(myTab + "charMap = new char[][][] {\n");
1341         for (int x = 0; x < specialCaseMaps.length; x++) {
1342             ch = specialCaseMaps[x].getCharSource();
1343             map = specialCaseMaps[x].getUpperCaseMap();
1344             result.append(myTab + myTab);
1345             result.append("{ ");
1346             result.append("{\'\\u"+hex4(ch)+"\'}, {");
1347             for (int y = 0; y < map.length; y++) {
1348                 result.append("\'\\u"+hex4(map[y])+"\', ");
1349             }
1350             result.append("} },\n");
1351         }
1352         result.append(myTab + "};\n");
1353 
1354     }
1355 
1356     /**
1357     * The propertiesComments method generates comments describing encoded
1358     * character properties.
1359     *
1360     * @param result     a StringBuffer, to which the generated source code
1361     *                   text is to be appended
1362     * @param val                encoded character properties
1363     *
1364     * @see GenerateCharacter#genTable
1365     */
1366 
1367     static void propertiesComments(StringBuffer result, long val) {
1368         result.append("   ");
1369         switch ((int)(val & maskType)) {
1370             case UnicodeSpec.CONTROL:
1371                 result.append("Cc");
1372                 break;
1373             case UnicodeSpec.FORMAT:
1374                 result.append("Cf");
1375                 break;
1376             case UnicodeSpec.PRIVATE_USE:
1377                 result.append("Co");
1378                 break;
1379             case UnicodeSpec.SURROGATE:
1380                 result.append("Cs");
1381                 break;
1382             case UnicodeSpec.LOWERCASE_LETTER:
1383                 result.append("Ll");
1384                 break;
1385             case UnicodeSpec.MODIFIER_LETTER:
1386                 result.append("Lm");
1387                 break;
1388             case UnicodeSpec.OTHER_LETTER:
1389                 result.append("Lo");
1390                 break;
1391             case UnicodeSpec.TITLECASE_LETTER:
1392                 result.append("Lt");
1393                 break;
1394             case UnicodeSpec.UPPERCASE_LETTER:
1395                 result.append("Lu");
1396                 break;
1397             case UnicodeSpec.COMBINING_SPACING_MARK:
1398                 result.append("Mc");
1399                 break;
1400             case UnicodeSpec.ENCLOSING_MARK:
1401                 result.append("Me");
1402                 break;
1403             case UnicodeSpec.NON_SPACING_MARK:
1404                 result.append("Mn");
1405                 break;
1406             case UnicodeSpec.DECIMAL_DIGIT_NUMBER:
1407                 result.append("Nd");
1408                 break;
1409             case UnicodeSpec.LETTER_NUMBER:
1410                 result.append("Nl");
1411                 break;
1412             case UnicodeSpec.OTHER_NUMBER:
1413                 result.append("No");
1414                 break;
1415             case UnicodeSpec.CONNECTOR_PUNCTUATION:
1416                 result.append("Pc");
1417                 break;
1418             case UnicodeSpec.DASH_PUNCTUATION:
1419                 result.append("Pd");
1420                 break;
1421             case UnicodeSpec.END_PUNCTUATION:
1422                 result.append("Pe");
1423                 break;
1424             case UnicodeSpec.OTHER_PUNCTUATION:
1425                 result.append("Po");
1426                 break;
1427             case UnicodeSpec.START_PUNCTUATION:
1428                 result.append("Ps");
1429                 break;
1430             case UnicodeSpec.CURRENCY_SYMBOL:
1431                 result.append("Sc");
1432                 break;
1433             case UnicodeSpec.MODIFIER_SYMBOL:
1434                 result.append("Sk");
1435                 break;
1436             case UnicodeSpec.MATH_SYMBOL:
1437                 result.append("Sm");
1438                 break;
1439             case UnicodeSpec.OTHER_SYMBOL:
1440                 result.append("So");
1441                 break;
1442             case UnicodeSpec.LINE_SEPARATOR:
1443                 result.append("Zl"); break;
1444             case UnicodeSpec.PARAGRAPH_SEPARATOR:
1445                 result.append("Zp");
1446                 break;
1447             case UnicodeSpec.SPACE_SEPARATOR:
1448                 result.append("Zs");
1449                 break;
1450             case UnicodeSpec.UNASSIGNED:
1451                 result.append("unassigned");
1452                 break;
1453         }
1454 
1455         switch ((int)((val & maskBidi) >> shiftBidi)) {
1456             case UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT:
1457                 result.append(", L");
1458                 break;
1459             case UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT:
1460                 result.append(", R");
1461                 break;
1462             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER:
1463                 result.append(", EN");
1464                 break;
1465             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
1466                 result.append(", ES");
1467                 break;
1468             case UnicodeSpec.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
1469                 result.append(", ET");
1470                 break;
1471             case UnicodeSpec.DIRECTIONALITY_ARABIC_NUMBER:
1472                 result.append(", AN");
1473                 break;
1474             case UnicodeSpec.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
1475                 result.append(", CS");
1476                 break;
1477             case UnicodeSpec.DIRECTIONALITY_PARAGRAPH_SEPARATOR:
1478                 result.append(", B");
1479                 break;
1480             case UnicodeSpec.DIRECTIONALITY_SEGMENT_SEPARATOR:
1481                 result.append(", S");
1482                 break;
1483             case UnicodeSpec.DIRECTIONALITY_WHITESPACE:
1484                 result.append(", WS");
1485                 break;
1486             case UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS:
1487                 result.append(", ON");
1488                 break;
1489         }
1490         if ((val & maskUpperCase) != 0) {
1491             result.append(", hasUpper (subtract ");
1492             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1493         }
1494         if ((val & maskLowerCase) != 0) {
1495             result.append(", hasLower (add ");
1496             result.append((val & maskCaseOffset) >> shiftCaseOffset).append(")");
1497         }
1498         if ((val & maskTitleCase) != 0) {
1499             result.append(", hasTitle");
1500         }
1501         if ((val & maskIdentifierInfo) == valueIgnorable) {
1502             result.append(", ignorable");
1503         }
1504         if ((val & maskIdentifierInfo) == valueJavaUnicodePart) {
1505             result.append(", identifier part");
1506         }
1507         if ((val & maskIdentifierInfo) == valueJavaStartUnicodePart) {
1508             result.append(", underscore");
1509         }
1510         if ((val & maskIdentifierInfo) == valueJavaWhitespace) {
1511             result.append(", whitespace");
1512         }
1513         if ((val & maskIdentifierInfo) == valueJavaOnlyStart) {
1514             result.append(", currency");
1515         }
1516         if ((val & maskIdentifierInfo) == valueJavaUnicodeStart) {
1517             result.append(", identifier start");
1518         }
1519         if ((val & maskNumericType) == valueDigit) {
1520             result.append(", decimal ");
1521             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1522         }
1523         if ((val & maskNumericType) == valueStrangeNumeric) {
1524             result.append(", strange");
1525         }
1526         if ((val & maskNumericType) == valueJavaSupradecimal) {
1527             result.append(", supradecimal ");
1528             result.append((val & maskDigitOffset) >> shiftDigitOffset);
1529         }
1530     }
1531 
1532     static String[] tableNames = { "X", "Y", "Z", "P", "Q", "R", "S", "T", "U", "V", "W" };
1533 
1534     static String tableName(int j) { return tableNames[j]; }
1535 
1536     /**
1537     * The genAccess method generates source code for one table access expression.
1538     *
1539     * Most of the complexity stems from handling various options as to
1540     * table representation, such as whether it contains values so large that
1541     * they are represented as negative values and whether the table values are
1542     * preshifted.  This method also avoids such "ugly" expressions as shifting
1543     * by distance zero, masking when no masking is necessary, and so on.
1544     * For clarity, it generates expressions that do not rely on operator
1545     * precedence, but otherwise it avoids generating redundant parentheses.
1546     *
1547     * A generated expression might look like A[Y[(X[ch>>6]<<6)|(ch&0x3F)]]
1548     * or A[Z[Y[(X[ch>>7]<<4)|((ch>>3)&0xF)]|(ch&0x7)]], for example.
1549     *
1550     * @param tbl                the name of the final table to be accessed
1551     * @param var                the variable name that appeared in parentheses in the
1552     *                           "Lookup" command
1553     * @param bits       the number of bits (not bytes) to be used to represent
1554     *                   the final table entry
1555     * @return   the replacement text for the "Lookup(xxx)" command, as a String
1556     *
1557     * @see GenerateCharacter#replaceCommand
1558     */
1559 
1560     static String genAccess(String tbl, String var, int bits) {
1561         String access = null;
1562         int bitoffset = bits == 1 ? 5 : bits == 2 ? 4 : bits == 4 ? 3 : 0;
1563         for (int k = 0; k < sizes.length; k++) {
1564             int offset = ((k < sizes.length - 1) ? 0 : bitoffset);
1565             int shift = shifts[k] + offset;
1566             String shifted = (shift == 0) ? var : "(" + var + ">>" + shift + ")";
1567             int mask = (1 << (sizes[k] - offset)) - 1;
1568             String masked = (k == 0) ? shifted :
1569               "(" + shifted + "&0x" + hex(mask) + ")";
1570             String index = (k == 0) ? masked :
1571              (mask == 0) ? access : "(" + access + "|" + masked + ")";
1572             String indexNoParens = (index.charAt(0) != '(') ? index :
1573                  index.substring(1, index.length() - 1);
1574             String tblname = (k == sizes.length - 1) ? tbl : tableName(k);
1575             String fetched = tblname + "[" + indexNoParens + "]";
1576             String zeroextended = (zeroextend[k] == 0) ? fetched :
1577                 "(" + fetched + "&0x" + hex(zeroextend[k]) + ")";
1578             int adjustment = preshifted[k] ? 0 :
1579                sizes[k+1] - ((k == sizes.length - 2) ? bitoffset : 0);
1580             String adjusted = (preshifted[k] || adjustment == 0) ? zeroextended :
1581                 "(" + zeroextended + "<<" + adjustment + ")";
1582             String bitshift = (bits == 1) ? "(" + var + "&0x1F)" :
1583                 (bits == 2) ? "((" + var + "&0xF)<<1)" :
1584                 (bits == 4) ? "((" + var + "&7)<<2)" : null;
1585             String extracted = ((k < sizes.length - 1) || (bits >= 8)) ? adjusted :
1586                 "((" + adjusted + ">>" + bitshift + ")&" +
1587                 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1588             access = extracted;
1589         }
1590         return access;
1591     }
1592 
    /* The command line arguments are decoded by processArgs and used to set
     the following global variables.
     */

    // -verbose / -v: print progress information while generating.
    static boolean verbose = false;
    // -nobidi: do not include bidi categories in the encoded properties.
    static boolean nobidi = false;
    // -nomirror: do not include the mirror property in the encoded properties.
    static boolean nomirror = false;
    // -identifiers: generate tables for scanning identifiers only; also
    // selects the default field widths 8 4 4 instead of 10 5 1.
    static boolean identifiers = false;
    // -c: emit C syntax instead of Java syntax (affects the default template
    // and output file names and the comment delimiters).
    static boolean Csyntax = false;
    // -template filename; defaults to the C or Java template name.
    static String TemplateFileName = null;
    // -o filename; defaults to the C or Java output name.
    static String OutputFileName = null;
    // -spec filename; defaults to DefaultUnicodeSpecFileName.
    static String UnicodeSpecFileName = null; // liu
    // -specialcasing filename; defaults to DefaultSpecialCasingFileName.
    static String SpecialCasingFileName = null;
    // -proplist filename; defaults to DefaultPropListFileName.
    static String PropListFileName = null;
    // -usecharforbyte: when set, no 0xFF zero-extension mask is chosen for
    // byte-sized tables in Java output.
    static boolean useCharForByte = false;
    // Bit field widths, one per lookup level, taken from the numeric command
    // line arguments (or the defaults when none are given).
    static int[] sizes;
    // -search bins: number of bins to partition into.
    static int bins = 0; // liu; if > 0, then perform search
    // -string: create tables as strings; rejected together with -c.
    static boolean tableAsString = false;
    // -latin1: create a Latin-1 only property table; cleared again by a
    // -plane argument greater than zero.
    static boolean bLatin1 = false;

    // The command line as given, followed by any defaults that were filled
    // in, each shown in square brackets.
    static String commandLineDescription;

    /* Other global variables, equal in length to the "sizes" array. */

    // shifts[k] is the sum of the widths of all fields after field k.
    static int[] shifts;
    // Mask applied to a value fetched from table k (0 means no masking).
    static int[] zeroextend;
    // Bytes per entry chosen for table k (1, 2 or 4); 0 for the last table.
    static int[] bytes;
    // Whether the entries of table k are stored already shifted; when false
    // the access expression shifts the fetched value itself.
    static boolean[] preshifted;
    // The lookup tables themselves; the last one is the property map.
    static long[][] tables;


    /* Other global variables */
    // Comment delimiters for the output language: slash-star and star-slash
    // for C, double-slash and nothing for Java.
    static String commentStart;
    static String commentEnd;

    // NOTE(review): presumably collects the table initializer code emitted
    // through addInitializer -- confirm against that method.
    static StringBuffer initializers = new StringBuffer();

    /* special casing rules for 1:M toUpperCase mappings */
    static SpecialCaseMap[] specialCaseMaps;
1632 
1633     /**
1634     * Process the command line arguments.
1635     *
1636     * The allowed flags in command line are:
1637     * <dl>
1638     * <dt> -verbose             <dd> Emit comments to standard output describing
1639     *                                   what's going on during the processing.
1640     * <dt> -nobidi              <dd> Do not include bidi categories in the
1641     *                                   encoded character properties.
1642     * <dt> -nomirror    <dd> Do no include mirror property in the encoded
1643     *                        character properties.
1644     * <dt> -identifiers         <dd> Generate tables for scanning identifiers only.
1645     * <dt> -c                   <dd> Output code in C syntax instead of Java syntax.
1646     * <dt> -o filename          <dd> Specify output file name.
1647     * <dt> -template filename   <dd> Specify template input file name.
1648     * <dt> -spec filename        <dd> Specify Unicode spec file name.
1649     * <dt> -specialcasing filename <dd> Specify Unicode special casing file name.
1650     * <dt> -search bins          <dd> Try different partitions into the specified
1651     *                                    number of bins.  E.g., for 2 bins, try
1652     *                                    16 0, 15 1,..., 0 16.
1653     * <dt> -string               <dd> Create table as string.  Only valid with Java
1654     *                                    syntax.
1655     * <dt> -latin1          <dd> Create a latin 1 only property table.
1656     * </dl>
1657     * In addition, decimal literals may appear as command line arguments;
1658     * each one represents the number of bits of the character to be broken
1659     * off at each lookup step.  If present, they must add up to 16 (the number
1660     * of bits in a char value).  For smaller tables, the last value should
1661     * be 0; values other than the last one may not be zero.  If no such
1662     * numeric values are provided, default values are used.
1663     *
1664     * @param args       the command line arguments, as an array of String
1665     *
1666     * @see GenerateCharacter#main
1667     */
1668 
1669     static void processArgs(String[] args) {
1670         StringBuffer desc = new StringBuffer("java GenerateCharacter");
1671         for (int j=0; j<args.length; ++j) {
1672             desc.append(" " + args[j]);
1673         }
1674         for (int j = 0; j < args.length; j++) {
1675             if (args[j].equals("-verbose") || args[j].equals("-v"))
1676                 verbose = true;
1677             else if (args[j].equals("-nobidi"))
1678                 nobidi = true;
1679             else if (args[j].equals("-nomirror"))
1680                 nomirror = true;
1681             else if (args[j].equals("-identifiers"))
1682                 identifiers = true;
1683             else if (args[j].equals("-c"))
1684                 Csyntax = true;
1685             else if (args[j].equals("-string"))
1686                 tableAsString = true;
1687             else if (args[j].equals("-o")) {
1688                 if (j == args.length - 1) {
1689                     FAIL("File name missing after -o");
1690                 }
1691                 else {
1692                     OutputFileName = args[++j];
1693                 }
1694             }
1695             else if (args[j].equals("-search")) {
1696                 if (j == args.length - 1)
1697                     FAIL("Bin count missing after -search");
1698                 else {
1699                     bins = Integer.parseInt(args[++j]);
1700                     if (bins < 1 || bins > 10)
1701                         FAIL("Bin count must be >= 1 and <= 10");
1702                 }
1703             }
1704             else if (args[j].equals("-template")) {
1705                 if (j == args.length - 1)
1706                     FAIL("File name missing after -template");
1707                 else
1708                     TemplateFileName = args[++j];
1709             }
1710             else if (args[j].equals("-spec")) { // liu
1711                 if (j == args.length - 1) {
1712                     FAIL("File name missing after -spec");
1713                 }
1714                 else {
1715                     UnicodeSpecFileName = args[++j];
1716                 }
1717             }
1718             else if (args[j].equals("-specialcasing")) {
1719                 if (j == args.length -1) {
1720                     FAIL("File name missing after -specialcasing");
1721                 }
1722                 else {
1723                     SpecialCasingFileName = args[++j];
1724                 }
1725             }
1726             else if (args[j].equals("-proplist")) {
1727                 if (j == args.length -1) {
1728                     FAIL("File name missing after -proplist");
1729                 }
1730                 else {
1731                     PropListFileName = args[++j];
1732                 }
1733             }
1734             else if (args[j].equals("-plane")) {
1735                 if (j == args.length -1) {
1736                     FAIL("Plane number missing after -plane");
1737                 }
1738                 else {
1739                     plane = Integer.parseInt(args[++j]);
1740                 }
1741                 if (plane > 0) {
1742                     bLatin1 = false;
1743                 }
1744             }
1745             else if ("-usecharforbyte".equals(args[j])) {
1746                 useCharForByte = true;
1747             }
1748             else if (args[j].equals("-latin1")) {
1749                 bLatin1 = true;
1750                 plane = 0;
1751             }
1752             else {
1753                 try {
1754                     int val = Integer.parseInt(args[j]);
1755                     if (val < 0 || val > 32) FAIL("Incorrect bit field width: " + args[j]);
1756                     if (sizes == null)
1757                         sizes = new int[1];
1758                     else {
1759                         int[] newsizes = new int[sizes.length + 1];
1760                         System.arraycopy(sizes, 0, newsizes, 0, sizes.length);
1761                         sizes = newsizes;
1762                     }
1763                     sizes[sizes.length - 1] = val;
1764                 }
1765                 catch(NumberFormatException e) {
1766                     FAIL("Unknown switch: " + args[j]);
1767                 }
1768             }
1769         }
1770         if (Csyntax && tableAsString) {
1771             FAIL("Can't specify table as string with C syntax");
1772         }
1773         if (sizes == null) {
1774             desc.append(" [");
1775             if (identifiers) {
1776                 int[] newsizes = { 8, 4, 4 };           // Good default values
1777                 desc.append("8 4 4]");
1778                 sizes = newsizes;
1779             }
1780             else {
1781                 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1782                 desc.append("10 5 1]");
1783                 sizes = newsizes;
1784             }
1785         }
1786         if (UnicodeSpecFileName == null) { // liu
1787             UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1788             desc.append(" [-spec " + UnicodeSpecFileName + ']');
1789         }
1790         if (SpecialCasingFileName == null) {
1791             SpecialCasingFileName = DefaultSpecialCasingFileName;
1792             desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1793         }
1794         if (PropListFileName == null) {
1795             PropListFileName = DefaultPropListFileName;
1796             desc.append(" [-proplist " + PropListFileName + ']');
1797         }
1798         if (TemplateFileName == null) {
1799             TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1800                   : DefaultJavaTemplateFileName);
1801             desc.append(" [-template " + TemplateFileName + ']');
1802         }
1803         if (OutputFileName == null) {
1804             OutputFileName = (Csyntax ? DefaultCOutputFileName
1805                     : DefaultJavaOutputFileName);
1806             desc.append(" [-o " + OutputFileName + ']');
1807         }
1808         commentStart = (Csyntax ? "/*" : "//");
1809         commentEnd = (Csyntax ? " */" : "");
1810         commandLineDescription = desc.toString();
1811     }
1812 
1813     private static void searchBins(long[] map, int binsOccupied) throws Exception {
1814         int bitsFree = 16;
1815         for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1816         if (binsOccupied == (bins-1)) {
1817             sizes[binsOccupied] = bitsFree;
1818             generateForSizes(map);
1819         }
1820         else {
1821             for (int i=1; i<bitsFree; ++i) { // Don't allow bins of 0 except for last one
1822                 sizes[binsOccupied] = i;
1823                 searchBins(map, binsOccupied+1);
1824             }
1825         }
1826     }
1827 
1828     private static void generateForSizes(long[] map) throws Exception {
1829         int sum = 0;
1830         shifts = new int[sizes.length];
1831         for (int k = sizes.length - 1; k >= 0; k--) {
1832             shifts[k] = sum;
1833             sum += sizes[k];
1834         }
1835         if ((1 << sum) < map.length || (1 << (sum - 1)) >= map.length) {
1836             FAIL("Bit field widths total to " + sum +
1837              ": wrong total for map of size " + map.length);
1838         }
1839         // need a table for each set of lookup bits in char
1840         tables = new long[sizes.length][];
1841         // the last table is the map
1842         tables[sizes.length - 1] = map;
1843         for (int j = sizes.length - 1; j > 0; j--) {
1844             if (verbose && bins==0)
1845                 System.err.println("Building map " + (j+1) + " of bit width " + sizes[j]);
1846             long[][] temp = buildTable(tables[j], sizes[j]);
1847             tables[j-1] = temp[0];
1848             tables[j] = temp[1];
1849         }
1850         preshifted = new boolean[sizes.length];
1851         zeroextend = new int[sizes.length];
1852         bytes = new int[sizes.length];
1853         for (int j = 0; j < sizes.length - 1; j++) {
1854             int len = tables[j+1].length;
1855             int size = sizes[j+1];
1856             if (len > 0x100 && (len >> size) <= 0x100) {
1857                 len >>= size;
1858                 preshifted[j] = false;
1859             }
1860             else if (len > 0x10000 && (len >> size) <= 0x10000) {
1861                 len >>= size;
1862                 preshifted[j] = false;
1863             }
1864             else preshifted[j] = true;
1865             if (Csyntax)
1866                 zeroextend[j] = 0;
1867             else if (len > 0x7F && len <= 0xFF) {
1868                 if (!useCharForByte) {
1869                     zeroextend[j] = 0xFF;
1870                 }
1871             } else if (len > 0x7FFF && len <= 0xFFFF)
1872                 zeroextend[j] = 0xFFFF;
1873             else zeroextend[j] = 0;
1874             if (len <= 0x100) bytes[j] = 1;
1875             else if (len <= 0x10000) bytes[j] = 2;
1876             else bytes[j] = 4;
1877         }
1878         preshifted[sizes.length - 1] = true;
1879         zeroextend[sizes.length - 1] = 0;
1880         bytes[sizes.length - 1] = 0;
1881         if (bins > 0) {
1882             int totalBytes = getTotalBytes();
1883             String access = genAccess("A", "ch", (identifiers ? 2 : 32));
1884             int accessComplexity = 0;
1885             for (int j=0; j<access.length(); ++j) {
1886                 char ch = access.charAt(j);
1887                 if ("[&|><".indexOf(ch) >= 0) ++accessComplexity;
1888                 if (ch == '<' || ch == '>') ++j;
1889             }
1890             System.out.print("(");
1891             for (int j=0; j<sizes.length; ++j) System.out.print(" " + sizes[j]);
1892             System.out.println(" ) " + totalBytes + " " + accessComplexity + " " + access);
1893             return;
1894         }
1895         if (verbose) {
1896             System.out.println("    n\t size\tlength\tshift\tzeroext\tbytes\tpreshifted");
1897             for (int j = 0; j < sizes.length; j++) {
1898                 System.out.println(dec5(j) + "\t" +
1899                     dec5(sizes[j]) + "\t" +
1900                     dec5(tables[j].length) + "\t" +
1901                     dec5(shifts[j]) + "\t" +
1902                     dec5(zeroextend[j]) + "\t" +
1903                     dec5(bytes[j]) + "\t " +
1904                     preshifted[j]);
1905             }
1906         }
1907         if (verbose) {
1908             System.out.println("Generating source code for class Character");
1909             System.out.println("A table access looks like " +
1910                          genAccess("A", "ch", (identifiers ? 2 : 32)));
1911         }
1912         generateCharacterClass(TemplateFileName, OutputFileName);
1913     }
1914 
1915     /**
1916     * The main program for generating source code for the Character class.
1917     * The basic outline of its operation is:
1918     * <ol>
1919     * <li> Process the command line arguments.  One result of this process
1920     *           is a list of sizes (measured in bits and summing to 16).
1921     * <li> Get the Unicode character property data from the specification file.
1922     * <li> From that, build a map that has, for each character code, its
1923     *           relevant properties encoded as a long integer value.
1924     * <li> Repeatedly compress the map, producing a compressed table and a
1925     *           new map.  This is done once for each size value in the list.
1926     *           When this is done, we have a set of tables.
1927     * <li> Make some decisions about table representation; record these
1928     *           decisions in arrays named preshifted, zeroextend, and bytes.
1929     * <li> Generate the source code for the class Character by performing
1930     *           macro processing on a template file.
1931     * </ol>
1932     *
1933     * @param args       the command line arguments, as an array of String
1934     *
1935     * @see GenerateCharacter#processArgs
1936     * @see UnicodeSpec@readSpecFile
1937     * @see GenerateCharacter#buildMap
1938     * @see GenerateCharacter#buildTable
1939     * @see GenerateCharacter#generateCharacterClass
1940     */
1941 
1942     public static void main(String[] args) {
1943         processArgs(args);
1944         try {
1945 
1946             UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1947             specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1948             PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1949 
1950             if (verbose) {
1951                 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1952             }
1953             long[] map = buildMap(data, specialCaseMaps, propList);
1954             if (verbose) {
1955                 System.err.println("Completed building of initial map");
1956             }
1957 
1958             if (bins == 0) {
1959                 generateForSizes(map);
1960             }
1961             else {
1962                 while (bins > 0) {
1963                     sizes = new int[bins];
1964                     searchBins(map, 0);
1965                     --bins;
1966                 }
1967             }
1968             if (verbose && false) {
1969                 System.out.println("Offset range seen: -" + hex8(-minOffsetSeen) + "..+" +
1970                              hex8(maxOffsetSeen));
1971                 System.out.println("          allowed: -" + hex8(-minOffset) + "..+" +
1972                              hex8(maxOffset));
1973             }
1974         }
1975         catch (FileNotFoundException e) { FAIL(e.toString()); }
1976         catch (IOException e) { FAIL(e.toString()); }
1977         catch (Throwable e) {
1978             System.out.println("Unexpected exception:");
1979             e.printStackTrace();
1980             FAIL("Unexpected exception!");
1981         }
1982         if (verbose) { System.out.println("Done!");}
1983     }
1984 
1985 }   // end class