View Javadoc
1   /*
2    * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  /*
26   *******************************************************************************
27   * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
28   *                                                                             *
29   * The original version of this source code and documentation is copyrighted   *
30   * and owned by IBM, These materials are provided under terms of a License     *
31   * Agreement between IBM and Sun. This technology is protected by multiple     *
32   * US and International patents. This notice and attribution to IBM may not    *
33   * to removed.                                                                 *
34   *******************************************************************************
35   */
36  
37  package sun.text.normalizer;
38  
39  import java.io.IOException;
40  import java.util.MissingResourceException;
41  
42  /**
43   * <p>
44   * The UCharacter class provides extensions to the
45   * <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
46   * java.lang.Character</a> class. These extensions provide support for
47   * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
48   * class, provide support for supplementary characters (those with code
49   * points above U+FFFF).
50   * Each ICU release supports the latest version of Unicode available at that time.
51   * </p>
52   * <p>
53   * Code points are represented in these API using ints. While it would be
54   * more convenient in Java to have a separate primitive datatype for them,
55   * ints suffice in the meantime.
56   * </p>
57   * <p>
58   * To use this class please add the jar file name icu4j.jar to the
59   * class path, since it contains data files which supply the information used
60   * by this file.<br>
61   * E.g. In Windows <br>
62   * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
63   * Otherwise, another method would be to copy the files uprops.dat and
64   * unames.icu from the icu4j source subdirectory
65   * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
66   * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
67   * </p>
68   * <p>
69   * Aside from the additions for UTF-16 support, and the updated Unicode
70   * properties, the main differences between UCharacter and Character are:
71   * <ul>
72   * <li> UCharacter is not designed to be a char wrapper and does not have
73   *      APIs that involve management of that single char.<br>
74   *      These include:
75   *      <ul>
76   *        <li> char charValue(),
77   *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
78   *      </ul>
79   * <li> UCharacter does not include Character APIs that are deprecated, nor
80   *      does it include the Java-specific character information, such as
81   *      boolean isJavaIdentifierPart(char ch).
82   * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
83   *      values '10' - '35'. UCharacter also does this in digit and
84   *      getNumericValue, to adhere to the java semantics of these
85   *      methods.  New methods unicodeDigit, and
86   *      getUnicodeNumericValue do not treat the above code points
87   *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
88   * </ul>
89   * <p>
90   * Further detailed differences can be determined from the program
91   *        <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
92   *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
93   * </p>
94   * <p>
95   * In addition to Java compatibility functions, which calculate derived properties,
96   * this API provides low-level access to the Unicode Character Database.
97   * </p>
98   * <p>
99   * Unicode assigns each code point (not just assigned character) values for
100  * many properties.
101  * Most of them are simple boolean flags, or constants from a small enumerated list.
102  * For some properties, values are strings or other relatively more complex types.
103  * </p>
104  * <p>
105  * For more information see
106  * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
107  * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
108  * </p>
109  * <p>
110  * There are also functions that provide easy migration from C/POSIX functions
111  * like isblank(). Their use is generally discouraged because the C/POSIX
112  * standards do not define their semantics beyond the ASCII range, which means
113  * that different implementations exhibit very different behavior.
114  * Instead, Unicode properties should be used directly.
115  * </p>
116  * <p>
117  * There are also only a few, broad C/POSIX character classes, and they tend
118  * to be used for conflicting purposes. For example, the "isalpha()" class
119  * is sometimes used to determine word boundaries, while a more sophisticated
120  * approach would at least distinguish initial letters from continuation
121  * characters (the latter including combining marks).
122  * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
123  * Another example: There is no "istitle()" class for titlecase characters.
124  * </p>
125  * <p>
126  * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
127  * ICU implements them according to the Standard Recommendations in
128  * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
129  * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
130  * </p>
131  * <p>
132  * API access for C/POSIX character classes is as follows:
133  * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
134  * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
135  * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
136  * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
137  * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
138  * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
139  * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
140  * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
141  * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
142  * - cntrl:     getType(c)==CONTROL
143  * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
144  * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)
145  * </p>
146  * <p>
147  * The C/POSIX character classes are also available in UnicodeSet patterns,
148  * using patterns like [:graph:] or \p{graph}.
149  * </p>
150  * <p>
151  * Note: There are several ICU (and Java) whitespace functions.
152  * Comparison:
153  * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
154  *       most of general categories "Z" (separators) + most whitespace ISO controls
155  *       (including no-break spaces, but excluding IS1..IS4 and ZWSP)
156  * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
157  * - isSpaceChar: just Z (including no-break spaces)
158  * </p>
159  * <p>
160  * This class is not subclassable
161  * </p>
162  * @author Syn Wee Quek
163  * @stable ICU 2.1
164  * @see com.ibm.icu.lang.UCharacterEnums
165  */
166 
167 public final class UCharacter
168 {
169 
170     /**
171      * Numeric Type constants.
172      * @see UProperty#NUMERIC_TYPE
173      * @stable ICU 2.4
174      */
175     public static interface NumericType
176     {
177         /**
178          * @stable ICU 2.4
179          */
180         public static final int DECIMAL = 1;
181     }
182 
183     // public data members -----------------------------------------------
184 
185     /**
186      * The lowest Unicode code point value.
187      * @stable ICU 2.1
188      */
189     public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
190 
191     /**
192      * The highest Unicode code point value (scalar value) according to the
193      * Unicode Standard.
194      * This is a 21-bit value (21 bits, rounded up).<br>
195      * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
196      * @stable ICU 2.1
197      */
198     public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
199 
200     /**
201      * The minimum value for Supplementary code points
202      * @stable ICU 2.1
203      */
204     public static final int SUPPLEMENTARY_MIN_VALUE =
205         UTF16.SUPPLEMENTARY_MIN_VALUE;
206 
207     // public methods ----------------------------------------------------
208 
209     /**
210      * Retrieves the numeric value of a decimal digit code point.
211      * <br>This method observes the semantics of
212      * <code>java.lang.Character.digit()</code>.  Note that this
213      * will return positive values for code points for which isDigit
214      * returns false, just like java.lang.Character.
215      * <br><em>Semantic Change:</em> In release 1.3.1 and
216      * prior, this did not treat the European letters as having a
217      * digit value, and also treated numeric letters and other numbers as
218      * digits.
219      * This has been changed to conform to the java semantics.
220      * <br>A code point is a valid digit if and only if:
221      * <ul>
222      *   <li>ch is a decimal digit or one of the european letters, and
223      *   <li>the value of ch is less than the specified radix.
224      * </ul>
225      * @param ch the code point to query
226      * @param radix the radix
227      * @return the numeric value represented by the code point in the
228      * specified radix, or -1 if the code point is not a decimal digit
229      * or if its value is too large for the radix
230      * @stable ICU 2.1
231      */
232     public static int digit(int ch, int radix)
233     {
234         // when ch is out of bounds getProperty == 0
235         int props = getProperty(ch);
236         int value;
237         if (getNumericType(props) == NumericType.DECIMAL) {
238             value = UCharacterProperty.getUnsignedValue(props);
239         } else {
240             value = getEuropeanDigit(ch);
241         }
242         return (0 <= value && value < radix) ? value : -1;
243     }
244 
245     /**
246      * Returns the Bidirection property of a code point.
247      * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
248      * property.<br>
249      * Result returned belongs to the interface
250      * <a href=UCharacterDirection.html>UCharacterDirection</a>
251      * @param ch the code point to be determined its direction
252      * @return direction constant from UCharacterDirection.
253      * @stable ICU 2.1
254      */
255     public static int getDirection(int ch)
256     {
257         return gBdp.getClass(ch);
258     }
259 
260     /**
261      * Returns a code point corresponding to the two UTF16 characters.
262      * @param lead the lead char
263      * @param trail the trail char
264      * @return code point if surrogate characters are valid.
265      * @exception IllegalArgumentException thrown when argument characters do
266      *            not form a valid codepoint
267      * @stable ICU 2.1
268      */
269     public static int getCodePoint(char lead, char trail)
270     {
271         if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
272             return UCharacterProperty.getRawSupplementary(lead, trail);
273         }
274         throw new IllegalArgumentException("Illegal surrogate characters");
275     }
276 
277     /**
278      * <p>Get the "age" of the code point.</p>
279      * <p>The "age" is the Unicode version when the code point was first
280      * designated (as a non-character or for Private Use) or assigned a
281      * character.
282      * <p>This can be useful to avoid emitting code points to receiving
283      * processes that do not accept newer characters.</p>
284      * <p>The data is from the UCD file DerivedAge.txt.</p>
285      * @param ch The code point.
286      * @return the Unicode version number
287      * @stable ICU 2.6
288      */
289     public static VersionInfo getAge(int ch)
290     {
291         if (ch < MIN_VALUE || ch > MAX_VALUE) {
292         throw new IllegalArgumentException("Codepoint out of bounds");
293         }
294         return PROPERTY_.getAge(ch);
295     }
296 
297     // private variables -------------------------------------------------
298 
299     /**
300      * Database storing the sets of character property
301      */
302     private static final UCharacterProperty PROPERTY_;
303     /**
304      * For optimization
305      */
306     private static final char[] PROPERTY_TRIE_INDEX_;
307     private static final char[] PROPERTY_TRIE_DATA_;
308     private static final int PROPERTY_INITIAL_VALUE_;
309 
310     private static final UBiDiProps gBdp;
311 
312     // block to initialise character property database
313     static
314     {
315         try
316         {
317             PROPERTY_ = UCharacterProperty.getInstance();
318             PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
319             PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
320             PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
321         }
322         catch (Exception e)
323         {
324             throw new MissingResourceException(e.getMessage(),"","");
325         }
326 
327         UBiDiProps bdp;
328         try {
329             bdp=UBiDiProps.getSingleton();
330         } catch(IOException e) {
331             bdp=UBiDiProps.getDummy();
332         }
333         gBdp=bdp;
334     }
335 
336     /**
337      * Shift to get numeric type
338      */
339     private static final int NUMERIC_TYPE_SHIFT_ = 5;
340     /**
341      * Mask to get numeric type
342      */
343     private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
344 
345     // private methods ---------------------------------------------------
346 
347     /**
348      * Getting the digit values of characters like 'A' - 'Z', normal,
349      * half-width and full-width. This method assumes that the other digit
350      * characters are checked by the calling method.
351      * @param ch character to test
352      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
353      *         its corresponding digit will be returned.
354      */
355     private static int getEuropeanDigit(int ch) {
356         if ((ch > 0x7a && ch < 0xff21)
357             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
358             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
359             return -1;
360         }
361         if (ch <= 0x7a) {
362             // ch >= 0x41 or ch < 0x61
363             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
364         }
365         // ch >= 0xff21
366         if (ch <= 0xff3a) {
367             return ch + 10 - 0xff21;
368         }
369         // ch >= 0xff41 && ch <= 0xff5a
370         return ch + 10 - 0xff41;
371     }
372 
373     /**
374      * Gets the numeric type of the property argument
375      * @param props 32 bit property
376      * @return the numeric type
377      */
378     private static int getNumericType(int props)
379     {
380         return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
381     }
382 
383     /**
384      * Gets the property value at the index.
385      * This is optimized.
386      * Note this is alittle different from CharTrie the index m_trieData_
387      * is never negative.
388      * This is a duplicate of UCharacterProperty.getProperty. For optimization
389      * purposes, this method calls the trie data directly instead of through
390      * UCharacterProperty.getProperty.
391      * @param ch code point whose property value is to be retrieved
392      * @return property value of code point
393      * @stable ICU 2.6
394      */
395     private static final int getProperty(int ch)
396     {
397         if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
398             || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
399                 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
400             // BMP codepoint 0000..D7FF or DC00..FFFF
401             try { // using try for ch < 0 is faster than using an if statement
402                 return PROPERTY_TRIE_DATA_[
403                               (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
404                               + (ch & 0x1f)];
405             } catch (ArrayIndexOutOfBoundsException e) {
406                 return PROPERTY_INITIAL_VALUE_;
407             }
408         }
409         if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
410             // lead surrogate D800..DBFF
411             return PROPERTY_TRIE_DATA_[
412                               (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
413                               + (ch & 0x1f)];
414         }
415         // for optimization
416         if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
417             // supplementary code point 10000..10FFFF
418             // look at the construction of supplementary characters
419             // trail forms the ends of it.
420             return PROPERTY_.m_trie_.getSurrogateValue(
421                                       UTF16.getLeadSurrogate(ch),
422                                       (char)(ch & 0x3ff));
423         }
424         // return m_dataOffset_ if there is an error, in this case we return
425         // the default value: m_initialValue_
426         // we cannot assume that m_initialValue_ is at offset 0
427         // this is for optimization.
428         return PROPERTY_INITIAL_VALUE_;
429     }
430 
431 }