View Javadoc
1   /*
2    * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  /*
26   *******************************************************************************
27   * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
28   *                                                                             *
29   * The original version of this source code and documentation is copyrighted   *
30   * and owned by IBM, These materials are provided under terms of a License     *
31   * Agreement between IBM and Sun. This technology is protected by multiple     *
32   * US and International patents. This notice and attribution to IBM may not    *
33   * to removed.                                                                 *
34   *******************************************************************************
35   */
36  
37  package sun.text.normalizer;
38  
39  /**
40   * <p>Class enabling iteration of the values in a Trie.</p>
41   * <p>Result of each iteration contains the interval of codepoints that have
42   * the same value type and the value type itself.</p>
43   * <p>The comparison of each codepoint value is done via extract(), which the
44   * default implementation is to return the value as it is.</p>
45   * <p>Method extract() can be overwritten to perform manipulations on
46   * codepoint values in order to perform specialized comparison.</p>
47   * <p>TrieIterator is designed to be a generic iterator for the CharTrie
48   * and the IntTrie, hence to accommodate both types of data, the return
49   * result will be in terms of int (32 bit) values.</p>
50   * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
51   * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
52   * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
53   * sense, the caller will have to pass a object with a callback function
54   * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
55   * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
56   * codepoints with the same value as determined by
57   * UTrieEnumValue(const void *context, uint32_t value). for each range,
58   * utrie_enum calls the callback function to perform a task. In this way,
59   * icu4c performs the iteration within utrie_enum.
60   * To follow the JDK model, icu4j is slightly different from icu4c.
61   * Instead of requesting the caller to implement an object for a callback.
62   * The caller will have to implement a subclass of TrieIterator, fleshing out
63   * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
64   * the caller will have to code his own iteration and flesh out the task
65   * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
66   * </p>
67   * <p>There are basically 3 usage scenarios for porting:</p>
68   * <p>1) UTrieEnumValue is the only implemented callback then just implement a
69   * subclass of TrieIterator and override the extract(int) method. The
70   * extract(int) method is analogus to UTrieEnumValue callback.
71   * </p>
72   * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
73   * a subclass of TrieIterator, override the extract method and iterate, e.g
74   * </p>
75   * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
76   *               set);<br>
77   * In Java :<br>
78   * <pre>
79   * class TrieIteratorImpl extends TrieIterator{
80   *     public TrieIteratorImpl(Trie data){
81   *         super(data);
82   *     }
83   *     public int extract(int value){
84   *         // port the implementation of _enumPropertyStartsValue here
85   *     }
86   * }
87   * ....
88   * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
89   * while(fcdIter.next(result)) {
90   *     // port the implementation of _enumPropertyStartsRange
91   * }
92   * </pre>
93   * </p>
94   * <p>3) UTrieEnumRange is the only implemented callback then just implement
95   * the while loop, when utrie_enum is called
96   * <pre>
97   * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
98   * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
99   * while(fcdIter.next(result)){
100  *     set.add(result.start);
101  * }
102  * </pre>
103  * </p>
104  * @author synwee
105  * @see com.ibm.icu.impl.Trie
106  * @see com.ibm.icu.lang.UCharacterTypeIterator
107  * @since release 2.1, Jan 17 2002
108  */
109 public class TrieIterator implements RangeValueIterator
110 {
111 
112     // public constructor ---------------------------------------------
113 
114     /**
115     * TrieEnumeration constructor
116     * @param trie to be used
117     * @exception IllegalArgumentException throw when argument is null.
118     */
119     public TrieIterator(Trie trie)
120     {
121         if (trie == null) {
122             throw new IllegalArgumentException(
123                                           "Argument trie cannot be null");
124         }
125         m_trie_             = trie;
126         // synwee: check that extract belongs to the child class
127         m_initialValue_     = extract(m_trie_.getInitialValue());
128         reset();
129     }
130 
131     // public methods -------------------------------------------------
132 
133     /**
134     * <p>Returns true if we are not at the end of the iteration, false
135     * otherwise.</p>
136     * <p>The next set of codepoints with the same value type will be
137     * calculated during this call and returned in the arguement element.</p>
138     * @param element return result
139     * @return true if we are not at the end of the iteration, false otherwise.
140     * @exception NoSuchElementException - if no more elements exist.
141     * @see com.ibm.icu.util.RangeValueIterator.Element
142     */
143     public final boolean next(Element element)
144     {
145         if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
146             return false;
147         }
148         if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
149             calculateNextBMPElement(element)) {
150             return true;
151         }
152         calculateNextSupplementaryElement(element);
153         return true;
154     }
155 
156     /**
157     * Resets the iterator to the beginning of the iteration
158     */
159     public final void reset()
160     {
161         m_currentCodepoint_ = 0;
162         m_nextCodepoint_    = 0;
163         m_nextIndex_        = 0;
164         m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
165         if (m_nextBlock_ == 0) {
166             m_nextValue_ = m_initialValue_;
167         }
168         else {
169             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
170         }
171         m_nextBlockIndex_ = 0;
172         m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
173     }
174 
175     // protected methods ----------------------------------------------
176 
177     /**
178     * Called by next() to extracts a 32 bit value from a trie value
179     * used for comparison.
180     * This method is to be overwritten if special manipulation is to be done
181     * to retrieve a relevant comparison.
182     * The default function is to return the value as it is.
183     * @param value a value from the trie
184     * @return extracted value
185     */
186     protected int extract(int value)
187     {
188         return value;
189     }
190 
191     // private methods ------------------------------------------------
192 
193     /**
194     * Set the result values
195     * @param element return result object
196     * @param start codepoint of range
197     * @param limit (end + 1) codepoint of range
198     * @param value common value of range
199     */
200     private final void setResult(Element element, int start, int limit,
201                                  int value)
202     {
203         element.start = start;
204         element.limit = limit;
205         element.value = value;
206     }
207 
208     /**
209     * Finding the next element.
210     * This method is called just before returning the result of
211     * next().
212     * We always store the next element before it is requested.
213     * In the case that we have to continue calculations into the
214     * supplementary planes, a false will be returned.
215     * @param element return result object
216     * @return true if the next range is found, false if we have to proceed to
217     *         the supplementary range.
218     */
219     private final boolean calculateNextBMPElement(Element element)
220     {
221         int currentBlock    = m_nextBlock_;
222         int currentValue    = m_nextValue_;
223         m_currentCodepoint_ = m_nextCodepoint_;
224         m_nextCodepoint_ ++;
225         m_nextBlockIndex_ ++;
226         if (!checkBlockDetail(currentValue)) {
227             setResult(element, m_currentCodepoint_, m_nextCodepoint_,
228                       currentValue);
229             return true;
230         }
231         // synwee check that next block index == 0 here
232         // enumerate BMP - the main loop enumerates data blocks
233         while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
234             m_nextIndex_ ++;
235             // because of the way the character is split to form the index
236             // the lead surrogate and trail surrogate can not be in the
237             // mid of a block
238             if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
239                 // skip lead surrogate code units,
240                 // go to lead surrogate codepoints
241                 m_nextIndex_ = BMP_INDEX_LENGTH_;
242             }
243             else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
244                 // go back to regular BMP code points
245                 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
246             }
247 
248             m_nextBlockIndex_ = 0;
249             if (!checkBlock(currentBlock, currentValue)) {
250                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
251                           currentValue);
252                 return true;
253             }
254         }
255         m_nextCodepoint_ --;   // step one back since this value has not been
256         m_nextBlockIndex_ --;  // retrieved yet.
257         return false;
258     }
259 
260     /**
261     * Finds the next supplementary element.
262     * For each entry in the trie, the value to be delivered is passed through
263     * extract().
264     * We always store the next element before it is requested.
265     * Called after calculateNextBMP() completes its round of BMP characters.
266     * There is a slight difference in the usage of m_currentCodepoint_
267     * here as compared to calculateNextBMP(). Though both represents the
268     * lower bound of the next element, in calculateNextBMP() it gets set
269     * at the start of any loop, where-else, in calculateNextSupplementary()
270     * since m_currentCodepoint_ already contains the lower bound of the
271     * next element (passed down from calculateNextBMP()), we keep it till
272     * the end before resetting it to the new value.
273     * Note, if there are no more iterations, it will never get to here.
274     * Blocked out by next().
275     * @param element return result object
276     */
277     private final void calculateNextSupplementaryElement(Element element)
278     {
279         int currentValue = m_nextValue_;
280         int currentBlock = m_nextBlock_;
281         m_nextCodepoint_ ++;
282         m_nextBlockIndex_ ++;
283 
284         if (UTF16.getTrailSurrogate(m_nextCodepoint_)
285                                         != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
286             // this piece is only called when we are in the middle of a lead
287             // surrogate block
288             if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
289                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
290                           currentValue);
291                 m_currentCodepoint_ = m_nextCodepoint_;
292                 return;
293             }
294             // we have cleared one block
295             m_nextIndex_ ++;
296             m_nextTrailIndexOffset_ ++;
297             if (!checkTrailBlock(currentBlock, currentValue)) {
298                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
299                           currentValue);
300                 m_currentCodepoint_ = m_nextCodepoint_;
301                 return;
302             }
303         }
304         int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
305         // enumerate supplementary code points
306         while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
307             // lead surrogate access
308             int leadBlock =
309                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
310                                                    Trie.INDEX_STAGE_2_SHIFT_;
311             if (leadBlock == m_trie_.m_dataOffset_) {
312                 // no entries for a whole block of lead surrogates
313                 if (currentValue != m_initialValue_) {
314                     m_nextValue_      = m_initialValue_;
315                     m_nextBlock_      = 0;
316                     m_nextBlockIndex_ = 0;
317                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
318                               currentValue);
319                     m_currentCodepoint_ = m_nextCodepoint_;
320                     return;
321                 }
322 
323                 nextLead += DATA_BLOCK_LENGTH_;
324                 // number of total affected supplementary codepoints in one
325                 // block
326                 // this is not a simple addition of
327                 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
328                 // that we might have moved some of the codepoints
329                 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
330                                      (char)nextLead,
331                                      (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
332                 continue;
333             }
334             if (m_trie_.m_dataManipulate_ == null) {
335                 throw new NullPointerException(
336                             "The field DataManipulate in this Trie is null");
337             }
338             // enumerate trail surrogates for this lead surrogate
339             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
340                                m_trie_.getValue(leadBlock +
341                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
342             if (m_nextIndex_ <= 0) {
343                 // no data for this lead surrogate
344                 if (currentValue != m_initialValue_) {
345                     m_nextValue_      = m_initialValue_;
346                     m_nextBlock_      = 0;
347                     m_nextBlockIndex_ = 0;
348                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
349                               currentValue);
350                     m_currentCodepoint_ = m_nextCodepoint_;
351                     return;
352                 }
353                 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
354             } else {
355                 m_nextTrailIndexOffset_ = 0;
356                 if (!checkTrailBlock(currentBlock, currentValue)) {
357                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
358                               currentValue);
359                     m_currentCodepoint_ = m_nextCodepoint_;
360                     return;
361                 }
362             }
363             nextLead ++;
364          }
365 
366          // deliver last range
367          setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
368                    currentValue);
369     }
370 
371     /**
372     * Internal block value calculations
373     * Performs calculations on a data block to find codepoints in m_nextBlock_
374     * after the index m_nextBlockIndex_ that has the same value.
375     * Note m_*_ variables at this point is the next codepoint whose value
376     * has not been calculated.
377     * But when returned with false, it will be the last codepoint whose
378     * value has been calculated.
379     * @param currentValue the value which other codepoints are tested against
380     * @return true if the whole block has the same value as currentValue or if
381     *              the whole block has been calculated, false otherwise.
382     */
383     private final boolean checkBlockDetail(int currentValue)
384     {
385         while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
386             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
387                                                     m_nextBlockIndex_));
388             if (m_nextValue_ != currentValue) {
389                 return false;
390             }
391             ++ m_nextBlockIndex_;
392             ++ m_nextCodepoint_;
393         }
394         return true;
395     }
396 
397     /**
398     * Internal block value calculations
399     * Performs calculations on a data block to find codepoints in m_nextBlock_
400     * that has the same value.
401     * Will call checkBlockDetail() if highlevel check fails.
402     * Note m_*_ variables at this point is the next codepoint whose value
403     * has not been calculated.
404     * @param currentBlock the initial block containing all currentValue
405     * @param currentValue the value which other codepoints are tested against
406     * @return true if the whole block has the same value as currentValue or if
407     *              the whole block has been calculated, false otherwise.
408     */
409     private final boolean checkBlock(int currentBlock, int currentValue)
410     {
411         m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
412                                                   Trie.INDEX_STAGE_2_SHIFT_;
413         if (m_nextBlock_ == currentBlock &&
414             (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
415             // the block is the same as the previous one, filled with
416             // currentValue
417             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
418         }
419         else if (m_nextBlock_ == 0) {
420             // this is the all-initial-value block
421             if (currentValue != m_initialValue_) {
422                 m_nextValue_      = m_initialValue_;
423                 m_nextBlockIndex_ = 0;
424                 return false;
425             }
426             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
427         }
428         else {
429             if (!checkBlockDetail(currentValue)) {
430                 return false;
431             }
432         }
433         return true;
434     }
435 
436     /**
437     * Internal block value calculations
438     * Performs calculations on multiple data blocks for a set of trail
439     * surrogates to find codepoints in m_nextBlock_ that has the same value.
440     * Will call checkBlock() for internal block checks.
441     * Note m_*_ variables at this point is the next codepoint whose value
442     * has not been calculated.
443     * @param currentBlock the initial block containing all currentValue
444     * @param currentValue the value which other codepoints are tested against
445     * @return true if the whole block has the same value as currentValue or if
446     *              the whole block has been calculated, false otherwise.
447     */
448     private final boolean checkTrailBlock(int currentBlock,
449                                           int currentValue)
450     {
451         // enumerate code points for this lead surrogate
452         while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
453         {
454             // if we ever reach here, we are at the start of a new block
455             m_nextBlockIndex_ = 0;
456             // copy of most of the body of the BMP loop
457             if (!checkBlock(currentBlock, currentValue)) {
458                 return false;
459             }
460             m_nextTrailIndexOffset_ ++;
461             m_nextIndex_ ++;
462         }
463         return true;
464     }
465 
466     /**
467     * Checks if we are beginning at the start of a initial block.
468     * If we are then the rest of the codepoints in this initial block
469     * has the same values.
470     * We increment m_nextCodepoint_ and relevant data members if so.
471     * This is used only in for the supplementary codepoints because
472     * the offset to the trail indexes could be 0.
473     * @return true if we are at the start of a initial block.
474     */
475     private final boolean checkNullNextTrailIndex()
476     {
477         if (m_nextIndex_ <= 0) {
478             m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
479             int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
480             int leadBlock =
481                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
482                                                    Trie.INDEX_STAGE_2_SHIFT_;
483             if (m_trie_.m_dataManipulate_ == null) {
484                 throw new NullPointerException(
485                             "The field DataManipulate in this Trie is null");
486             }
487             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
488                                m_trie_.getValue(leadBlock +
489                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
490             m_nextIndex_ --;
491             m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
492             return true;
493         }
494         return false;
495     }
496 
497     // private data members --------------------------------------------
498 
499     /**
500     * Size of the stage 1 BMP indexes
501     */
502     private static final int BMP_INDEX_LENGTH_ =
503                                         0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
504     /**
505     * Lead surrogate minimum value
506     */
507     private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
508     /**
509     * Trail surrogate minimum value
510     */
511     private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
512     /**
513     * Number of trail surrogate
514     */
515     private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
516     /**
517     * Number of stage 1 indexes for supplementary calculations that maps to
518     * each lead surrogate character.
519     * See second pass into getRawOffset for the trail surrogate character.
520     * 10 for significant number of bits for trail surrogates, 5 for what we
521     * discard during shifting.
522     */
523     private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
524                                     1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
525     /**
526     * Number of data values in a stage 2 (data array) block.
527     */
528     private static final int DATA_BLOCK_LENGTH_ =
529                                               1 << Trie.INDEX_STAGE_1_SHIFT_;
530     /**
531     * Trie instance
532     */
533     private Trie m_trie_;
534     /**
535     * Initial value for trie values
536     */
537     private int m_initialValue_;
538     /**
539     * Next element results and data.
540     */
541     private int m_currentCodepoint_;
542     private int m_nextCodepoint_;
543     private int m_nextValue_;
544     private int m_nextIndex_;
545     private int m_nextBlock_;
546     private int m_nextBlockIndex_;
547     private int m_nextTrailIndexOffset_;
548 }