View Javadoc
1   /*
2    * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  /*
27   *
28   * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29   * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30   *
31   * The original version of this source code and documentation
32   * is copyrighted and owned by Taligent, Inc., a wholly-owned
33   * subsidiary of IBM. These materials are provided under terms
34   * of a License Agreement between Taligent and Sun. This technology
35   * is protected by multiple US and International patents.
36   *
37   * This notice and attribution to Taligent may not be removed.
38   * Taligent is a registered trademark of Taligent, Inc.
39   */
40  
41  package sun.util.locale.provider;
42  
43  import java.io.IOException;
44  import java.text.CharacterIterator;
45  import java.util.ArrayList;
46  import java.util.List;
47  import java.util.Stack;
48  
49  /**
50   * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
51   * to further subdivide ranges of text beyond what is possible using just the
52   * state-table-based algorithm.  This is necessary, for example, to handle
53   * word and line breaking in Thai, which doesn't use spaces between words.  The
54   * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
55   * up text as far as possible, and then contiguous ranges of letters are
56   * repeatedly compared against a list of known words (i.e., the dictionary)
57   * to divide them up into words.
58   *
59   * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
60   * but adds one more special substitution name: <dictionary>.  This substitution
61   * name is used to identify characters in words in the dictionary.  The idea is that
62   * if the iterator passes over a chunk of text that includes two or more characters
63   * in a row that are included in <dictionary>, it goes back through that range and
64   * derives additional break positions (if possible) using the dictionary.
65   *
66   * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
67   * file.  It follows a prescribed search path to locate the dictionary (right now,
68   * it looks for it in /com/ibm/text/resources in each directory in the classpath,
69   * and won't find it in JAR files, but this location is likely to change).  The
70   * dictionary file is in a serialized binary format.  We have a very primitive (and
71   * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
72   * currently making it public.  Contact us for help.
73   */
74  class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
75  
76      /**
77       * a list of known words that is used to divide up contiguous ranges of letters,
78       * stored in a compressed, indexed, format that offers fast access
79       */
80      private BreakDictionary dictionary;
81  
82      /**
83       * a list of flags indicating which character categories are contained in
84       * the dictionary file (this is used to determine which ranges of characters
85       * to apply the dictionary to)
86       */
87      private boolean[] categoryFlags;
88  
89      /**
90       * a temporary hiding place for the number of dictionary characters in the
91       * last range passed over by next()
92       */
93      private int dictionaryCharCount;
94  
95      /**
96       * when a range of characters is divided up using the dictionary, the break
97       * positions that are discovered are stored here, preventing us from having
98       * to use either the dictionary or the state table again until the iterator
99       * leaves this range of text
100      */
101     private int[] cachedBreakPositions;
102 
103     /**
104      * if cachedBreakPositions is not null, this indicates which item in the
105      * cache the current iteration position refers to
106      */
107     private int positionInCache;
108 
109     /**
110      * Constructs a DictionaryBasedBreakIterator.
111      * @param description Same as the description parameter on RuleBasedBreakIterator,
112      * except for the special meaning of "<dictionary>".  This parameter is just
113      * passed through to RuleBasedBreakIterator's constructor.
114      * @param dictionaryFilename The filename of the dictionary file to use
115      */
116     DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
117                                         throws IOException {
118         super(dataFile);
119         byte[] tmp = super.getAdditionalData();
120         if (tmp != null) {
121             prepareCategoryFlags(tmp);
122             super.setAdditionalData(null);
123         }
124         dictionary = new BreakDictionary(dictionaryFile);
125     }
126 
127     private void prepareCategoryFlags(byte[] data) {
128         categoryFlags = new boolean[data.length];
129         for (int i = 0; i < data.length; i++) {
130             categoryFlags[i] = (data[i] == (byte)1) ? true : false;
131         }
132     }
133 
134     @Override
135     public void setText(CharacterIterator newText) {
136         super.setText(newText);
137         cachedBreakPositions = null;
138         dictionaryCharCount = 0;
139         positionInCache = 0;
140     }
141 
142     /**
143      * Sets the current iteration position to the beginning of the text.
144      * (i.e., the CharacterIterator's starting offset).
145      * @return The offset of the beginning of the text.
146      */
147     @Override
148     public int first() {
149         cachedBreakPositions = null;
150         dictionaryCharCount = 0;
151         positionInCache = 0;
152         return super.first();
153     }
154 
155     /**
156      * Sets the current iteration position to the end of the text.
157      * (i.e., the CharacterIterator's ending offset).
158      * @return The text's past-the-end offset.
159      */
160     @Override
161     public int last() {
162         cachedBreakPositions = null;
163         dictionaryCharCount = 0;
164         positionInCache = 0;
165         return super.last();
166     }
167 
168     /**
169      * Advances the iterator one step backwards.
170      * @return The position of the last boundary position before the
171      * current iteration position
172      */
173     @Override
174     public int previous() {
175         CharacterIterator text = getText();
176 
177         // if we have cached break positions and we're still in the range
178         // covered by them, just move one step backward in the cache
179         if (cachedBreakPositions != null && positionInCache > 0) {
180             --positionInCache;
181             text.setIndex(cachedBreakPositions[positionInCache]);
182             return cachedBreakPositions[positionInCache];
183         }
184 
185         // otherwise, dump the cache and use the inherited previous() method to move
186         // backward.  This may fill up the cache with new break positions, in which
187         // case we have to mark our position in the cache
188         else {
189             cachedBreakPositions = null;
190             int result = super.previous();
191             if (cachedBreakPositions != null) {
192                 positionInCache = cachedBreakPositions.length - 2;
193             }
194             return result;
195         }
196     }
197 
198     /**
199      * Sets the current iteration position to the last boundary position
200      * before the specified position.
201      * @param offset The position to begin searching from
202      * @return The position of the last boundary before "offset"
203      */
204     @Override
205     public int preceding(int offset) {
206         CharacterIterator text = getText();
207         checkOffset(offset, text);
208 
209         // if we have no cached break positions, or "offset" is outside the
210         // range covered by the cache, we can just call the inherited routine
211         // (which will eventually call other routines in this class that may
212         // refresh the cache)
213         if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
214                 offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
215             cachedBreakPositions = null;
216             return super.preceding(offset);
217         }
218 
219         // on the other hand, if "offset" is within the range covered by the cache,
220         // then all we have to do is search the cache for the last break position
221         // before "offset"
222         else {
223             positionInCache = 0;
224             while (positionInCache < cachedBreakPositions.length
225                    && offset > cachedBreakPositions[positionInCache]) {
226                 ++positionInCache;
227             }
228             --positionInCache;
229             text.setIndex(cachedBreakPositions[positionInCache]);
230             return text.getIndex();
231         }
232     }
233 
234     /**
235      * Sets the current iteration position to the first boundary position after
236      * the specified position.
237      * @param offset The position to begin searching forward from
238      * @return The position of the first boundary after "offset"
239      */
240     @Override
241     public int following(int offset) {
242         CharacterIterator text = getText();
243         checkOffset(offset, text);
244 
245         // if we have no cached break positions, or if "offset" is outside the
246         // range covered by the cache, then dump the cache and call our
247         // inherited following() method.  This will call other methods in this
248         // class that may refresh the cache.
249         if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
250                 offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
251             cachedBreakPositions = null;
252             return super.following(offset);
253         }
254 
255         // on the other hand, if "offset" is within the range covered by the
256         // cache, then just search the cache for the first break position
257         // after "offset"
258         else {
259             positionInCache = 0;
260             while (positionInCache < cachedBreakPositions.length
261                    && offset >= cachedBreakPositions[positionInCache]) {
262                 ++positionInCache;
263             }
264             text.setIndex(cachedBreakPositions[positionInCache]);
265             return text.getIndex();
266         }
267     }
268 
269     /**
270      * This is the implementation function for next().
271      */
272     @Override
273     protected int handleNext() {
274         CharacterIterator text = getText();
275 
276         // if there are no cached break positions, or if we've just moved
277         // off the end of the range covered by the cache, we have to dump
278         // and possibly regenerate the cache
279         if (cachedBreakPositions == null ||
280             positionInCache == cachedBreakPositions.length - 1) {
281 
282             // start by using the inherited handleNext() to find a tentative return
283             // value.   dictionaryCharCount tells us how many dictionary characters
284             // we passed over on our way to the tentative return value
285             int startPos = text.getIndex();
286             dictionaryCharCount = 0;
287             int result = super.handleNext();
288 
289             // if we passed over more than one dictionary character, then we use
290             // divideUpDictionaryRange() to regenerate the cached break positions
291             // for the new range
292             if (dictionaryCharCount > 1 && result - startPos > 1) {
293                 divideUpDictionaryRange(startPos, result);
294             }
295 
296             // otherwise, the value we got back from the inherited fuction
297             // is our return value, and we can dump the cache
298             else {
299                 cachedBreakPositions = null;
300                 return result;
301             }
302         }
303 
304         // if the cache of break positions has been regenerated (or existed all
305         // along), then just advance to the next break position in the cache
306         // and return it
307         if (cachedBreakPositions != null) {
308             ++positionInCache;
309             text.setIndex(cachedBreakPositions[positionInCache]);
310             return cachedBreakPositions[positionInCache];
311         }
312         return -9999;   // SHOULD NEVER GET HERE!
313     }
314 
315     /**
316      * Looks up a character category for a character.
317      */
318     @Override
319     protected int lookupCategory(int c) {
320         // this override of lookupCategory() exists only to keep track of whether we've
321         // passed over any dictionary characters.  It calls the inherited lookupCategory()
322         // to do the real work, and then checks whether its return value is one of the
323         // categories represented in the dictionary.  If it is, bump the dictionary-
324         // character count.
325         int result = super.lookupCategory(c);
326         if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
327             ++dictionaryCharCount;
328         }
329         return result;
330     }
331 
332     /**
333      * This is the function that actually implements the dictionary-based
334      * algorithm.  Given the endpoints of a range of text, it uses the
335      * dictionary to determine the positions of any boundaries in this
336      * range.  It stores all the boundary positions it discovers in
337      * cachedBreakPositions so that we only have to do this work once
338      * for each time we enter the range.
339      */
340     @SuppressWarnings("unchecked")
341     private void divideUpDictionaryRange(int startPos, int endPos) {
342         CharacterIterator text = getText();
343 
344         // the range we're dividing may begin or end with non-dictionary characters
345         // (i.e., for line breaking, we may have leading or trailing punctuation
346         // that needs to be kept with the word).  Seek from the beginning of the
347         // range to the first dictionary character
348         text.setIndex(startPos);
349         int c = getCurrent();
350         int category = lookupCategory(c);
351         while (category == IGNORE || !categoryFlags[category]) {
352             c = getNext();
353             category = lookupCategory(c);
354         }
355 
356         // initialize.  We maintain two stacks: currentBreakPositions contains
357         // the list of break positions that will be returned if we successfully
358         // finish traversing the whole range now.  possibleBreakPositions lists
359         // all other possible word ends we've passed along the way.  (Whenever
360         // we reach an error [a sequence of characters that can't begin any word
361         // in the dictionary], we back up, possibly delete some breaks from
362         // currentBreakPositions, move a break from possibleBreakPositions
363         // to currentBreakPositions, and start over from there.  This process
364         // continues in this way until we either successfully make it all the way
365         // across the range, or exhaust all of our combinations of break
366         // positions.)
367         Stack<Integer> currentBreakPositions = new Stack<>();
368         Stack<Integer> possibleBreakPositions = new Stack<>();
369         List<Integer> wrongBreakPositions = new ArrayList<>();
370 
371         // the dictionary is implemented as a trie, which is treated as a state
372         // machine.  -1 represents the end of a legal word.  Every word in the
373         // dictionary is represented by a path from the root node to -1.  A path
374         // that ends in state 0 is an illegal combination of characters.
375         int state = 0;
376 
377         // these two variables are used for error handling.  We keep track of the
378         // farthest we've gotten through the range being divided, and the combination
379         // of breaks that got us that far.  If we use up all possible break
380         // combinations, the text contains an error or a word that's not in the
381         // dictionary.  In this case, we "bless" the break positions that got us the
382         // farthest as real break positions, and then start over from scratch with
383         // the character where the error occurred.
384         int farthestEndPoint = text.getIndex();
385         Stack<Integer> bestBreakPositions = null;
386 
387         // initialize (we always exit the loop with a break statement)
388         c = getCurrent();
389         while (true) {
390 
391             // if we can transition to state "-1" from our current state, we're
392             // on the last character of a legal word.  Push that position onto
393             // the possible-break-positions stack
394             if (dictionary.getNextState(state, 0) == -1) {
395                 possibleBreakPositions.push(text.getIndex());
396             }
397 
398             // look up the new state to transition to in the dictionary
399             state = dictionary.getNextStateFromCharacter(state, c);
400 
401             // if the character we're sitting on causes us to transition to
402             // the "end of word" state, then it was a non-dictionary character
403             // and we've successfully traversed the whole range.  Drop out
404             // of the loop.
405             if (state == -1) {
406                 currentBreakPositions.push(text.getIndex());
407                 break;
408             }
409 
410             // if the character we're sitting on causes us to transition to
411             // the error state, or if we've gone off the end of the range
412             // without transitioning to the "end of word" state, we've hit
413             // an error...
414             else if (state == 0 || text.getIndex() >= endPos) {
415 
416                 // if this is the farthest we've gotten, take note of it in
417                 // case there's an error in the text
418                 if (text.getIndex() > farthestEndPoint) {
419                     farthestEndPoint = text.getIndex();
420 
421                     @SuppressWarnings("unchecked")
422                     Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
423 
424                     bestBreakPositions = currentBreakPositionsCopy;
425                 }
426 
427                 // wrongBreakPositions is a list of all break positions
428                 // we've tried starting that didn't allow us to traverse
429                 // all the way through the text.  Every time we pop a
430                 // break position off of currentBreakPositions, we put it
431                 // into wrongBreakPositions to avoid trying it again later.
432                 // If we make it to this spot, we're either going to back
433                 // up to a break in possibleBreakPositions and try starting
434                 // over from there, or we've exhausted all possible break
435                 // positions and are going to do the fallback procedure.
436                 // This loop prevents us from messing with anything in
437                 // possibleBreakPositions that didn't work as a starting
438                 // point the last time we tried it (this is to prevent a bunch of
439                 // repetitive checks from slowing down some extreme cases)
440                 while (!possibleBreakPositions.isEmpty()
441                         && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
442                     possibleBreakPositions.pop();
443                 }
444 
445                 // if we've used up all possible break-position combinations, there's
446                 // an error or an unknown word in the text.  In this case, we start
447                 // over, treating the farthest character we've reached as the beginning
448                 // of the range, and "blessing" the break positions that got us that
449                 // far as real break positions
450                 if (possibleBreakPositions.isEmpty()) {
451                     if (bestBreakPositions != null) {
452                         currentBreakPositions = bestBreakPositions;
453                         if (farthestEndPoint < endPos) {
454                             text.setIndex(farthestEndPoint + 1);
455                         }
456                         else {
457                             break;
458                         }
459                     }
460                     else {
461                         if ((currentBreakPositions.size() == 0 ||
462                              currentBreakPositions.peek().intValue() != text.getIndex())
463                             && text.getIndex() != startPos) {
464                             currentBreakPositions.push(new Integer(text.getIndex()));
465                         }
466                         getNext();
467                         currentBreakPositions.push(new Integer(text.getIndex()));
468                     }
469                 }
470 
471                 // if we still have more break positions we can try, then promote the
472                 // last break in possibleBreakPositions into currentBreakPositions,
473                 // and get rid of all entries in currentBreakPositions that come after
474                 // it.  Then back up to that position and start over from there (i.e.,
475                 // treat that position as the beginning of a new word)
476                 else {
477                     Integer temp = possibleBreakPositions.pop();
478                     Integer temp2 = null;
479                     while (!currentBreakPositions.isEmpty() && temp.intValue() <
480                            currentBreakPositions.peek().intValue()) {
481                         temp2 = currentBreakPositions.pop();
482                         wrongBreakPositions.add(temp2);
483                     }
484                     currentBreakPositions.push(temp);
485                     text.setIndex(currentBreakPositions.peek().intValue());
486                 }
487 
488                 // re-sync "c" for the next go-round, and drop out of the loop if
489                 // we've made it off the end of the range
490                 c = getCurrent();
491                 if (text.getIndex() >= endPos) {
492                     break;
493                 }
494             }
495 
496             // if we didn't hit any exceptional conditions on this last iteration,
497             // just advance to the next character and loop
498             else {
499                 c = getNext();
500             }
501         }
502 
503         // dump the last break position in the list, and replace it with the actual
504         // end of the range (which may be the same character, or may be further on
505         // because the range actually ended with non-dictionary characters we want to
506         // keep with the word)
507         if (!currentBreakPositions.isEmpty()) {
508             currentBreakPositions.pop();
509         }
510         currentBreakPositions.push(endPos);
511 
512         // create a regular array to hold the break positions and copy
513         // the break positions from the stack to the array (in addition,
514         // our starting position goes into this array as a break position).
515         // This array becomes the cache of break positions used by next()
516         // and previous(), so this is where we actually refresh the cache.
517         cachedBreakPositions = new int[currentBreakPositions.size() + 1];
518         cachedBreakPositions[0] = startPos;
519 
520         for (int i = 0; i < currentBreakPositions.size(); i++) {
521             cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
522         }
523         positionInCache = 0;
524     }
525 }