View Javadoc
1   /*
2    * Copyright (c) 1994, 2004, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  package java.util;
27  
28  import java.lang.*;
29  
30  /**
31   * The string tokenizer class allows an application to break a
32   * string into tokens. The tokenization method is much simpler than
33   * the one used by the <code>StreamTokenizer</code> class. The
34   * <code>StringTokenizer</code> methods do not distinguish among
35   * identifiers, numbers, and quoted strings, nor do they recognize
36   * and skip comments.
37   * <p>
38   * The set of delimiters (the characters that separate tokens) may
39   * be specified either at creation time or on a per-token basis.
40   * <p>
41   * An instance of <code>StringTokenizer</code> behaves in one of two
42   * ways, depending on whether it was created with the
43   * <code>returnDelims</code> flag having the value <code>true</code>
44   * or <code>false</code>:
45   * <ul>
46   * <li>If the flag is <code>false</code>, delimiter characters serve to
47   *     separate tokens. A token is a maximal sequence of consecutive
48   *     characters that are not delimiters.
49   * <li>If the flag is <code>true</code>, delimiter characters are themselves
50   *     considered to be tokens. A token is thus either one delimiter
51   *     character, or a maximal sequence of consecutive characters that are
52   *     not delimiters.
53   * </ul><p>
54   * A <tt>StringTokenizer</tt> object internally maintains a current
55   * position within the string to be tokenized. Some operations advance this
56   * current position past the characters processed.<p>
57   * A token is returned by taking a substring of the string that was used to
58   * create the <tt>StringTokenizer</tt> object.
59   * <p>
60   * The following is one example of the use of the tokenizer. The code:
61   * <blockquote><pre>
62   *     StringTokenizer st = new StringTokenizer("this is a test");
63   *     while (st.hasMoreTokens()) {
64   *         System.out.println(st.nextToken());
65   *     }
66   * </pre></blockquote>
67   * <p>
68   * prints the following output:
69   * <blockquote><pre>
70   *     this
71   *     is
72   *     a
73   *     test
74   * </pre></blockquote>
75   *
76   * <p>
77   * <tt>StringTokenizer</tt> is a legacy class that is retained for
78   * compatibility reasons although its use is discouraged in new code. It is
79   * recommended that anyone seeking this functionality use the <tt>split</tt>
80   * method of <tt>String</tt> or the java.util.regex package instead.
81   * <p>
82   * The following example illustrates how the <tt>String.split</tt>
83   * method can be used to break up a string into its basic tokens:
84   * <blockquote><pre>
85   *     String[] result = "this is a test".split("\\s");
86   *     for (int x=0; x&lt;result.length; x++)
87   *         System.out.println(result[x]);
88   * </pre></blockquote>
89   * <p>
90   * prints the following output:
91   * <blockquote><pre>
92   *     this
93   *     is
94   *     a
95   *     test
96   * </pre></blockquote>
97   *
98   * @author  unascribed
99   * @see     java.io.StreamTokenizer
100  * @since   JDK1.0
101  */
public class StringTokenizer implements Enumeration<Object> {
    /** Index in <code>str</code> at which the next scan starts. */
    private int currentPosition;

    /**
     * Position computed by the most recent <code>hasMoreTokens()</code> call,
     * or -1 when no such position is cached.
     */
    private int newPosition;

    /** Length of <code>str</code>; scanning never goes past this index. */
    private int maxPosition;

    /** The string being tokenized. */
    private String str;

    /** The current delimiter set; may be <code>null</code> (see constructors). */
    private String delimiters;

    /** Whether delimiters are themselves returned as tokens. */
    private boolean retDelims;

    /**
     * Set by <code>nextToken(String)</code> so that the following
     * <code>nextToken()</code> ignores any position cached by
     * <code>hasMoreTokens()</code> under the previous delimiter set.
     */
    private boolean delimsChanged;

    /**
     * maxDelimCodePoint stores the value of the delimiter character with the
     * highest value. It is used to optimize the detection of delimiter
     * characters.
     *
     * It is unlikely to provide any optimization benefit in the
     * hasSurrogates case because most string characters will be
     * smaller than the limit, but we keep it so that the two code
     * paths remain similar.
     */
    private int maxDelimCodePoint;

    /**
     * If delimiters include any surrogates (including surrogate
     * pairs), hasSurrogates is true and the tokenizer uses the
     * different code path. This is because String.indexOf(int)
     * doesn't handle unpaired surrogates as a single character.
     */
    private boolean hasSurrogates = false;

    /**
     * When hasSurrogates is true, delimiters are converted to code
     * points and isDelimiter(int) is used to determine if the given
     * codepoint is a delimiter. It is <code>null</code> otherwise.
     */
    private int[] delimiterCodePoints;

    /**
     * Recomputes everything that is derived from <code>delimiters</code>:
     * <code>maxDelimCodePoint</code>, <code>hasSurrogates</code> and
     * <code>delimiterCodePoints</code>.
     * <p>
     * All three are rebuilt from scratch on every call, so that switching
     * delimiter sets through <code>nextToken(String)</code> never leaves the
     * tokenizer on the surrogate code path (or holding a code point table)
     * that belonged to a previous delimiter set.
     */
    private void setMaxDelimCodePoint() {
        // Drop whatever was derived from the previous delimiter set.
        maxDelimCodePoint = 0;
        hasSurrogates = false;
        delimiterCodePoints = null;

        if (delimiters == null) {
            return;
        }

        final int length = delimiters.length();
        int max = 0;
        int count = 0;
        boolean surrogates = false;
        for (int i = 0; i < length; ) {
            int c = delimiters.charAt(i);
            if (c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE) {
                // Surrogate code unit: read the whole code point, which is the
                // unit itself when it is unpaired.
                c = delimiters.codePointAt(i);
                surrogates = true;
            }
            if (max < c) {
                max = c;
            }
            count++;
            i += Character.charCount(c);
        }
        maxDelimCodePoint = max;
        hasSurrogates = surrogates;

        if (surrogates) {
            int[] codePoints = new int[count];
            for (int i = 0, j = 0; i < count; i++) {
                int c = delimiters.codePointAt(j);
                codePoints[i] = c;
                j += Character.charCount(c);
            }
            delimiterCodePoints = codePoints;
        }
    }

    /**
     * Constructs a string tokenizer for the specified string. All
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens.
     * <p>
     * If the <code>returnDelims</code> flag is <code>true</code>, then
     * the delimiter characters are also returned as tokens. Each
     * delimiter is returned as a string of length one. If the flag is
     * <code>false</code>, the delimiter characters are skipped and only
     * serve as separators between tokens.
     * <p>
     * Note that if <tt>delim</tt> is <tt>null</tt>, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting <tt>StringTokenizer</tt> may result in a
     * <tt>NullPointerException</tt>.
     *
     * @param   str            a string to be parsed.
     * @param   delim          the delimiters.
     * @param   returnDelims   flag indicating whether to return the delimiters
     *                         as tokens.
     * @exception NullPointerException if str is <CODE>null</CODE>
     */
    public StringTokenizer(String str, String delim, boolean returnDelims) {
        currentPosition = 0;
        newPosition = -1;
        delimsChanged = false;
        this.str = str;
        maxPosition = str.length();
        delimiters = delim;
        retDelims = returnDelims;
        setMaxDelimCodePoint();
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * characters in the <code>delim</code> argument are the delimiters
     * for separating tokens. Delimiter characters themselves will not
     * be treated as tokens.
     * <p>
     * Note that if <tt>delim</tt> is <tt>null</tt>, this constructor does
     * not throw an exception. However, trying to invoke other methods on the
     * resulting <tt>StringTokenizer</tt> may result in a
     * <tt>NullPointerException</tt>.
     *
     * @param   str     a string to be parsed.
     * @param   delim   the delimiters.
     * @exception NullPointerException if str is <CODE>null</CODE>
     */
    public StringTokenizer(String str, String delim) {
        this(str, delim, false);
    }

    /**
     * Constructs a string tokenizer for the specified string. The
     * tokenizer uses the default delimiter set, which is
     * <code>"&nbsp;&#92;t&#92;n&#92;r&#92;f"</code>: the space character,
     * the tab character, the newline character, the carriage-return character,
     * and the form-feed character. Delimiter characters themselves will
     * not be treated as tokens.
     *
     * @param   str   a string to be parsed.
     * @exception NullPointerException if str is <CODE>null</CODE>
     */
    public StringTokenizer(String str) {
        this(str, " \t\n\r\f", false);
    }

    /**
     * Reads the unit the scanner works on at the given index: a full code
     * point when hasSurrogates is true, a single char otherwise.
     */
    private int codeAt(int position) {
        return hasSurrogates ? str.codePointAt(position) : str.charAt(position);
    }

    /**
     * Tells whether a value obtained from codeAt(int) is a delimiter.
     * Values above maxDelimCodePoint are rejected without a lookup.
     */
    private boolean isDelimiterCode(int c) {
        if (c > maxDelimCodePoint) {
            return false;
        }
        return hasSurrogates ? isDelimiter(c) : delimiters.indexOf(c) >= 0;
    }

    /**
     * Skips delimiters starting from the specified position. If retDelims
     * is false, returns the index of the first non-delimiter character at or
     * after startPos. If retDelims is true, startPos is returned.
     *
     * @exception NullPointerException if the current delimiter set is null
     */
    private int skipDelimiters(int startPos) {
        if (delimiters == null) {
            throw new NullPointerException();
        }
        if (retDelims) {
            // Delimiters are tokens in this mode, so there is nothing to skip.
            return startPos;
        }

        int position = startPos;
        while (position < maxPosition) {
            int c = codeAt(position);
            if (!isDelimiterCode(c)) {
                break;
            }
            // charCount is 1 for every value produced on the char code path.
            position += Character.charCount(c);
        }
        return position;
    }

    /**
     * Skips ahead from startPos and returns the index of the next delimiter
     * character encountered, or maxPosition if no such delimiter is found.
     * When retDelims is true and startPos itself holds a delimiter, the
     * returned index is just past that single delimiter instead.
     */
    private int scanToken(int startPos) {
        int position = startPos;
        while (position < maxPosition) {
            int c = codeAt(position);
            if (isDelimiterCode(c)) {
                break;
            }
            position += Character.charCount(c);
        }
        if (retDelims && (startPos == position)) {
            // No ordinary characters were consumed: the token is the delimiter.
            int c = codeAt(position);
            if (isDelimiterCode(c)) {
                position += Character.charCount(c);
            }
        }
        return position;
    }

    /**
     * Linear search of delimiterCodePoints; only used when hasSurrogates
     * is true.
     */
    private boolean isDelimiter(int codePoint) {
        for (int delimiter : delimiterCodePoints) {
            if (delimiter == codePoint) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests if there are more tokens available from this tokenizer's string.
     * If this method returns <tt>true</tt>, then a subsequent call to
     * <tt>nextToken</tt> with no argument will successfully return a token.
     *
     * @return  <code>true</code> if and only if there is at least one token
     *          in the string after the current position; <code>false</code>
     *          otherwise.
     */
    public boolean hasMoreTokens() {
        /*
         * Temporarily store this position and use it in the following
         * nextToken() method only if the delimiters haven't been changed in
         * that nextToken() invocation.
         */
        newPosition = skipDelimiters(currentPosition);
        return (newPosition < maxPosition);
    }

    /**
     * Returns the next token from this string tokenizer.
     *
     * @return     the next token from this string tokenizer.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     */
    public String nextToken() {
        /*
         * If the next position was already computed in hasMoreTokens() and
         * the delimiters have NOT changed between that computation and this
         * invocation, reuse the computed value; otherwise skip delimiters
         * again using the current delimiter set.
         */
        if (newPosition >= 0 && !delimsChanged) {
            currentPosition = newPosition;
        } else {
            currentPosition = skipDelimiters(currentPosition);
        }

        /* Reset these anyway */
        delimsChanged = false;
        newPosition = -1;

        if (currentPosition >= maxPosition) {
            throw new NoSuchElementException();
        }
        int start = currentPosition;
        currentPosition = scanToken(currentPosition);
        return str.substring(start, currentPosition);
    }

    /**
     * Returns the next token in this string tokenizer's string. First,
     * the set of characters considered to be delimiters by this
     * <tt>StringTokenizer</tt> object is changed to be the characters in
     * the string <tt>delim</tt>. Then the next token in the string
     * after the current position is returned. The current position is
     * advanced beyond the recognized token.  The new delimiter set
     * remains the default after this call.
     *
     * @param      delim   the new delimiters.
     * @return     the next token, after switching to the new delimiter set.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @exception NullPointerException if delim is <CODE>null</CODE>
     */
    public String nextToken(String delim) {
        delimiters = delim;

        /* delimiter string specified, so set the appropriate flag. */
        delimsChanged = true;

        setMaxDelimCodePoint();
        return nextToken();
    }

    /**
     * Returns the same value as the <code>hasMoreTokens</code>
     * method. It exists so that this class can implement the
     * <code>Enumeration</code> interface.
     *
     * @return  <code>true</code> if there are more tokens;
     *          <code>false</code> otherwise.
     * @see     java.util.Enumeration
     * @see     java.util.StringTokenizer#hasMoreTokens()
     */
    @Override
    public boolean hasMoreElements() {
        return hasMoreTokens();
    }

    /**
     * Returns the same value as the <code>nextToken</code> method,
     * except that its declared return value is <code>Object</code> rather than
     * <code>String</code>. It exists so that this class can implement the
     * <code>Enumeration</code> interface.
     *
     * @return     the next token in the string.
     * @exception  NoSuchElementException  if there are no more tokens in this
     *               tokenizer's string.
     * @see        java.util.Enumeration
     * @see        java.util.StringTokenizer#nextToken()
     */
    @Override
    public Object nextElement() {
        return nextToken();
    }

    /**
     * Calculates the number of times that this tokenizer's
     * <code>nextToken</code> method can be called before it generates an
     * exception. The current position is not advanced.
     *
     * @return  the number of tokens remaining in the string using the current
     *          delimiter set.
     * @see     java.util.StringTokenizer#nextToken()
     */
    public int countTokens() {
        int count = 0;
        // Work on a local copy so the tokenizer's own position is untouched.
        int currpos = currentPosition;
        while (currpos < maxPosition) {
            currpos = skipDelimiters(currpos);
            if (currpos >= maxPosition) {
                break;
            }
            currpos = scanToken(currpos);
            count++;
        }
        return count;
    }
}
431 }