View Javadoc
1   /*
2    * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  /*
27   *******************************************************************************
28   * Copyright (C) 2010, International Business Machines Corporation and         *
29   * others. All Rights Reserved.                                                *
30   *******************************************************************************
31   */
32  package sun.util.locale;
33  
34  import java.util.ArrayList;
35  import java.util.Collections;
36  import java.util.HashMap;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Set;
40  
41  public class LanguageTag {
42      //
43      // static fields
44      //
45      public static final String SEP = "-";
46      public static final String PRIVATEUSE = "x";
47      public static final String UNDETERMINED = "und";
48      public static final String PRIVUSE_VARIANT_PREFIX = "lvariant";
49  
50      //
51      // Language subtag fields
52      //
53      private String language = "";      // language subtag
54      private String script = "";        // script subtag
55      private String region = "";        // region subtag
56      private String privateuse = "";    // privateuse
57  
58      private List<String> extlangs = Collections.emptyList();   // extlang subtags
59      private List<String> variants = Collections.emptyList();   // variant subtags
60      private List<String> extensions = Collections.emptyList(); // extensions
61  
62      // Map contains grandfathered tags and its preferred mappings from
63      // http://www.ietf.org/rfc/rfc5646.txt
64      // Keys are lower-case strings.
65      private static final Map<String, String[]> GRANDFATHERED = new HashMap<>();
66  
67      static {
68          // grandfathered = irregular           ; non-redundant tags registered
69          //               / regular             ; during the RFC 3066 era
70          //
71          // irregular     = "en-GB-oed"         ; irregular tags do not match
72          //               / "i-ami"             ; the 'langtag' production and
73          //               / "i-bnn"             ; would not otherwise be
74          //               / "i-default"         ; considered 'well-formed'
75          //               / "i-enochian"        ; These tags are all valid,
76          //               / "i-hak"             ; but most are deprecated
77          //               / "i-klingon"         ; in favor of more modern
78          //               / "i-lux"             ; subtags or subtag
79          //               / "i-mingo"           ; combination
80          //               / "i-navajo"
81          //               / "i-pwn"
82          //               / "i-tao"
83          //               / "i-tay"
84          //               / "i-tsu"
85          //               / "sgn-BE-FR"
86          //               / "sgn-BE-NL"
87          //               / "sgn-CH-DE"
88          //
89          // regular       = "art-lojban"        ; these tags match the 'langtag'
90          //               / "cel-gaulish"       ; production, but their subtags
91          //               / "no-bok"            ; are not extended language
92          //               / "no-nyn"            ; or variant subtags: their meaning
93          //               / "zh-guoyu"          ; is defined by their registration
94          //               / "zh-hakka"          ; and all of these are deprecated
95          //               / "zh-min"            ; in favor of a more modern
96          //               / "zh-min-nan"        ; subtag or sequence of subtags
97          //               / "zh-xiang"
98  
99          final String[][] entries = {
100           //{"tag",         "preferred"},
101             {"art-lojban",  "jbo"},
102             {"cel-gaulish", "xtg-x-cel-gaulish"},   // fallback
103             {"en-GB-oed",   "en-GB-x-oed"},         // fallback
104             {"i-ami",       "ami"},
105             {"i-bnn",       "bnn"},
106             {"i-default",   "en-x-i-default"},      // fallback
107             {"i-enochian",  "und-x-i-enochian"},    // fallback
108             {"i-hak",       "hak"},
109             {"i-klingon",   "tlh"},
110             {"i-lux",       "lb"},
111             {"i-mingo",     "see-x-i-mingo"},       // fallback
112             {"i-navajo",    "nv"},
113             {"i-pwn",       "pwn"},
114             {"i-tao",       "tao"},
115             {"i-tay",       "tay"},
116             {"i-tsu",       "tsu"},
117             {"no-bok",      "nb"},
118             {"no-nyn",      "nn"},
119             {"sgn-BE-FR",   "sfb"},
120             {"sgn-BE-NL",   "vgt"},
121             {"sgn-CH-DE",   "sgg"},
122             {"zh-guoyu",    "cmn"},
123             {"zh-hakka",    "hak"},
124             {"zh-min",      "nan-x-zh-min"},        // fallback
125             {"zh-min-nan",  "nan"},
126             {"zh-xiang",    "hsn"},
127         };
128         for (String[] e : entries) {
129             GRANDFATHERED.put(LocaleUtils.toLowerString(e[0]), e);
130         }
131     }
132 
133     private LanguageTag() {
134     }
135 
136     /*
137      * BNF in RFC5646
138      *
139      * Language-Tag  = langtag             ; normal language tags
140      *               / privateuse          ; private use tag
141      *               / grandfathered       ; grandfathered tags
142      *
143      *
144      * langtag       = language
145      *                 ["-" script]
146      *                 ["-" region]
147      *                 *("-" variant)
148      *                 *("-" extension)
149      *                 ["-" privateuse]
150      *
151      * language      = 2*3ALPHA            ; shortest ISO 639 code
152      *                 ["-" extlang]       ; sometimes followed by
153      *                                     ; extended language subtags
154      *               / 4ALPHA              ; or reserved for future use
155      *               / 5*8ALPHA            ; or registered language subtag
156      *
157      * extlang       = 3ALPHA              ; selected ISO 639 codes
158      *                 *2("-" 3ALPHA)      ; permanently reserved
159      *
160      * script        = 4ALPHA              ; ISO 15924 code
161      *
162      * region        = 2ALPHA              ; ISO 3166-1 code
163      *               / 3DIGIT              ; UN M.49 code
164      *
165      * variant       = 5*8alphanum         ; registered variants
166      *               / (DIGIT 3alphanum)
167      *
168      * extension     = singleton 1*("-" (2*8alphanum))
169      *
170      *                                     ; Single alphanumerics
171      *                                     ; "x" reserved for private use
172      * singleton     = DIGIT               ; 0 - 9
173      *               / %x41-57             ; A - W
174      *               / %x59-5A             ; Y - Z
175      *               / %x61-77             ; a - w
176      *               / %x79-7A             ; y - z
177      *
178      * privateuse    = "x" 1*("-" (1*8alphanum))
179      *
180      */
181     public static LanguageTag parse(String languageTag, ParseStatus sts) {
182         if (sts == null) {
183             sts = new ParseStatus();
184         } else {
185             sts.reset();
186         }
187 
188         StringTokenIterator itr;
189 
190         // Check if the tag is grandfathered
191         String[] gfmap = GRANDFATHERED.get(LocaleUtils.toLowerString(languageTag));
192         if (gfmap != null) {
193             // use preferred mapping
194             itr = new StringTokenIterator(gfmap[1], SEP);
195         } else {
196             itr = new StringTokenIterator(languageTag, SEP);
197         }
198 
199         LanguageTag tag = new LanguageTag();
200 
201         // langtag must start with either language or privateuse
202         if (tag.parseLanguage(itr, sts)) {
203             tag.parseExtlangs(itr, sts);
204             tag.parseScript(itr, sts);
205             tag.parseRegion(itr, sts);
206             tag.parseVariants(itr, sts);
207             tag.parseExtensions(itr, sts);
208         }
209         tag.parsePrivateuse(itr, sts);
210 
211         if (!itr.isDone() && !sts.isError()) {
212             String s = itr.current();
213             sts.errorIndex = itr.currentStart();
214             if (s.length() == 0) {
215                 sts.errorMsg = "Empty subtag";
216             } else {
217                 sts.errorMsg = "Invalid subtag: " + s;
218             }
219         }
220 
221         return tag;
222     }
223 
224     //
225     // Language subtag parsers
226     //
227 
228     private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) {
229         if (itr.isDone() || sts.isError()) {
230             return false;
231         }
232 
233         boolean found = false;
234 
235         String s = itr.current();
236         if (isLanguage(s)) {
237             found = true;
238             language = s;
239             sts.parseLength = itr.currentEnd();
240             itr.next();
241         }
242 
243         return found;
244     }
245 
246     private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) {
247         if (itr.isDone() || sts.isError()) {
248             return false;
249         }
250 
251         boolean found = false;
252 
253         while (!itr.isDone()) {
254             String s = itr.current();
255             if (!isExtlang(s)) {
256                 break;
257             }
258             found = true;
259             if (extlangs.isEmpty()) {
260                 extlangs = new ArrayList<>(3);
261             }
262             extlangs.add(s);
263             sts.parseLength = itr.currentEnd();
264             itr.next();
265 
266             if (extlangs.size() == 3) {
267                 // Maximum 3 extlangs
268                 break;
269             }
270         }
271 
272         return found;
273     }
274 
275     private boolean parseScript(StringTokenIterator itr, ParseStatus sts) {
276         if (itr.isDone() || sts.isError()) {
277             return false;
278         }
279 
280         boolean found = false;
281 
282         String s = itr.current();
283         if (isScript(s)) {
284             found = true;
285             script = s;
286             sts.parseLength = itr.currentEnd();
287             itr.next();
288         }
289 
290         return found;
291     }
292 
293     private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) {
294         if (itr.isDone() || sts.isError()) {
295             return false;
296         }
297 
298         boolean found = false;
299 
300         String s = itr.current();
301         if (isRegion(s)) {
302             found = true;
303             region = s;
304             sts.parseLength = itr.currentEnd();
305             itr.next();
306         }
307 
308         return found;
309     }
310 
311     private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) {
312         if (itr.isDone() || sts.isError()) {
313             return false;
314         }
315 
316         boolean found = false;
317 
318         while (!itr.isDone()) {
319             String s = itr.current();
320             if (!isVariant(s)) {
321                 break;
322             }
323             found = true;
324             if (variants.isEmpty()) {
325                 variants = new ArrayList<>(3);
326             }
327             variants.add(s);
328             sts.parseLength = itr.currentEnd();
329             itr.next();
330         }
331 
332         return found;
333     }
334 
335     private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) {
336         if (itr.isDone() || sts.isError()) {
337             return false;
338         }
339 
340         boolean found = false;
341 
342         while (!itr.isDone()) {
343             String s = itr.current();
344             if (isExtensionSingleton(s)) {
345                 int start = itr.currentStart();
346                 String singleton = s;
347                 StringBuilder sb = new StringBuilder(singleton);
348 
349                 itr.next();
350                 while (!itr.isDone()) {
351                     s = itr.current();
352                     if (isExtensionSubtag(s)) {
353                         sb.append(SEP).append(s);
354                         sts.parseLength = itr.currentEnd();
355                     } else {
356                         break;
357                     }
358                     itr.next();
359                 }
360 
361                 if (sts.parseLength <= start) {
362                     sts.errorIndex = start;
363                     sts.errorMsg = "Incomplete extension '" + singleton + "'";
364                     break;
365                 }
366 
367                 if (extensions.isEmpty()) {
368                     extensions = new ArrayList<>(4);
369                 }
370                 extensions.add(sb.toString());
371                 found = true;
372             } else {
373                 break;
374             }
375         }
376         return found;
377     }
378 
379     private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) {
380         if (itr.isDone() || sts.isError()) {
381             return false;
382         }
383 
384         boolean found = false;
385 
386         String s = itr.current();
387         if (isPrivateusePrefix(s)) {
388             int start = itr.currentStart();
389             StringBuilder sb = new StringBuilder(s);
390 
391             itr.next();
392             while (!itr.isDone()) {
393                 s = itr.current();
394                 if (!isPrivateuseSubtag(s)) {
395                     break;
396                 }
397                 sb.append(SEP).append(s);
398                 sts.parseLength = itr.currentEnd();
399 
400                 itr.next();
401             }
402 
403             if (sts.parseLength <= start) {
404                 // need at least 1 private subtag
405                 sts.errorIndex = start;
406                 sts.errorMsg = "Incomplete privateuse";
407             } else {
408                 privateuse = sb.toString();
409                 found = true;
410             }
411         }
412 
413         return found;
414     }
415 
416     public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) {
417         LanguageTag tag = new LanguageTag();
418 
419         String language = baseLocale.getLanguage();
420         String script = baseLocale.getScript();
421         String region = baseLocale.getRegion();
422         String variant = baseLocale.getVariant();
423 
424         boolean hasSubtag = false;
425 
426         String privuseVar = null;   // store ill-formed variant subtags
427 
428         if (isLanguage(language)) {
429             // Convert a deprecated language code to its new code
430             if (language.equals("iw")) {
431                 language = "he";
432             } else if (language.equals("ji")) {
433                 language = "yi";
434             } else if (language.equals("in")) {
435                 language = "id";
436             }
437             tag.language = language;
438         }
439 
440         if (isScript(script)) {
441             tag.script = canonicalizeScript(script);
442             hasSubtag = true;
443         }
444 
445         if (isRegion(region)) {
446             tag.region = canonicalizeRegion(region);
447             hasSubtag = true;
448         }
449 
450         // Special handling for no_NO_NY - use nn_NO for language tag
451         if (tag.language.equals("no") && tag.region.equals("NO") && variant.equals("NY")) {
452             tag.language = "nn";
453             variant = "";
454         }
455 
456         if (variant.length() > 0) {
457             List<String> variants = null;
458             StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP);
459             while (!varitr.isDone()) {
460                 String var = varitr.current();
461                 if (!isVariant(var)) {
462                     break;
463                 }
464                 if (variants == null) {
465                     variants = new ArrayList<>();
466                 }
467                 variants.add(var);  // Do not canonicalize!
468                 varitr.next();
469             }
470             if (variants != null) {
471                 tag.variants = variants;
472                 hasSubtag = true;
473             }
474             if (!varitr.isDone()) {
475                 // ill-formed variant subtags
476                 StringBuilder buf = new StringBuilder();
477                 while (!varitr.isDone()) {
478                     String prvv = varitr.current();
479                     if (!isPrivateuseSubtag(prvv)) {
480                         // cannot use private use subtag - truncated
481                         break;
482                     }
483                     if (buf.length() > 0) {
484                         buf.append(SEP);
485                     }
486                     buf.append(prvv);
487                     varitr.next();
488                 }
489                 if (buf.length() > 0) {
490                     privuseVar = buf.toString();
491                 }
492             }
493         }
494 
495         List<String> extensions = null;
496         String privateuse = null;
497 
498         if (localeExtensions != null) {
499             Set<Character> locextKeys = localeExtensions.getKeys();
500             for (Character locextKey : locextKeys) {
501                 Extension ext = localeExtensions.getExtension(locextKey);
502                 if (isPrivateusePrefixChar(locextKey)) {
503                     privateuse = ext.getValue();
504                 } else {
505                     if (extensions == null) {
506                         extensions = new ArrayList<>();
507                     }
508                     extensions.add(locextKey.toString() + SEP + ext.getValue());
509                 }
510             }
511         }
512 
513         if (extensions != null) {
514             tag.extensions = extensions;
515             hasSubtag = true;
516         }
517 
518         // append ill-formed variant subtags to private use
519         if (privuseVar != null) {
520             if (privateuse == null) {
521                 privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar;
522             } else {
523                 privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX
524                              + SEP + privuseVar.replace(BaseLocale.SEP, SEP);
525             }
526         }
527 
528         if (privateuse != null) {
529             tag.privateuse = privateuse;
530         }
531 
532         if (tag.language.length() == 0 && (hasSubtag || privateuse == null)) {
533             // use lang "und" when 1) no language is available AND
534             // 2) any of other subtags other than private use are available or
535             // no private use tag is available
536             tag.language = UNDETERMINED;
537         }
538 
539         return tag;
540     }
541 
542     //
543     // Getter methods for language subtag fields
544     //
545 
546     public String getLanguage() {
547         return language;
548     }
549 
550     public List<String> getExtlangs() {
551         if (extlangs.isEmpty()) {
552             return Collections.emptyList();
553         }
554         return Collections.unmodifiableList(extlangs);
555     }
556 
557     public String getScript() {
558         return script;
559     }
560 
561     public String getRegion() {
562         return region;
563     }
564 
565     public List<String> getVariants() {
566         if (variants.isEmpty()) {
567             return Collections.emptyList();
568         }
569         return Collections.unmodifiableList(variants);
570     }
571 
572     public List<String> getExtensions() {
573         if (extensions.isEmpty()) {
574             return Collections.emptyList();
575         }
576         return Collections.unmodifiableList(extensions);
577     }
578 
579     public String getPrivateuse() {
580         return privateuse;
581     }
582 
583     //
584     // Language subtag syntax checking methods
585     //
586 
587     public static boolean isLanguage(String s) {
588         // language      = 2*3ALPHA            ; shortest ISO 639 code
589         //                 ["-" extlang]       ; sometimes followed by
590         //                                     ;   extended language subtags
591         //               / 4ALPHA              ; or reserved for future use
592         //               / 5*8ALPHA            ; or registered language subtag
593         int len = s.length();
594         return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaString(s);
595     }
596 
597     public static boolean isExtlang(String s) {
598         // extlang       = 3ALPHA              ; selected ISO 639 codes
599         //                 *2("-" 3ALPHA)      ; permanently reserved
600         return (s.length() == 3) && LocaleUtils.isAlphaString(s);
601     }
602 
603     public static boolean isScript(String s) {
604         // script        = 4ALPHA              ; ISO 15924 code
605         return (s.length() == 4) && LocaleUtils.isAlphaString(s);
606     }
607 
608     public static boolean isRegion(String s) {
609         // region        = 2ALPHA              ; ISO 3166-1 code
610         //               / 3DIGIT              ; UN M.49 code
611         return ((s.length() == 2) && LocaleUtils.isAlphaString(s))
612                 || ((s.length() == 3) && LocaleUtils.isNumericString(s));
613     }
614 
615     public static boolean isVariant(String s) {
616         // variant       = 5*8alphanum         ; registered variants
617         //               / (DIGIT 3alphanum)
618         int len = s.length();
619         if (len >= 5 && len <= 8) {
620             return LocaleUtils.isAlphaNumericString(s);
621         }
622         if (len == 4) {
623             return LocaleUtils.isNumeric(s.charAt(0))
624                     && LocaleUtils.isAlphaNumeric(s.charAt(1))
625                     && LocaleUtils.isAlphaNumeric(s.charAt(2))
626                     && LocaleUtils.isAlphaNumeric(s.charAt(3));
627         }
628         return false;
629     }
630 
631     public static boolean isExtensionSingleton(String s) {
632         // singleton     = DIGIT               ; 0 - 9
633         //               / %x41-57             ; A - W
634         //               / %x59-5A             ; Y - Z
635         //               / %x61-77             ; a - w
636         //               / %x79-7A             ; y - z
637 
638         return (s.length() == 1)
639                 && LocaleUtils.isAlphaString(s)
640                 && !LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s);
641     }
642 
643     public static boolean isExtensionSingletonChar(char c) {
644         return isExtensionSingleton(String.valueOf(c));
645     }
646 
647     public static boolean isExtensionSubtag(String s) {
648         // extension     = singleton 1*("-" (2*8alphanum))
649         int len = s.length();
650         return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaNumericString(s);
651     }
652 
653     public static boolean isPrivateusePrefix(String s) {
654         // privateuse    = "x" 1*("-" (1*8alphanum))
655         return (s.length() == 1)
656                 && LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s);
657     }
658 
659     public static boolean isPrivateusePrefixChar(char c) {
660         return (LocaleUtils.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c)));
661     }
662 
663     public static boolean isPrivateuseSubtag(String s) {
664         // privateuse    = "x" 1*("-" (1*8alphanum))
665         int len = s.length();
666         return (len >= 1) && (len <= 8) && LocaleUtils.isAlphaNumericString(s);
667     }
668 
669     //
670     // Language subtag canonicalization methods
671     //
672 
673     public static String canonicalizeLanguage(String s) {
674         return LocaleUtils.toLowerString(s);
675     }
676 
677     public static String canonicalizeExtlang(String s) {
678         return LocaleUtils.toLowerString(s);
679     }
680 
681     public static String canonicalizeScript(String s) {
682         return LocaleUtils.toTitleString(s);
683     }
684 
685     public static String canonicalizeRegion(String s) {
686         return LocaleUtils.toUpperString(s);
687     }
688 
689     public static String canonicalizeVariant(String s) {
690         return LocaleUtils.toLowerString(s);
691     }
692 
693     public static String canonicalizeExtension(String s) {
694         return LocaleUtils.toLowerString(s);
695     }
696 
697     public static String canonicalizeExtensionSingleton(String s) {
698         return LocaleUtils.toLowerString(s);
699     }
700 
701     public static String canonicalizeExtensionSubtag(String s) {
702         return LocaleUtils.toLowerString(s);
703     }
704 
705     public static String canonicalizePrivateuse(String s) {
706         return LocaleUtils.toLowerString(s);
707     }
708 
709     public static String canonicalizePrivateuseSubtag(String s) {
710         return LocaleUtils.toLowerString(s);
711     }
712 
713     @Override
714     public String toString() {
715         StringBuilder sb = new StringBuilder();
716 
717         if (language.length() > 0) {
718             sb.append(language);
719 
720             for (String extlang : extlangs) {
721                 sb.append(SEP).append(extlang);
722             }
723 
724             if (script.length() > 0) {
725                 sb.append(SEP).append(script);
726             }
727 
728             if (region.length() > 0) {
729                 sb.append(SEP).append(region);
730             }
731 
732             for (String variant : variants) {
733                 sb.append(SEP).append(variant);
734             }
735 
736             for (String extension : extensions) {
737                 sb.append(SEP).append(extension);
738             }
739         }
740         if (privateuse.length() > 0) {
741             if (sb.length() > 0) {
742                 sb.append(SEP);
743             }
744             sb.append(privateuse);
745         }
746 
747         return sb.toString();
748     }
749 }