idna/
uts46.rs

1// Copyright 2013-2014 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! [*Unicode IDNA Compatibility Processing*
10//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12use self::Mapping::*;
13use crate::punycode;
14
15use alloc::string::String;
16use core::fmt;
17use unicode_bidi::{bidi_class, BidiClass};
18use unicode_normalization::char::is_combining_mark;
19use unicode_normalization::{is_nfc, UnicodeNormalization};
20
21include!("uts46_mapping_table.rs");
22
/// ASCII prefix that marks a label as Punycode-encoded.
const PUNYCODE_PREFIX: &str = "xn--";
24
/// Byte range of one replacement string inside `STRING_TABLE`.
///
/// The start offset is kept as two separate `u8` halves (rather than one
/// `u16`) so the structure has an alignment of 1 and thus packs better into
/// the `Mapping` enum, below.
#[derive(Debug)]
struct StringTableSlice {
    /// Low 8 bits of the starting byte offset.
    byte_start_lo: u8,
    /// High 8 bits of the starting byte offset.
    byte_start_hi: u8,
    /// Length of the string in bytes.
    byte_len: u8,
}
33
34fn decode_slice(slice: &StringTableSlice) -> &'static str {
35    let lo = slice.byte_start_lo as usize;
36    let hi = slice.byte_start_hi as usize;
37    let start = (hi << 8) | lo;
38    let len = slice.byte_len as usize;
39    &STRING_TABLE[start..(start + len)]
40}
41
42#[repr(u8)]
43#[derive(Debug)]
44enum Mapping {
45    Valid,
46    Ignored,
47    Mapped(StringTableSlice),
48    Deviation(StringTableSlice),
49    Disallowed,
50    DisallowedStd3Valid,
51    DisallowedStd3Mapped(StringTableSlice),
52    DisallowedIdna2008,
53}
54
55fn find_char(codepoint: char) -> &'static Mapping {
56    let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
57        Ok(idx) => idx,
58        Err(idx) => idx - 1,
59    };
60
61    const SINGLE_MARKER: u16 = 1 << 15;
62
63    let (base, x) = TABLE[idx];
64    let single = (x & SINGLE_MARKER) != 0;
65    let offset = !SINGLE_MARKER & x;
66
67    if single {
68        &MAPPING_TABLE[offset as usize]
69    } else {
70        &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
71    }
72}
73
74struct Mapper<'a> {
75    chars: core::str::Chars<'a>,
76    config: Config,
77    errors: &'a mut Errors,
78    slice: Option<core::str::Chars<'static>>,
79}
80
81impl<'a> Iterator for Mapper<'a> {
82    type Item = char;
83
84    fn next(&mut self) -> Option<Self::Item> {
85        loop {
86            if let Some(s) = &mut self.slice {
87                match s.next() {
88                    Some(c) => return Some(c),
89                    None => {
90                        self.slice = None;
91                    }
92                }
93            }
94
95            let codepoint = self.chars.next()?;
96            if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
97                return Some(codepoint);
98            }
99
100            return Some(match *find_char(codepoint) {
101                Mapping::Valid => codepoint,
102                Mapping::Ignored => continue,
103                Mapping::Mapped(ref slice) => {
104                    self.slice = Some(decode_slice(slice).chars());
105                    continue;
106                }
107                Mapping::Deviation(ref slice) => {
108                    if self.config.transitional_processing {
109                        self.slice = Some(decode_slice(slice).chars());
110                        continue;
111                    } else {
112                        codepoint
113                    }
114                }
115                Mapping::Disallowed => {
116                    self.errors.disallowed_character = true;
117                    codepoint
118                }
119                Mapping::DisallowedStd3Valid => {
120                    if self.config.use_std3_ascii_rules {
121                        self.errors.disallowed_by_std3_ascii_rules = true;
122                    };
123                    codepoint
124                }
125                Mapping::DisallowedStd3Mapped(ref slice) => {
126                    if self.config.use_std3_ascii_rules {
127                        self.errors.disallowed_mapped_in_std3 = true;
128                    };
129                    self.slice = Some(decode_slice(slice).chars());
130                    continue;
131                }
132                Mapping::DisallowedIdna2008 => {
133                    if self.config.use_idna_2008_rules {
134                        self.errors.disallowed_in_idna_2008 = true;
135                    }
136                    codepoint
137                }
138            });
139        }
140    }
141}
142
143// http://tools.ietf.org/html/rfc5893#section-2
144fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
145    // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label.  A label
146    // is RTL if it contains at least one character of bidi class R, AL or AN.
147    if !is_bidi_domain {
148        return true;
149    }
150
151    let mut chars = label.chars();
152    let first_char_class = match chars.next() {
153        Some(c) => bidi_class(c),
154        None => return true, // empty string
155    };
156
157    match first_char_class {
158        // LTR label
159        BidiClass::L => {
160            // Rule 5
161            for c in chars.by_ref() {
162                if !matches!(
163                    bidi_class(c),
164                    BidiClass::L
165                        | BidiClass::EN
166                        | BidiClass::ES
167                        | BidiClass::CS
168                        | BidiClass::ET
169                        | BidiClass::ON
170                        | BidiClass::BN
171                        | BidiClass::NSM
172                ) {
173                    return false;
174                }
175            }
176
177            // Rule 6
178            // must end in L or EN followed by 0 or more NSM
179            let mut rev_chars = label.chars().rev();
180            let mut last_non_nsm = rev_chars.next();
181            loop {
182                match last_non_nsm {
183                    Some(c) if bidi_class(c) == BidiClass::NSM => {
184                        last_non_nsm = rev_chars.next();
185                        continue;
186                    }
187                    _ => {
188                        break;
189                    }
190                }
191            }
192            match last_non_nsm {
193                Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
194                Some(_) => {
195                    return false;
196                }
197                _ => {}
198            }
199        }
200
201        // RTL label
202        BidiClass::R | BidiClass::AL => {
203            let mut found_en = false;
204            let mut found_an = false;
205
206            // Rule 2
207            for c in chars {
208                let char_class = bidi_class(c);
209                if char_class == BidiClass::EN {
210                    found_en = true;
211                } else if char_class == BidiClass::AN {
212                    found_an = true;
213                }
214
215                if !matches!(
216                    char_class,
217                    BidiClass::R
218                        | BidiClass::AL
219                        | BidiClass::AN
220                        | BidiClass::EN
221                        | BidiClass::ES
222                        | BidiClass::CS
223                        | BidiClass::ET
224                        | BidiClass::ON
225                        | BidiClass::BN
226                        | BidiClass::NSM
227                ) {
228                    return false;
229                }
230            }
231            // Rule 3
232            let mut rev_chars = label.chars().rev();
233            let mut last = rev_chars.next();
234            loop {
235                // must end in L or EN followed by 0 or more NSM
236                match last {
237                    Some(c) if bidi_class(c) == BidiClass::NSM => {
238                        last = rev_chars.next();
239                        continue;
240                    }
241                    _ => {
242                        break;
243                    }
244                }
245            }
246            match last {
247                Some(c)
248                    if matches!(
249                        bidi_class(c),
250                        BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
251                    ) => {}
252                _ => {
253                    return false;
254                }
255            }
256
257            // Rule 4
258            if found_an && found_en {
259                return false;
260            }
261        }
262
263        // Rule 1: Should start with L or R/AL
264        _ => {
265            return false;
266        }
267    }
268
269    true
270}
271
272/// Check the validity criteria for the given label
273///
274/// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
275///
276/// http://www.unicode.org/reports/tr46/#Validity_Criteria
277fn check_validity(label: &str, config: Config, errors: &mut Errors) {
278    let first_char = label.chars().next();
279    if first_char.is_none() {
280        // Empty string, pass
281        return;
282    }
283
284    // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
285    //
286    // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
287    // third and fourth positions. But nobody follows this criteria. See the spec issue below:
288    // https://github.com/whatwg/url/issues/53
289
290    // V3: neither begin nor end with a U+002D HYPHEN-MINUS
291    if config.check_hyphens && (label.starts_with('-') || label.ends_with('-')) {
292        errors.check_hyphens = true;
293        return;
294    }
295
296    // V4: not contain a U+002E FULL STOP
297    //
298    // Here, label can't contain '.' since the input is from .split('.')
299
300    // V5: not begin with a GC=Mark
301    if is_combining_mark(first_char.unwrap()) {
302        errors.start_combining_mark = true;
303        return;
304    }
305
306    // V6: Check against Mapping Table
307    if label.chars().any(|c| match *find_char(c) {
308        Mapping::Valid | Mapping::DisallowedIdna2008 => false,
309        Mapping::Deviation(_) => config.transitional_processing,
310        Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
311        _ => true,
312    }) {
313        errors.invalid_mapping = true;
314    }
315
316    // V7: ContextJ rules
317    //
318    // TODO: Implement rules and add *CheckJoiners* flag.
319
320    // V8: Bidi rules are checked inside `processing()`
321}
322
// Detect simple cases: a non-empty domain made up solely of lowercase ASCII
// letters, ASCII digits and label-separating dots.
//
// A hyphen is neither a lowercase letter nor a digit, so any domain containing
// one is reported as not simple. That also means no simple domain can have a
// label that starts with PUNYCODE_PREFIX or that starts or ends with a hyphen.
fn is_simple(domain: &str) -> bool {
    !domain.is_empty()
        && domain
            .chars()
            .all(|c| c == '.' || c.is_ascii_lowercase() || c.is_ascii_digit())
}
357
358/// http://www.unicode.org/reports/tr46/#Processing
359fn processing(
360    domain: &str,
361    config: Config,
362    normalized: &mut String,
363    output: &mut String,
364) -> Errors {
365    normalized.clear();
366    let mut errors = Errors::default();
367    let offset = output.len();
368
369    let iter = Mapper {
370        chars: domain.chars(),
371        config,
372        errors: &mut errors,
373        slice: None,
374    };
375
376    normalized.extend(iter.nfc());
377
378    let mut decoder = punycode::Decoder::default();
379    let non_transitional = config.transitional_processing(false);
380    let (mut first, mut has_bidi_labels) = (true, false);
381    for label in normalized.split('.') {
382        if !first {
383            output.push('.');
384        }
385        first = false;
386        if let Some(remainder) = label.strip_prefix(PUNYCODE_PREFIX) {
387            match decoder.decode(remainder) {
388                Ok(decode) => {
389                    let start = output.len();
390                    output.extend(decode);
391                    let decoded_label = &output[start..];
392
393                    if !has_bidi_labels {
394                        has_bidi_labels |= is_bidi_domain(decoded_label);
395                    }
396
397                    if !errors.is_err() {
398                        if !is_nfc(decoded_label) {
399                            errors.nfc = true;
400                        } else {
401                            check_validity(decoded_label, non_transitional, &mut errors);
402                        }
403                    }
404                }
405                Err(()) => {
406                    has_bidi_labels = true;
407                    errors.punycode = true;
408                }
409            }
410        } else {
411            if !has_bidi_labels {
412                has_bidi_labels |= is_bidi_domain(label);
413            }
414
415            // `normalized` is already `NFC` so we can skip that check
416            check_validity(label, config, &mut errors);
417            output.push_str(label)
418        }
419    }
420
421    for label in output[offset..].split('.') {
422        // V8: Bidi rules
423        //
424        // TODO: Add *CheckBidi* flag
425        if !passes_bidi(label, has_bidi_labels) {
426            errors.check_bidi = true;
427            break;
428        }
429    }
430
431    errors
432}
433
434#[derive(Default)]
435pub struct Idna {
436    config: Config,
437    normalized: String,
438    output: String,
439}
440
441impl Idna {
442    pub fn new(config: Config) -> Self {
443        Self {
444            config,
445            normalized: String::new(),
446            output: String::new(),
447        }
448    }
449
450    pub fn to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors {
451        if is_simple(domain) {
452            out.push_str(domain);
453            return Errors::default();
454        }
455        let mut errors = processing(domain, self.config, &mut self.normalized, out);
456        self.output = core::mem::replace(out, String::with_capacity(out.len()));
457        let mut first = true;
458        for label in self.output.split('.') {
459            if !first {
460                out.push('.');
461            }
462            first = false;
463
464            if label.is_ascii() {
465                out.push_str(label);
466            } else {
467                let offset = out.len();
468                out.push_str(PUNYCODE_PREFIX);
469                if let Err(()) = punycode::encode_into(label.chars(), out) {
470                    errors.punycode = true;
471                    out.truncate(offset);
472                }
473            }
474        }
475        errors
476    }
477
478    /// http://www.unicode.org/reports/tr46/#ToASCII
479    #[allow(clippy::wrong_self_convention)]
480    pub fn to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
481        let mut errors = self.to_ascii_inner(domain, out);
482
483        if self.config.verify_dns_length {
484            let domain = if out.ends_with('.') {
485                &out[..out.len() - 1]
486            } else {
487                &*out
488            };
489            if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
490                errors.too_short_for_dns = true;
491            }
492            if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
493                errors.too_long_for_dns = true;
494            }
495        }
496
497        errors.into()
498    }
499
500    /// http://www.unicode.org/reports/tr46/#ToUnicode
501    #[allow(clippy::wrong_self_convention)]
502    pub fn to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
503        if is_simple(domain) {
504            out.push_str(domain);
505            return Errors::default().into();
506        }
507        processing(domain, self.config, &mut self.normalized, out).into()
508    }
509}
510
/// Flags that control UTS #46 processing.
///
/// Values are built by chaining the setter methods on `Config::default()`.
#[derive(Clone, Copy)]
#[must_use]
pub struct Config {
    /// Record characters that are disallowed only under the STD3 ASCII rules.
    use_std3_ascii_rules: bool,
    /// Apply the mapping of deviation characters instead of keeping them.
    transitional_processing: bool,
    /// Check the DNS length limits in `to_ascii`.
    verify_dns_length: bool,
    /// Reject labels that begin or end with a hyphen.
    check_hyphens: bool,
    /// Record characters that are disallowed only under IDNA 2008.
    use_idna_2008_rules: bool,
}
520
521/// The defaults are that of https://url.spec.whatwg.org/#idna
522impl Default for Config {
523    fn default() -> Self {
524        Config {
525            use_std3_ascii_rules: false,
526            transitional_processing: false,
527            check_hyphens: false,
528            // check_bidi: true,
529            // check_joiners: true,
530
531            // Only use for to_ascii, not to_unicode
532            verify_dns_length: false,
533            use_idna_2008_rules: false,
534        }
535    }
536}
537
538impl Config {
539    #[inline]
540    pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
541        self.use_std3_ascii_rules = value;
542        self
543    }
544
545    #[inline]
546    pub fn transitional_processing(mut self, value: bool) -> Self {
547        self.transitional_processing = value;
548        self
549    }
550
551    #[inline]
552    pub fn verify_dns_length(mut self, value: bool) -> Self {
553        self.verify_dns_length = value;
554        self
555    }
556
557    #[inline]
558    pub fn check_hyphens(mut self, value: bool) -> Self {
559        self.check_hyphens = value;
560        self
561    }
562
563    #[inline]
564    pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
565        self.use_idna_2008_rules = value;
566        self
567    }
568
569    /// http://www.unicode.org/reports/tr46/#ToASCII
570    pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
571        let mut result = String::with_capacity(domain.len());
572        let mut codec = Idna::new(self);
573        codec.to_ascii(domain, &mut result).map(|()| result)
574    }
575
576    /// http://www.unicode.org/reports/tr46/#ToUnicode
577    pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
578        let mut codec = Idna::new(self);
579        let mut out = String::with_capacity(domain.len());
580        let result = codec.to_unicode(domain, &mut out);
581        (out, result)
582    }
583}
584
585fn is_bidi_domain(s: &str) -> bool {
586    for c in s.chars() {
587        if c.is_ascii_graphic() {
588            continue;
589        }
590        match bidi_class(c) {
591            BidiClass::R | BidiClass::AL | BidiClass::AN => return true,
592            _ => {}
593        }
594    }
595    false
596}
597
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    /// A label could not be Punycode-decoded or -encoded.
    punycode: bool,
    /// A label begins or ends with a hyphen while hyphen checking is on.
    check_hyphens: bool,
    /// A label fails the Bidi rules.
    check_bidi: bool,
    /// A label begins with a combining mark.
    start_combining_mark: bool,
    /// A label contains a character whose mapping status is not acceptable.
    invalid_mapping: bool,
    /// A decoded Punycode label is not in NFC.
    nfc: bool,
    /// A character valid only outside the STD3 ASCII rules was seen with those rules on.
    disallowed_by_std3_ascii_rules: bool,
    /// A character mapped only outside the STD3 ASCII rules was seen with those rules on.
    disallowed_mapped_in_std3: bool,
    /// A disallowed character was seen.
    disallowed_character: bool,
    /// The domain or one of its labels exceeds the DNS length limits.
    too_long_for_dns: bool,
    /// The domain or one of its labels is empty.
    too_short_for_dns: bool,
    /// A character disallowed in IDNA 2008 was seen with those rules on.
    disallowed_in_idna_2008: bool,
}
617
618impl Errors {
619    fn is_err(&self) -> bool {
620        let Errors {
621            punycode,
622            check_hyphens,
623            check_bidi,
624            start_combining_mark,
625            invalid_mapping,
626            nfc,
627            disallowed_by_std3_ascii_rules,
628            disallowed_mapped_in_std3,
629            disallowed_character,
630            too_long_for_dns,
631            too_short_for_dns,
632            disallowed_in_idna_2008,
633        } = *self;
634        punycode
635            || check_hyphens
636            || check_bidi
637            || start_combining_mark
638            || invalid_mapping
639            || nfc
640            || disallowed_by_std3_ascii_rules
641            || disallowed_mapped_in_std3
642            || disallowed_character
643            || too_long_for_dns
644            || too_short_for_dns
645            || disallowed_in_idna_2008
646    }
647}
648
649impl fmt::Debug for Errors {
650    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
651        let Errors {
652            punycode,
653            check_hyphens,
654            check_bidi,
655            start_combining_mark,
656            invalid_mapping,
657            nfc,
658            disallowed_by_std3_ascii_rules,
659            disallowed_mapped_in_std3,
660            disallowed_character,
661            too_long_for_dns,
662            too_short_for_dns,
663            disallowed_in_idna_2008,
664        } = *self;
665
666        let fields = [
667            ("punycode", punycode),
668            ("check_hyphens", check_hyphens),
669            ("check_bidi", check_bidi),
670            ("start_combining_mark", start_combining_mark),
671            ("invalid_mapping", invalid_mapping),
672            ("nfc", nfc),
673            (
674                "disallowed_by_std3_ascii_rules",
675                disallowed_by_std3_ascii_rules,
676            ),
677            ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
678            ("disallowed_character", disallowed_character),
679            ("too_long_for_dns", too_long_for_dns),
680            ("too_short_for_dns", too_short_for_dns),
681            ("disallowed_in_idna_2008", disallowed_in_idna_2008),
682        ];
683
684        let mut empty = true;
685        f.write_str("Errors { ")?;
686        for (name, val) in &fields {
687            if *val {
688                if !empty {
689                    f.write_str(", ")?;
690                }
691                f.write_str(name)?;
692                empty = false;
693            }
694        }
695
696        if !empty {
697            f.write_str(" }")
698        } else {
699            f.write_str("}")
700        }
701    }
702}
703
704impl From<Errors> for Result<(), Errors> {
705    fn from(e: Errors) -> Result<(), Errors> {
706        if !e.is_err() {
707            Ok(())
708        } else {
709            Err(e)
710        }
711    }
712}
713
/// Lets `Errors` be used as a standard error type; only built with the `std` feature.
#[cfg(feature = "std")]
impl std::error::Error for Errors {}
716
717impl fmt::Display for Errors {
718    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719        fmt::Debug::fmt(self, f)
720    }
721}
722
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// The characters that `Mapper` passes through without a table lookup
    /// must be `Valid` in the table, so skipping the lookup changes nothing.
    #[test]
    fn mapping_fast_path() {
        let fast_path = ['-', '.']
            .iter()
            .copied()
            .chain('0'..='9')
            .chain('a'..='z');
        for c in fast_path {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}