diff options
Diffstat (limited to 'third_party/rust/idna/src/uts46.rs')
-rw-r--r-- | third_party/rust/idna/src/uts46.rs | 322 |
1 files changed, 322 insertions, 0 deletions
diff --git a/third_party/rust/idna/src/uts46.rs b/third_party/rust/idna/src/uts46.rs new file mode 100644 index 000000000..5f230e0e4 --- /dev/null +++ b/third_party/rust/idna/src/uts46.rs @@ -0,0 +1,322 @@ +// Copyright 2013-2014 Valentin Gosu. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! [*Unicode IDNA Compatibility Processing* +//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) + +use self::Mapping::*; +use punycode; +use std::ascii::AsciiExt; +use unicode_normalization::UnicodeNormalization; +use unicode_normalization::char::is_combining_mark; +use unicode_bidi::{BidiClass, bidi_class}; + +include!("uts46_mapping_table.rs"); + +#[derive(Debug)] +enum Mapping { + Valid, + Ignored, + Mapped(&'static str), + Deviation(&'static str), + Disallowed, + DisallowedStd3Valid, + DisallowedStd3Mapped(&'static str), +} + +struct Range { + from: char, + to: char, + mapping: Mapping, +} + +fn find_char(codepoint: char) -> &'static Mapping { + let mut min = 0; + let mut max = TABLE.len() - 1; + while max > min { + let mid = (min + max) >> 1; + if codepoint > TABLE[mid].to { + min = mid; + } else if codepoint < TABLE[mid].from { + max = mid; + } else { + min = mid; + max = mid; + } + } + &TABLE[min].mapping +} + +fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) { + match *find_char(codepoint) { + Mapping::Valid => output.push(codepoint), + Mapping::Ignored => {}, + Mapping::Mapped(mapping) => output.push_str(mapping), + Mapping::Deviation(mapping) => { + if flags.transitional_processing { + output.push_str(mapping) + } else { + output.push(codepoint) + } + } + Mapping::Disallowed => { + errors.push(Error::DissallowedCharacter); + output.push(codepoint); + } + Mapping::DisallowedStd3Valid => { + if flags.use_std3_ascii_rules { + errors.push(Error::DissallowedByStd3AsciiRules); + } + output.push(codepoint) + } + Mapping::DisallowedStd3Mapped(mapping) => { + if flags.use_std3_ascii_rules { + errors.push(Error::DissallowedMappedInStd3); + } + output.push_str(mapping) + } + } +} + +// http://tools.ietf.org/html/rfc5893#section-2 +fn passes_bidi(label: &str, transitional_processing: bool) -> bool { + let mut chars = label.chars(); + let class = match chars.next() { + Some(c) => bidi_class(c), + None => return true, // empty string + }; + + if class == BidiClass::L + || (class == BidiClass::ON && transitional_processing) // starts with \u200D + || (class == BidiClass::ES && transitional_processing) // hack: 1.35.+33.49 + || class == BidiClass::EN // hack: starts with number 0à.\u05D0 + { // LTR + // Rule 5 + loop { + match chars.next() { + Some(c) => { + let c = bidi_class(c); + if !matches!(c, BidiClass::L | BidiClass::EN | + BidiClass::ES | BidiClass::CS | + BidiClass::ET | BidiClass::ON | + BidiClass::BN | BidiClass::NSM) { + return false; + } + }, + None => { break; }, + } + } + + // Rule 6 + let mut rev_chars = label.chars().rev(); + let mut last = rev_chars.next(); + loop { // must end in L or EN followed by 0 or more NSM + match last { + Some(c) if bidi_class(c) == BidiClass::NSM => { + last = rev_chars.next(); + continue; + } + _ => { break; }, + } + } + + // TODO: does not pass for àˇ.\u05D0 + // match last { + // Some(c) if bidi_class(c) == BidiClass::L + // || bidi_class(c) == BidiClass::EN => {}, + // Some(c) => { return false; }, + // _ => {} + // } + + } else if class == BidiClass::R || class == BidiClass::AL { // RTL + let mut found_en = false; + let mut found_an = false; + + // Rule 2 + loop { + match chars.next() { + Some(c) => { + let char_class = bidi_class(c); + + if char_class == BidiClass::EN { + found_en = true; + } + if char_class == BidiClass::AN { + found_an = true; + } + + if !matches!(char_class, BidiClass::R | BidiClass::AL | + BidiClass::AN | BidiClass::EN | + BidiClass::ES | BidiClass::CS | + BidiClass::ET | BidiClass::ON | + BidiClass::BN | BidiClass::NSM) { + return false; + } + }, + None => { break; }, + } + } + // Rule 3 + let mut rev_chars = label.chars().rev(); + let mut last = rev_chars.next(); + loop { // must end in L or EN followed by 0 or more NSM + match last { + Some(c) if bidi_class(c) == BidiClass::NSM => { + last = rev_chars.next(); + continue; + } + _ => { break; }, + } + } + match last { + Some(c) if matches!(bidi_class(c), BidiClass::R | BidiClass::AL | + BidiClass::EN | BidiClass::AN) => {}, + _ => { return false; } + } + + // Rule 4 + if found_an && found_en { + return false; + } + } else { + // Rule 2: Should start with L or R/AL + return false; + } + + return true; +} + +/// http://www.unicode.org/reports/tr46/#Validity_Criteria +fn validate(label: &str, flags: Flags, errors: &mut Vec<Error>) { + if label.nfc().ne(label.chars()) { + errors.push(Error::ValidityCriteria); + } + + // Can not contain '.' since the input is from .split('.') + if { + let mut chars = label.chars().skip(2); + let third = chars.next(); + let fourth = chars.next(); + (third, fourth) == (Some('-'), Some('-')) + } || label.starts_with("-") + || label.ends_with("-") + || label.chars().next().map_or(false, is_combining_mark) + || label.chars().any(|c| match *find_char(c) { + Mapping::Valid => false, + Mapping::Deviation(_) => flags.transitional_processing, + Mapping::DisallowedStd3Valid => flags.use_std3_ascii_rules, + _ => true, + }) + || !passes_bidi(label, flags.transitional_processing) + { + errors.push(Error::ValidityCriteria) + } +} + +/// http://www.unicode.org/reports/tr46/#Processing +fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String { + let mut mapped = String::new(); + for c in domain.chars() { + map_char(c, flags, &mut mapped, errors) + } + let normalized: String = mapped.nfc().collect(); + let mut validated = String::new(); + for label in normalized.split('.') { + if validated.len() > 0 { + validated.push('.'); + } + if label.starts_with("xn--") { + match punycode::decode_to_string(&label["xn--".len()..]) { + Some(decoded_label) => { + let flags = Flags { transitional_processing: false, ..flags }; + validate(&decoded_label, flags, errors); + validated.push_str(&decoded_label) + } + None => errors.push(Error::PunycodeError) + } + } else { + validate(label, flags, errors); + validated.push_str(label) + } + } + validated +} + +#[derive(Copy, Clone)] +pub struct Flags { + pub use_std3_ascii_rules: bool, + pub transitional_processing: bool, + pub verify_dns_length: bool, +} + +#[derive(PartialEq, Eq, Clone, Copy, Debug)] +enum Error { + PunycodeError, + ValidityCriteria, + DissallowedByStd3AsciiRules, + DissallowedMappedInStd3, + DissallowedCharacter, + TooLongForDns, +} + +/// Errors recorded during UTS #46 processing. +/// +/// This is opaque for now, only indicating the precense of at least one error. +/// More details may be exposed in the future. +#[derive(Debug)] +pub struct Errors(Vec<Error>); + +/// http://www.unicode.org/reports/tr46/#ToASCII +pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> { + let mut errors = Vec::new(); + let mut result = String::new(); + for label in processing(domain, flags, &mut errors).split('.') { + if result.len() > 0 { + result.push('.'); + } + if label.is_ascii() { + result.push_str(label); + } else { + match punycode::encode_str(label) { + Some(x) => { + result.push_str("xn--"); + result.push_str(&x); + }, + None => errors.push(Error::PunycodeError) + } + } + } + + if flags.verify_dns_length { + let domain = if result.ends_with(".") { &result[..result.len()-1] } else { &*result }; + if domain.len() < 1 || domain.len() > 253 || + domain.split('.').any(|label| label.len() < 1 || label.len() > 63) { + errors.push(Error::TooLongForDns) + } + } + if errors.is_empty() { + Ok(result) + } else { + Err(Errors(errors)) + } +} + +/// http://www.unicode.org/reports/tr46/#ToUnicode +/// +/// Only `use_std3_ascii_rules` is used in `flags`. +pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) { + flags.transitional_processing = false; + let mut errors = Vec::new(); + let domain = processing(domain, flags, &mut errors); + let errors = if errors.is_empty() { + Ok(()) + } else { + Err(Errors(errors)) + }; + (domain, errors) +} |