From 64f423eb409bbd77e0c9e59e5cfefdfedb472adc Mon Sep 17 00:00:00 2001
From: "Matt A. Tobin" <email@mattatobin.com>
Date: Mon, 11 Nov 2019 01:48:09 -0500
Subject: Bugs 1437282, 1438590, 1506587, and 1498795

* Strip unnecessary ==== padding in base64 text.
* Extend the set of characters to be encoded (quoted-printable) according to RFC 2047.
* Remove unwanted characters from headers. Compact extraneous white space in display name to avoid sender address spoofing.
* Be more tolerant of spaces in base64-encoded RFC 2047 tokens.

Tag #1273
---
 mailnews/mime/jsmime/jsmime.js | 58 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 5 deletions(-)

(limited to 'mailnews')

diff --git a/mailnews/mime/jsmime/jsmime.js b/mailnews/mime/jsmime/jsmime.js
index 253b5da0f..9e0682428 100644
--- a/mailnews/mime/jsmime/jsmime.js
+++ b/mailnews/mime/jsmime/jsmime.js
@@ -68,6 +68,8 @@ function decode_base64(buffer, more) {
   else
     buffer = '';
   sanitize = sanitize.substring(0, sanitize.length - excess);
+  // Delete all unnecessary '====' in padding.
+  sanitize = sanitize.replace(/(====)+$/g, '');
   // Use the atob function we (ought to) have in global scope.
   return [atob(sanitize), buffer];
 }
@@ -379,8 +381,49 @@ function getHeaderTokens(value, delimiters, opts) {
   // converted to not be one.
   let tokenList = [];
 
-  /// Represents a non-delimiter token
+  // Represents a non-delimiter token.
   function Token(token) {
+    // Replace problematic characters so we don't get unexpected behavior
+    // down the line. These fall into a few categories:
+    // A) "Separator, space" (Zs),
+    // B) "Mark, Nonspacing" (Mn)
+    // C) "Other, Control" (Cc)
+    // D) "Other, Format" (Cf)
+    // Unfortunately, there is no support for the needed regexp Unicode property
+    // escapes in our engine, so we need to hand-roll them. Used the regexpu
+    // tool for that: https://mothereff.in/regexpu.
+    // This should be updated regularly, to take into account new additions
+    // to the Unicode standard. Last updated July 2019.
+    // For a full list of categories, see http://unicode.org/Public//5.0.0/ucd/UCD.html.
+
+    // -- case A: /\p{Zs}/u
+    // https://www.fileformat.info/info/unicode/category/Zs/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Zs}/u&unicodePropertyEscape=1
+    token = token.replace(/[\xA0\u1680\u2000-\u200A\u202F\u205F\u3000]/g, " ");
+
+    // -- case B: /\p{Mn}/u
+    // https://www.fileformat.info/info/unicode/category/Mn/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Mn}/u&unicodePropertyEscape=1
+    // This is a bit more complicated as some of them could be "real", so we'll
+    // only remove the ones that are known to show as blank.
+    token = token.replace(/[\u034F\u17B4\u17B5\u180B-\u180D\uFE00-\uFE0F]/g, "");
+    // \uE0100-\uE01EF need to be written using their surrogate code point pairs
+    // until extended Unicode escapes are supported in regexps.
+    // https://www.fileformat.info/info/unicode/char/e0100/index.htm says \uDB40\uDD00.
+    // https://www.fileformat.info/info/unicode/char/e01ef/index.htm says \uDB40\uDDEF.
+    token = token.replace(/\uDB40[\uDD00-\uDDEF]/g, "");
+
+    // -- case C: /\p{Cc}/u, except Tab/LF/CR
+    // https://www.fileformat.info/info/unicode/category/Cc/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Cc}/u&unicodePropertyEscape=1
+    // eslint-disable-next-line no-control-regex
+    token = token.replace(/(?![\t\n\r])[\0-\x1F\x7F-\x9F]/g, "");
+
+    // -- case D: /\p{Cf}/u
+    // https://www.fileformat.info/info/unicode/category/Cf/list.htm
+    // https://mothereff.in/regexpu#input=/\p{Cf}/u&unicodePropertyEscape=1
+    // Remove all of these except for \u0600-\u0605.
+    token = token.replace(/(?:[\xAD\u061C\u06DD\u070F\u08E2\u180E\u200B-\u200F\u202A-\u202E\u2060-\u2064\u2066-\u206F\uFEFF\uFFF9-\uFFFB]|\uD804[\uDCBD\uDCCD]|\uD80D[\uDC30-\uDC38]|\uD82F[\uDCA0-\uDCA3]|\uD834[\uDD73-\uDD7A]|\uDB40[\uDC01\uDC20-\uDC7F])/g, "");
     // Unescape all quoted pairs. Any trailing \ is deleted.
     this.token = token.replace(/\\(.?)/g, "$1");
   }
@@ -634,7 +677,7 @@ function decodeRFC2047Words(headerValue) {
     if (encoding == 'B' || encoding == 'b') {
       // Decode base64. If there's any non-base64 data, treat the string as
       // an illegal token.
-      if (/[^A-Za-z0-9+\/=]/.exec(text))
+      if (/[^ A-Za-z0-9+\/=]/.exec(text))
         return false;
 
       // Decode the string
@@ -819,13 +862,18 @@ function parseAddressingHeader(header, doRFC2047) {
                  addrSpec.substring(addrSpec.lastIndexOf("@"));
     }
 
+    // Replace all whitespace characters with a single whitespace,
+    // to avoid consecutive whitespace and also to normalize tabs and newlines.
+    displayName = displayName.replace(/\s+/g, " ").trim();
+
     if (displayName === '' && lastComment !== '') {
       // Take last comment content as the display-name.
       let offset = lastComment[0] === ' ' ? 2 : 1;
       displayName = lastComment.substr(offset, lastComment.length - offset - 1);
     }
-    if (displayName !== '' || addrSpec !== '')
+    if (displayName !== '' || addrSpec !== '') {
       addrlist.push({name: displayName, email: addrSpec});
+    }
     // Clear pending flags and variables.
     name = localPart = address = lastComment = '';
     inAngle = inComment = needsSpace = false;
@@ -921,7 +969,7 @@ function parseAddressingHeader(header, doRFC2047) {
 
       // Ignore the needs space if we're a "close" delimiter token.
       let spacedToken = token;
-      if (needsSpace && token.toString()[0] != '.')
+      if (needsSpace && (token.toString().length > 0) && token.toString()[0] != ".")
         spacedToken = ' ' + spacedToken;
 
       // Which field do we add this data to?
@@ -2871,7 +2919,7 @@ var nonAsciiRe = /[^\x20-\x7e]/;
 var b64Prelude = "=?UTF-8?B?", qpPrelude = "=?UTF-8?Q?";
 
 /// A list of ASCII characters forbidden in RFC 2047 encoded-words
-var qpForbidden = "=?_()\",";
+var qpForbidden = "\"#$%&'(),.:;<=>?@[\\]^_`{|}~";
 
 var hexString = "0123456789abcdef";
 
-- 
cgit v1.2.3