1 files changed, 520 insertions, 0 deletions
diff --git a/mailnews/import/outlook/src/rtfDecoder.cpp b/mailnews/import/outlook/src/rtfDecoder.cpp
new file mode 100644
index 000000000..837beec0b
--- /dev/null
+++ b/mailnews/import/outlook/src/rtfDecoder.cpp
@@ -0,0 +1,520 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stack>
+#include <map>
+#include <sstream>
+#include "Windows.h"
+#include "rtfDecoder.h"
+
+#define SIZEOF(x) (sizeof(x)/sizeof((x)[0])) // Number of elements of an array (not usable with a pointer)
+#define IS_DIGIT(i)   ((i) >= '0' && (i) <= '9') // True for the characters '0'..'9'
+#define IS_ALPHA(VAL) (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z')) // True for 'a'..'z' and 'A'..'Z'
+
+// Converts one hexadecimal digit (upper or lower case) to its value 0..15.
+// Any character that is not a hexadecimal digit yields 0.
+inline int HexToInt(char ch)
+{
+  if (ch >= '0' && ch <= '9')
+    return ch - '0';
+  if (ch >= 'A' && ch <= 'F')
+    return ch - 'A' + 10;
+  if (ch >= 'a' && ch <= 'f')
+    return ch - 'a' + 10;
+  // Not a hexadecimal digit
+  return 0;
+}
+
+// Maps an RTF \fcharsetN value to a Windows code page number. Charsets that
+// are not in the table yield CP_ACP. No code page is known for these (old)
+// charsets: 82 Mac Johab, 179 Arabic Traditional, 180 Arabic user, 181 Hebrew user.
+inline int CharsetToCP(int charset)
+{
+  static const int table[][2] = {
+    {0,   1252},  // ANSI
+    {1,   0},     // Default
+    {2,   1252},  // Symbol (1252 is used instead of code page 42)
+    {77,  10000}, // Mac Roman
+    {78,  10001}, // Mac Shift Jis
+    {79,  10003}, // Mac Hangul
+    {80,  10008}, // Mac GB2312
+    {81,  10002}, // Mac Big5
+    {83,  10005}, // Mac Hebrew
+    {84,  10004}, // Mac Arabic
+    {85,  10006}, // Mac Greek
+    {86,  10081}, // Mac Turkish
+    {87,  10021}, // Mac Thai
+    {88,  10029}, // Mac East Europe
+    {89,  10007}, // Mac Russian
+    {128, 932},   // Shift JIS
+    {129, 949},   // Hangul
+    {130, 1361},  // Johab
+    {134, 936},   // GB2312
+    {136, 950},   // Big5
+    {161, 1253},  // Greek
+    {162, 1254},  // Turkish
+    {163, 1258},  // Vietnamese
+    {177, 1255},  // Hebrew
+    {178, 1256},  // Arabic
+    {186, 1257},  // Baltic
+    {204, 1251},  // Russian
+    {222, 874},   // Thai
+    {238, 1250},  // Eastern European
+    {254, 437},   // PC 437
+    {255, 850},   // OEM
+  };
+  for (unsigned int i = 0; i < SIZEOF(table); ++i)
+    if (table[i][0] == charset)
+      return table[i][1];
+  return CP_ACP;
+}
+
+// Code page information collected for one entry of the font table.
+struct FontInfo {
+  enum Options { has_fcharset = 0x0001, has_cpg = 0x0002 }; // bits of `options`
+  unsigned int options; // tells which of the two fields below were set
+  int fcharset;         // value of \fcharsetN
+  unsigned int cpg;     // value of \cpgN
+  FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {}
+  // Code page of the font: \cpg wins over \fcharset; 0xFFFFFFFF if neither is set.
+  unsigned int Codepage()
+  {
+    if ((options & has_cpg) != 0)
+      return cpg;
+    return ((options & has_fcharset) != 0) ? static_cast<unsigned int>(CharsetToCP(fcharset))
+                                           : 0xFFFFFFFF;
+  }
+};
+typedef std::map<int, FontInfo> Fonttbl; // font number (\fN) -> font information
+
+struct LocalState {
+  bool fonttbl;         // Set by \fonttbl: fonts are being defined in this group
+  int f;                // Index of the font being defined/used (-1 if none); defines the codepage if no \cpg
+  unsigned int uc;      // \ucN keyword value; its default is 1
+  unsigned int codepage;// Codepage defined by \cpg for this group; 0xFFFFFFFF if not set
+};
+typedef std::stack<LocalState> StateStack; // A copy of the top is pushed at group begin and popped at group end
+
+// Parser-wide state: the input stream, the font table, the stack of per-group
+// states, and the buffer that accumulates a run of 8-bit text (PCDATA).
+struct GlobalState {
+  enum Pcdata_state { pcdsno, pcdsin, pcdsfinished }; // no run / inside a run / run just ended
+  std::istream& stream;
+  Fonttbl fonttbl;
+  StateStack stack;
+  unsigned int codepage; // defined by \ansi, \mac, \pc, \pca, and \ansicpgN
+  int deff;              // default font defined by \deffN; -1 if not given
+  std::stringstream pcdata_a;     // characters of the current 8-bit text run
+  unsigned int pcdata_a_codepage; // codepage captured when the current run started
+  Pcdata_state pcdata_a_state;
+
+  // Starts with one LocalState for the outermost group. pcdata_a_codepage is
+  // initialized as well, so that it never holds an indeterminate value.
+  GlobalState(std::istream& s)
+    : stream(s), codepage(CP_ACP), deff(-1), pcdata_a_codepage(CP_ACP),
+      pcdata_a_state(pcdsno)
+  {
+    LocalState st = { false, -1, 1, 0xFFFFFFFF }; // fonttbl, f, uc, codepage
+    stack.push(st);
+  }
+
+  // Effective codepage for 8-bit text: the \cpg override of the group, else the
+  // codepage of the current (or default) font, else the document-level setting.
+  unsigned int GetCurrentCP()
+  {
+    if (stack.empty()) // all groups are closed - only the document-level setting is left
+      return codepage;
+    const LocalState& top = stack.top();
+    if (top.codepage != 0xFFFFFFFF) // \cpg in use
+      return top.codepage;
+    const int f = (top.f != -1) ? top.f : deff; // \cpg not used; use font settings
+    Fonttbl::iterator iter = (f != -1) ? fonttbl.find(f) : fonttbl.end();
+    const unsigned int cp = (iter != fonttbl.end()) ? iter->second.Codepage() : 0xFFFFFFFF;
+    return (cp != 0xFFFFFFFF) ? cp : codepage; // No overrides; use the top-level legacy setting
+  }
+};
+
+struct Keyword {
+  char name[33]; // Control word (at most 32 letters) or a single control symbol, NUL-terminated
+  bool hasVal;   // True if a numeric parameter followed the control word
+  int val;       // The numeric parameter; meaningful only if hasVal is true
+};
+
+// One lexical unit of an RTF stream: a group delimiter, a keyword, a single
+// 8-bit or UTF-16 character, a block of \bin data, end of input, or an error.
+// A lexem owns its binary data. Copying makes a deep copy, so the copy
+// operations accept const objects and temporaries and never modify the
+// source lexem.
+class Lexem {
+public:
+  enum Type {ltGroupBegin, ltGroupEnd, ltKeyword, ltPCDATA_A, ltPCDATA_W,
+             ltBDATA, ltEOF, ltError};
+  // The buffer fields are zeroed so that even Lexem(ltBDATA) is safe to destroy.
+  Lexem(Type t=ltError) : m_type(t) { m_bdata.sz = 0; m_bdata.data = 0; }
+  Lexem(const Lexem& from) : m_type(ltError) { CopyFrom(from); }
+  ~Lexem() { Clear(); }
+  Lexem& operator = (const Lexem& from)
+  {
+    if (&from != this)
+      CopyFrom(from);
+    return *this;
+  }
+  Type type() const { return m_type; }
+  // Each Set* method releases the current payload and stores the new one.
+  void SetPCDATA_A(char chdata)
+  {
+    Clear();
+    m_pcdata_a = chdata;
+    m_type = ltPCDATA_A;
+  }
+  void SetPCDATA_W(wchar_t chdata)
+  {
+    Clear();
+    m_pcdata_w = chdata;
+    m_type = ltPCDATA_W;
+  }
+  // Stores a private copy of sz bytes starting at data. The lexem becomes
+  // ltError if sz is negative, if data is null while sz is not 0, or if the
+  // allocation yields null.
+  void SetBDATA(const char* data, int sz)
+  {
+    // Copy before releasing: data may point into this lexem's own buffer.
+    const bool valid = (sz >= 0) && (data || (sz == 0));
+    char* tmp = valid ? new char[sz] : 0;
+    if (tmp && (sz > 0))
+      memcpy(tmp, data, sz);
+    Clear();
+    if (tmp) {
+      m_bdata.data = tmp;
+      m_bdata.sz = sz;
+      m_type = ltBDATA;
+    }
+  }
+  void SetKeyword(const Keyword& src)
+  {
+    Clear();
+    m_type = ltKeyword;
+    m_keyword = src;
+  }
+  void SetKeyword(const char* name, bool hasVal=false, int val=0)
+  {
+    // Go through a temporary: name may point at this lexem's own keyword name.
+    char tmp[SIZEOF(m_keyword.name)];
+    strncpy(tmp, name, SIZEOF(m_keyword.name)-1);
+    tmp[SIZEOF(m_keyword.name)-1]=0;
+    Clear();
+    m_type = ltKeyword;
+    memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name));
+    m_keyword.hasVal=hasVal;
+    m_keyword.val=val;
+  }
+  // The accessors below return 0 if the lexem is not of the matching type.
+  const char* KeywordName() const {
+    return (m_type == ltKeyword) ? m_keyword.name : 0; }
+  const int* KeywordVal() const {
+    return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0; }
+  char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; }
+  wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; }
+  const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; }
+  int bdata_sz() const { return (m_type == ltBDATA) ? static_cast<int>(m_bdata.sz) : 0; }
+  // Shared lexems without a payload
+  static Lexem eof;
+  static Lexem groupBegin;
+  static Lexem groupEnd;
+  static Lexem error;
+private:
+  struct BDATA {
+    size_t sz;
+    char* data;
+  };
+
+  Type m_type;
+  union { // m_type tells which member is in use
+    Keyword m_keyword;
+    char m_pcdata_a;
+    wchar_t m_pcdata_w;
+    BDATA m_bdata;
+  };
+  // Frees the owned binary data, if any, and leaves the lexem as ltError.
+  void Clear()
+  {
+    if (m_type == ltBDATA)
+      delete[] m_bdata.data;
+    m_type = ltError;
+  }
+  // Makes this lexem a deep copy of from; from must not be this lexem.
+  void CopyFrom(const Lexem& from)
+  {
+    switch (from.m_type) {
+    case ltKeyword:
+      SetKeyword(from.m_keyword);
+      break;
+    case ltPCDATA_A:
+      SetPCDATA_A(from.m_pcdata_a);
+      break;
+    case ltPCDATA_W:
+      SetPCDATA_W(from.m_pcdata_w);
+      break;
+    case ltBDATA:
+      SetBDATA(from.m_bdata.data, static_cast<int>(from.m_bdata.sz));
+      break;
+    default: // group delimiters, EOF and error carry no payload
+      Clear();
+      m_type = from.m_type;
+      break;
+    }
+  }
+};
+
+Lexem Lexem::eof(ltEOF);
+Lexem Lexem::groupBegin(ltGroupBegin);
+Lexem Lexem::groupEnd(ltGroupEnd);
+Lexem Lexem::error(ltError);
+
+// Reads the control word or control symbol that follows a backslash; the
+// backslash itself must already have been consumed from the stream.
+// A control word is a run of ASCII letters (only the first 32 are kept) with an
+// optional decimal parameter, possibly negative; one delimiting space after it
+// is consumed. A control symbol is a single character that is not a letter.
+// Returns a keyword with an empty name if no character could be read.
+Keyword GetKeyword(std::istream& stream)
+{
+  Keyword keyword = {"", false, 0};
+  char ch;
+  if (!stream.get(ch))
+    return keyword;
+  if (!IS_ALPHA(ch)) { // Control symbol
+    keyword.name[0] = ch; keyword.name[1] = 0;
+    return keyword;
+  }
+  // Control word; maybe delimiter and value
+  int i = 0;
+  do {
+    // Extra letters are skipped (allowing for some non-conformant implementation).
+    if (i < 32)
+      keyword.name[i++] = ch;
+  } while (stream.get(ch) && IS_ALPHA(ch));
+  keyword.name[i] = 0; // NULL-terminating
+  if (stream && (IS_DIGIT(ch) || (ch == '-'))) { // Value begin
+    keyword.hasVal = true;
+    const bool negative = (ch == '-');
+    if (negative) stream.get(ch);
+    const int maxVal = static_cast<int>(~0u >> 1); // the largest int
+    for (; stream && IS_DIGIT(ch); stream.get(ch)) {
+      // Any number of digits is accepted; the value saturates instead of overflowing.
+      const int digit = ch - '0';
+      keyword.val = (keyword.val > (maxVal - digit) / 10) ? maxVal : keyword.val*10 + digit;
+    }
+    if (negative) keyword.val = -keyword.val;
+  }
+  // End of control word; a space is just a delimiter - skip it, put anything else back
+  if (stream && (ch != ' '))
+    stream.unget();
+  return keyword;
+}
+
+// Reads the next raw lexem from the stream. Unescaped CR and LF characters are
+// skipped first; the character that follows decides the kind of the lexem:
+// - '{' or '}' -> group begin / group end
+// - backslash  -> keyword: the control word or symbol read by GetKeyword
+// - tab (0x09) -> the keyword \tab
+// - any other  -> one 8-bit PCDATA character
+// End of input yields the EOF lexem.
+// Keywords that stand for data (\u, \', \bin, escaped characters) are returned
+// as plain keywords; turning them into data is left to PreprocessLexem.
+Lexem GetLexem(std::istream& stream)
+{
+  Lexem lexem;
+  char current;
+
+  // Skip crlf: stop at the first other character or at the end of input
+  do {
+    stream.get(current);
+  } while (!stream.eof() &&
+           ((current == '\r') || (current == '\n')));
+
+  if (stream.eof())
+    lexem = Lexem::eof;
+  else if (current == '{') // Group begin
+    lexem = Lexem::groupBegin;
+  else if (current == '}') // Group end
+    lexem = Lexem::groupEnd;
+  else if (current == '\\') // Keyword
+    lexem.SetKeyword(GetKeyword(stream));
+  else if (current == '\t') // tab
+    lexem.SetKeyword("tab");
+  else // PCDATA
+    lexem.SetPCDATA_A(current);
+
+  return lexem;
+}
+
+// Replaces keywords that stand for data with the data itself, in place:
+// \uN -> UTF-16 unit N; its fallback text is skipped: at most uc characters,
+// a control word or symbol such as \'hh counting as one, stopping at a brace;
+// \'hh -> 8-bit character; \\ \{ \} -> that character; \binN -> the next N
+// bytes; backslash + CR or LF -> \par. Malformed input (empty name, no value or
+// negative size, input ending inside the data) becomes the error lexem.
+void PreprocessLexem(/*inout*/Lexem& lexem, std::istream& stream, int uc)
+{
+  if (lexem.type() != Lexem::ltKeyword)
+    return;
+  const char* name = lexem.KeywordName();
+  const int* val = lexem.KeywordVal();
+  bool valid = (name[0] != 0); // Empty keyword - maybe eof?
+  if (eq(name, "u") && (valid = (val != 0))) {
+    lexem.SetPCDATA_W(*val);
+    for (int n = 0; n < uc; ++n) {
+      const int next = stream.peek();
+      if ((next == std::istream::traits_type::eof()) || (next == '{') || (next == '}'))
+        break;
+      stream.get();
+      const Keyword esc = (next == '\\') ? GetKeyword(stream) : Keyword();
+      if ((esc.name[0] == '\'') && (esc.name[1] == 0))
+        stream.ignore(2); // the two hex digits of \'hh
+    }
+  }
+  else if (eq(name, "'")) {
+    char hi = 0, lo = 0; // both hex digits must be present
+    if ((valid = (stream.get(hi) && stream.get(lo))))
+      lexem.SetPCDATA_A(static_cast<char>((HexToInt(hi) << 4) + HexToInt(lo)));
+  }
+  else if (eq(name, "\\") || eq(name, "{") || eq(name, "}")) // escaped characters
+    lexem.SetPCDATA_A(name[0]);
+  else if (eq(name, "bin") && (valid = (val && (*val >= 0)))) {
+    char* data = new char[*val];
+    stream.read(data, *val);
+    if ((valid = !stream.fail()))
+      lexem.SetBDATA(data, *val);
+    delete[] data;
+  }
+  else if (eq(name, "\n") || eq(name, "\r")) // escaped cr or lf
+    lexem.SetKeyword("par");
+  if (!valid)
+    lexem = Lexem::error;
+}
+
+// Applies one lexem to the parser state:
+// - an 8-bit text run that was in progress is marked as finished (the mark is
+//   taken back below if this lexem continues the run); a run marked as finished
+//   by the previous call is reset to "no run";
+// - group begin pushes a copy of the current local state, group end pops it;
+// - keywords update code page, font and \uc settings; 8-bit text is buffered.
+void UpdateState(const Lexem& lexem, /*inout*/GlobalState& globalState)
+{
+  if (globalState.pcdata_a_state == GlobalState::pcdsfinished)
+    globalState.pcdata_a_state = GlobalState::pcdsno; // Last time we finished the pcdata
+  else if (globalState.pcdata_a_state == GlobalState::pcdsin)
+    globalState.pcdata_a_state = GlobalState::pcdsfinished; // to be reset later if still in the pcdata
+
+  const Lexem::Type type = lexem.type();
+  if (type == Lexem::ltGroupBegin)
+    globalState.stack.push(globalState.stack.top());
+  else if (type == Lexem::ltGroupEnd)
+    globalState.stack.pop();
+  else if (type == Lexem::ltPCDATA_A) {
+    if (globalState.pcdata_a_state == GlobalState::pcdsno) // Beginning of the pcdata
+      globalState.pcdata_a_codepage = globalState.GetCurrentCP(); // to use later to convert to utf16
+    globalState.pcdata_a_state = GlobalState::pcdsin;
+    globalState.pcdata_a << lexem.pcdata_a();
+  }
+  else if (type == Lexem::ltKeyword) {
+    const char* const name = lexem.KeywordName();
+    const int* const val = lexem.KeywordVal();
+    LocalState& local = globalState.stack.top();
+    const bool definingFont = local.fonttbl && (local.f != -1);
+    if (eq(name, "ansi"))
+      globalState.codepage = CP_ACP;
+    else if (eq(name, "mac"))
+      globalState.codepage = CP_MACCP;
+    else if (eq(name, "pc"))
+      globalState.codepage = 437;
+    else if (eq(name, "pca"))
+      globalState.codepage = 850;
+    else if (eq(name, "ansicpg") && val)
+      globalState.codepage = static_cast<unsigned int>(*val);
+    else if (eq(name, "deff") && val)
+      globalState.deff = *val;
+    else if (eq(name, "fonttbl"))
+      local.fonttbl = true;
+    else if (eq(name, "f") && val)
+      local.f = *val;
+    else if (eq(name, "fcharset") && definingFont && val) {
+      FontInfo& font = globalState.fonttbl[local.f];
+      font.options |= FontInfo::has_fcharset;
+      font.fcharset = *val;
+    }
+    else if (eq(name, "cpg") && val) {
+      if (definingFont) { // Defining a font
+        FontInfo& font = globalState.fonttbl[local.f];
+        font.options |= FontInfo::has_cpg;
+        font.cpg = *val;
+      }
+      else // Overriding the codepage for the block - may be in filenames
+        local.codepage = *val;
+    }
+    else if (eq(name, "plain"))
+      local.f = -1;
+    else if (eq(name, "uc") && val)
+      local.uc = *val;
+  }
+}
+
+// Parses the RTF document in rtf and reports it to decoder: group begin/end,
+// keywords, text (as UTF-16) and binary data. Consecutive 8-bit characters are
+// collected into a run and converted with the code page that was in effect
+// when the run started. Apart from the first BeginGroup, nothing is reported
+// unless the input starts with "{\rtf1". Parsing stops when the outermost
+// group is closed or the stream can deliver no more data.
+void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder)
+{
+  // Check if this is the rtf
+  Lexem lexem = GetLexem(rtf);
+  if (lexem.type() != Lexem::ltGroupBegin)
+    return;
+  decoder.BeginGroup();
+  lexem = GetLexem(rtf);
+  if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") ||
+      !lexem.KeywordVal() || (*lexem.KeywordVal() != 1))
+    return;
+  decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+
+  GlobalState state(rtf);
+  // good() instead of !eof(): a stream that failed before its end would be read
+  // forever. Don't go past the global group: its end leaves the stack empty.
+  while (state.stream.good() && !state.stack.empty()) {
+    lexem = GetLexem(state.stream);
+    PreprocessLexem(lexem, state.stream, state.stack.top().uc);
+    UpdateState(lexem, state);
+    if (state.pcdata_a_state == GlobalState::pcdsfinished) {
+      const std::string s = state.pcdata_a.str();
+      const int len = static_cast<int>(s.size());
+      unsigned int cp = state.pcdata_a_codepage;
+      int sz = ::MultiByteToWideChar(cp, 0, s.c_str(), len, 0, 0);
+      if (!sz) { // e.g. a code page unknown to the system - don't drop the text
+        cp = CP_ACP;
+        sz = ::MultiByteToWideChar(cp, 0, s.c_str(), len, 0, 0);
+      }
+      if (sz > 0) {
+        std::wstring data(sz, L'\0'); // releases the buffer even if PCDATA throws
+        ::MultiByteToWideChar(cp, 0, s.c_str(), len, &data[0], sz);
+        decoder.PCDATA(&data[0], sz);
+      }
+      state.pcdata_a.str(""); // reset
+    }
+    switch (lexem.type()) {
+    case Lexem::ltGroupBegin: decoder.BeginGroup(); break;
+    case Lexem::ltGroupEnd:   decoder.EndGroup(); break;
+    case Lexem::ltKeyword:    decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal()); break;
+    case Lexem::ltBDATA:      decoder.BDATA(lexem.bdata(), lexem.bdata_sz()); break;
+    case Lexem::ltPCDATA_W: {
+      wchar_t ch = lexem.pcdata_w();
+      decoder.PCDATA(&ch, 1);
+      break;
+    }
+    default: // 8-bit text is reported above; erroneous data is silently skipped
+      break;
+    }
+  } // while
+} // DecodeRTF