summaryrefslogtreecommitdiffstats
path: root/mailnews/import/outlook/src/rtfDecoder.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'mailnews/import/outlook/src/rtfDecoder.cpp')
-rw-r--r--mailnews/import/outlook/src/rtfDecoder.cpp520
1 files changed, 520 insertions, 0 deletions
diff --git a/mailnews/import/outlook/src/rtfDecoder.cpp b/mailnews/import/outlook/src/rtfDecoder.cpp
new file mode 100644
index 000000000..837beec0b
--- /dev/null
+++ b/mailnews/import/outlook/src/rtfDecoder.cpp
@@ -0,0 +1,520 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include <stack>
+#include <map>
+#include <sstream>
+#include "Windows.h"
+#include "rtfDecoder.h"
+
+#define SIZEOF(x) (sizeof(x)/sizeof((x)[0]))
+#define IS_DIGIT(i) ((i) >= '0' && (i) <= '9')
+#define IS_ALPHA(VAL) (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z'))
+
+inline int HexToInt(char ch)
+{
+ switch (ch) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ return ch-'0';
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+ return ch-'A'+10;
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+ return ch-'a'+10;
+ default:
+ return 0;
+ }
+}
+
+inline int CharsetToCP(int charset)
+{
+ // We don't know the Code page for the commented out charsets.
+ switch (charset) {
+ case 0: return 1252; // ANSI
+ case 1: return 0; // Default
+//case 2: return 42; // Symbol
+ case 2: return 1252; // Symbol
+ case 77: return 10000; // Mac Roman
+ case 78: return 10001; // Mac Shift Jis
+ case 79: return 10003; // Mac Hangul
+ case 80: return 10008; // Mac GB2312
+ case 81: return 10002; // Mac Big5
+//case 82: Mac Johab (old)
+ case 83: return 10005; // Mac Hebrew
+ case 84: return 10004; // Mac Arabic
+ case 85: return 10006; // Mac Greek
+ case 86: return 10081; // Mac Turkish
+ case 87: return 10021; // Mac Thai
+ case 88: return 10029; // Mac East Europe
+ case 89: return 10007; // Mac Russian
+ case 128: return 932; // Shift JIS
+ case 129: return 949; // Hangul
+ case 130: return 1361; // Johab
+ case 134: return 936; // GB2312
+ case 136: return 950; // Big5
+ case 161: return 1253; // Greek
+ case 162: return 1254; // Turkish
+ case 163: return 1258; // Vietnamese
+ case 177: return 1255; // Hebrew
+ case 178: return 1256; // Arabic
+//case 179: Arabic Traditional (old)
+//case 180: Arabic user (old)
+//case 181: Hebrew user (old)
+ case 186: return 1257; // Baltic
+ case 204: return 1251; // Russian
+ case 222: return 874; // Thai
+ case 238: return 1250; // Eastern European
+ case 254: return 437; // PC 437
+ case 255: return 850; // OEM
+ default: return CP_ACP;
+ }
+}
+
+struct FontInfo {
+ enum Options {has_fcharset = 0x0001,
+ has_cpg = 0x0002};
+ unsigned int options;
+ int fcharset;
+ unsigned int cpg;
+ FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {}
+ unsigned int Codepage()
+ {
+ if (options & has_cpg)
+ return cpg;
+ else if (options & has_fcharset)
+ return CharsetToCP(fcharset);
+ else return 0xFFFFFFFF;
+ }
+};
+typedef std::map<int, FontInfo> Fonttbl;
+
+struct LocalState {
+ bool fonttbl; // When fonts are being defined
+ int f; // Index of the font being defined/used; defines the codepage if no \cpg
+ unsigned int uc; // ucN keyword value; its default is 1
+ unsigned int codepage;// defined by \cpg
+};
+typedef std::stack<LocalState> StateStack;
+
+struct GlobalState {
+ enum Pcdata_state { pcdsno, pcdsin, pcdsfinished };
+ std::istream& stream;
+ Fonttbl fonttbl;
+ StateStack stack;
+ unsigned int codepage; // defined by \ansi, \mac, \pc, \pca, and \ansicpgN
+ int deff;
+ std::stringstream pcdata_a;
+ unsigned int pcdata_a_codepage;
+ Pcdata_state pcdata_a_state;
+
+ GlobalState(std::istream& s)
+ : stream(s), codepage(CP_ACP), deff(-1), pcdata_a_state(pcdsno)
+ {
+ LocalState st;
+ st.fonttbl = false;
+ st.f = -1;
+ st.uc = 1;
+ st.codepage = 0xFFFFFFFF;
+ stack.push(st);
+ }
+ unsigned int GetCurrentCP()
+ {
+ if (stack.top().codepage != 0xFFFFFFFF) // \cpg in use
+ return stack.top().codepage;
+ // \cpg not used; use font settings
+ int f = (stack.top().f != -1) ? stack.top().f : deff;
+ if (f != -1) {
+ Fonttbl::iterator iter = fonttbl.find(f);
+ if (iter != fonttbl.end()) {
+ unsigned int cp = iter->second.Codepage();
+ if (cp != 0xFFFFFFFF)
+ return cp;
+ }
+ }
+ return codepage; // No overrides; use the top-level legacy setting
+ }
+};
+
+struct Keyword {
+ char name[33];
+ bool hasVal;
+ int val;
+};
+
+class Lexem {
+public:
+ enum Type {ltGroupBegin, ltGroupEnd, ltKeyword, ltPCDATA_A, ltPCDATA_W,
+ ltBDATA, ltEOF, ltError};
+ Lexem(Type t=ltError) : m_type(t) {}
+ Lexem(Lexem& from) // Move pointers when copying
+ {
+ switch (m_type = from.m_type) {
+ case ltKeyword:
+ m_keyword = from.m_keyword;
+ break;
+ case ltPCDATA_A:
+ m_pcdata_a = from.m_pcdata_a;
+ break;
+ case ltPCDATA_W:
+ m_pcdata_w = from.m_pcdata_w;
+ break;
+ case ltBDATA:
+ m_bdata = from.m_bdata;
+ from.m_type = ltError;
+ break;
+ }
+ }
+ ~Lexem() { Clear(); }
+ Lexem& operator = (Lexem& from)
+ {
+ if (&from != this) {
+ Clear();
+ switch (m_type = from.m_type) {
+ case ltKeyword:
+ m_keyword = from.m_keyword;
+ break;
+ case ltPCDATA_A:
+ m_pcdata_a = from.m_pcdata_a;
+ break;
+ case ltPCDATA_W:
+ m_pcdata_w = from.m_pcdata_w;
+ break;
+ case ltBDATA:
+ m_bdata = from.m_bdata;
+ from.m_type = ltError;
+ break;
+ }
+ }
+ return *this;
+ }
+ Type type() const { return m_type; }
+ void SetPCDATA_A(char chdata)
+ {
+ Clear();
+ m_pcdata_a = chdata;
+ m_type = ltPCDATA_A;
+ }
+ void SetPCDATA_W(wchar_t chdata)
+ {
+ Clear();
+ m_pcdata_w = chdata;
+ m_type = ltPCDATA_W;
+ }
+ void SetBDATA(const char* data, int sz)
+ {
+ char* tmp = new char[sz]; // to allow getting the data from itself
+ if (tmp) {
+ memcpy(tmp, data, sz);
+ Clear();
+ m_bdata.data = tmp;
+ m_bdata.sz = sz;
+ m_type = ltBDATA;
+ }
+ else m_type = ltError;
+ }
+ void SetKeyword(const Keyword& src)
+ {
+ Clear();
+ m_type = ltKeyword;
+ m_keyword = src;
+ }
+ void SetKeyword(const char* name, bool hasVal=false, int val=0)
+ {
+ char tmp[SIZEOF(m_keyword.name)];
+ strncpy(tmp, name, SIZEOF(m_keyword.name)-1); // to allow copy drom itself
+ tmp[SIZEOF(m_keyword.name)-1]=0;
+ Clear();
+ m_type = ltKeyword;
+ memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name));
+ m_keyword.hasVal=hasVal;
+ m_keyword.val=val;
+ }
+ const char* KeywordName() const {
+ return (m_type == ltKeyword) ? m_keyword.name : 0; }
+ const int* KeywordVal() const {
+ return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0; }
+ char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; }
+ wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; }
+ const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; }
+ int bdata_sz() const { return (m_type == ltBDATA) ? m_bdata.sz : 0; }
+ static Lexem eof;
+ static Lexem groupBegin;
+ static Lexem groupEnd;
+ static Lexem error;
+private:
+ struct BDATA {
+ size_t sz;
+ char* data;
+ };
+
+ Type m_type;
+ union {
+ Keyword m_keyword;
+ char m_pcdata_a;
+ wchar_t m_pcdata_w;
+ BDATA m_bdata;
+ };
+ // This function leaves the object in the broken state. Must be followed
+ // by a correct initialization.
+ void Clear()
+ {
+ switch (m_type) {
+ case ltBDATA:
+ delete[] m_bdata.data;
+ break;
+ }
+// m_type = ltError;
+ }
+};
+
+Lexem Lexem::eof(ltEOF);
+Lexem Lexem::groupBegin(ltGroupBegin);
+Lexem Lexem::groupEnd(ltGroupEnd);
+Lexem Lexem::error(ltError);
+
+// This function moves pos. When calling the function, pos must be next to the
+// backslash; pos must be in the same sequence and before end!
+Keyword GetKeyword(std::istream& stream)
+{
+ Keyword keyword = {"", false, 0};
+ char ch;
+ if (stream.get(ch).eof())
+ return keyword;
+ // Control word; maybe delimiter and value
+ if (IS_ALPHA(ch)) {
+ int i = 0;
+ do {
+ // We take up to 32 characters into account, skipping over extra
+ // characters (allowing for some non-conformant implementation).
+ if (i < 32)
+ keyword.name[i++] = ch;
+ } while (!stream.get(ch).eof() && IS_ALPHA(ch));
+ keyword.name[i] = 0; // NULL-terminating
+ if (!stream.eof() && (IS_DIGIT(ch) || (ch == '-'))) { // Value begin
+ keyword.hasVal = true;
+ bool negative = (ch == '-');
+ if (negative) stream.get(ch);
+ i = 0;
+ while (!stream.eof() && IS_DIGIT(ch)) {
+ // We take into account only 10 digits, skip other. Older specs stated
+ // that we must be ready for an arbitrary number of digits.
+ if (i++ < 10)
+ keyword.val = keyword.val*10 + (ch - '0');
+ stream.get(ch);
+ }
+ if (negative) keyword.val = -keyword.val;
+ }
+ // End of control word; the space is just a delimiter - skip it
+ if (!stream.eof() && !(ch == ' '))
+ stream.unget();
+ }
+ else { // Control symbol
+ keyword.name[0] = ch, keyword.name[1] = 0;
+ }
+ return keyword;
+}
+
+Lexem GetLexem(std::istream& stream)
+{
+ Lexem result;
+ // We always stay at the beginning of the next lexem or a crlf
+ // If it's a brace then it's group begin/end
+ // If it's a backslash -> Preprocess
+ // - if it's a \u or \' -> make UTF16 character
+ // - else it's a keyword -> Process (e.g., remember the codepage)
+ // - (if the keyword is \bin then the following is #BDATA)
+ // If it's some other character -> Preprocess
+ // - if it's 0x09 -> it's the keyword \tab
+ // - else it's a PCDATA
+ char ch;
+ while (!stream.get(ch).eof() && ((ch == '\n') || (ch == '\r'))); // Skip crlf
+ if (stream.eof())
+ result = Lexem::eof;
+ else {
+ switch (ch) {
+ case '{': // Group begin
+ case '}': // Group end
+ result = (ch == '{') ? Lexem::groupBegin : Lexem::groupEnd;
+ break;
+ case '\\': // Keyword
+ result.SetKeyword(GetKeyword(stream));
+ break;
+ case '\t': // tab
+ result.SetKeyword("tab");
+ break;
+ default: // PSDATA?
+ result.SetPCDATA_A(ch);
+ break;
+ }
+ }
+ return result;
+}
+
+void PreprocessLexem(/*inout*/Lexem& lexem, std::istream& stream, int uc)
+{
+ if (lexem.type() == Lexem::ltKeyword) {
+ if (lexem.KeywordName()[0] == 0) // Empty keyword - maybe eof?
+ lexem = Lexem::error;
+ else if (eq(lexem.KeywordName(), "u")) {
+ // Unicode character - get the UTF16 and skip the uc characters
+ if (const int* val = lexem.KeywordVal()) {
+ lexem.SetPCDATA_W(*val);
+ stream.ignore(uc);
+ }
+ else lexem = Lexem::error;
+ }
+ else if (eq(lexem.KeywordName(), "'")) {
+ // 8-bit character (\'hh) -> use current codepage
+ char ch, ch1;
+ if (!stream.get(ch).eof()) ch1 = HexToInt(ch);
+ if (!stream.get(ch).eof()) (ch1 <<= 4) += HexToInt(ch);
+ lexem.SetPCDATA_A(ch1);
+ }
+ else if (eq(lexem.KeywordName(), "\\") || eq(lexem.KeywordName(), "{") ||
+ eq(lexem.KeywordName(), "}")) // escaped characters
+ lexem.SetPCDATA_A(lexem.KeywordName()[0]);
+ else if (eq(lexem.KeywordName(), "bin")) {
+ if (const int* i = lexem.KeywordVal()) {
+ char* data = new char[*i];
+ if (data) {
+ stream.read(data, *i);
+ if (stream.fail())
+ lexem = Lexem::error;
+ else
+ lexem.SetBDATA(data, *i);
+ delete[] data;
+ }
+ else lexem = Lexem::error;
+ }
+ else lexem = Lexem::error;
+ }
+ else if (eq(lexem.KeywordName(), "\n") || eq(lexem.KeywordName(), "\r")) {
+ // escaped cr or lf
+ lexem.SetKeyword("par");
+ }
+ }
+}
+
+void UpdateState(const Lexem& lexem, /*inout*/GlobalState& globalState)
+{
+ switch (globalState.pcdata_a_state) {
+ case GlobalState::pcdsfinished: // Last time we finished the pcdata
+ globalState.pcdata_a_state = GlobalState::pcdsno;
+ break;
+ case GlobalState::pcdsin:
+ // to be reset later if still in the pcdata
+ globalState.pcdata_a_state = GlobalState::pcdsfinished;
+ break;
+ }
+
+ switch (lexem.type()) {
+ case Lexem::ltGroupBegin:
+ globalState.stack.push(globalState.stack.top());
+ break;
+ case Lexem::ltGroupEnd:
+ globalState.stack.pop();
+ break;
+ case Lexem::ltKeyword:
+ {
+ const int* val = lexem.KeywordVal();
+ if (eq(lexem.KeywordName(), "ansi")) globalState.codepage = CP_ACP;
+ else if (eq(lexem.KeywordName(), "mac")) globalState.codepage = CP_MACCP;
+ else if (eq(lexem.KeywordName(), "pc")) globalState.codepage = 437;
+ else if (eq(lexem.KeywordName(), "pca")) globalState.codepage = 850;
+ else if (eq(lexem.KeywordName(), "ansicpg") && val)
+ globalState.codepage = static_cast<unsigned int>(*val);
+ else if (eq(lexem.KeywordName(), "deff") && val)
+ globalState.deff = *val;
+ else if (eq(lexem.KeywordName(), "fonttbl")) globalState.stack.top().fonttbl = true;
+ else if (eq(lexem.KeywordName(), "f") && val) {
+ globalState.stack.top().f = *val;
+ }
+ else if (eq(lexem.KeywordName(), "fcharset") &&
+ globalState.stack.top().fonttbl &&
+ (globalState.stack.top().f != -1) && val) {
+ FontInfo& f = globalState.fonttbl[globalState.stack.top().f];
+ f.options |= FontInfo::has_fcharset;
+ f.fcharset = *val;
+ }
+ else if (eq(lexem.KeywordName(), "cpg") && val) {
+ if (globalState.stack.top().fonttbl && (globalState.stack.top().f != -1)) { // Defining a font
+ FontInfo& f = globalState.fonttbl[globalState.stack.top().f];
+ f.options |= FontInfo::has_cpg;
+ f.cpg = *val;
+ }
+ else { // Overriding the codepage for the block - may be in filenames
+ globalState.stack.top().codepage = *val;
+ }
+ }
+ else if (eq(lexem.KeywordName(), "plain"))
+ globalState.stack.top().f = -1;
+ else if (eq(lexem.KeywordName(), "uc") && val)
+ globalState.stack.top().uc = *val;
+ }
+ break;
+ case Lexem::ltPCDATA_A:
+ if (globalState.pcdata_a_state == GlobalState::pcdsno) // Beginning of the pcdata
+ globalState.pcdata_a_codepage = globalState.GetCurrentCP(); // to use later to convert to utf16
+ globalState.pcdata_a_state = GlobalState::pcdsin;
+ globalState.pcdata_a << lexem.pcdata_a();
+ break;
+ }
+}
+
+void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder)
+{
+ // Check if this is the rtf
+ Lexem lexem = GetLexem(rtf);
+ if (lexem.type() != Lexem::ltGroupBegin)
+ return;
+ decoder.BeginGroup();
+ lexem = GetLexem(rtf);
+ if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") ||
+ !lexem.KeywordVal() || (*lexem.KeywordVal() != 1))
+ return;
+ decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+
+ GlobalState state(rtf);
+ // Level is the count of elements in the stack
+
+ while (!state.stream.eof() && (state.stack.size()>0)) { // Don't go past the global group
+ lexem = GetLexem(state.stream);
+ PreprocessLexem(lexem, state.stream, state.stack.top().uc);
+ UpdateState(lexem, state);
+
+ if (state.pcdata_a_state == GlobalState::pcdsfinished) {
+ std::string s = state.pcdata_a.str();
+ int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), 0, 0);
+ if (sz) {
+ wchar_t* data = new wchar_t[sz];
+ ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), data, sz);
+ decoder.PCDATA(data, sz);
+ delete[] data;
+ }
+ state.pcdata_a.str(""); // reset
+ }
+
+ switch (lexem.type()) {
+ case Lexem::ltGroupBegin:
+ decoder.BeginGroup();
+ break;
+ case Lexem::ltGroupEnd:
+ decoder.EndGroup();
+ break;
+ case Lexem::ltKeyword:
+ decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal());
+ break;
+ case Lexem::ltPCDATA_W:
+ {
+ wchar_t ch = lexem.pcdata_w();
+ decoder.PCDATA(&ch, 1);
+ }
+ break;
+ case Lexem::ltBDATA:
+ decoder.BDATA(lexem.bdata(), lexem.bdata_sz());
+ break;
+ case Lexem::ltError:
+ break; // Just silently skip the erroneous data - basic error recovery
+ }
+ } // while
+} // DecodeRTF