diff options
Diffstat (limited to 'mailnews/import/outlook/src/rtfDecoder.cpp')
-rw-r--r-- | mailnews/import/outlook/src/rtfDecoder.cpp | 520 |
1 files changed, 520 insertions, 0 deletions
diff --git a/mailnews/import/outlook/src/rtfDecoder.cpp b/mailnews/import/outlook/src/rtfDecoder.cpp new file mode 100644 index 000000000..837beec0b --- /dev/null +++ b/mailnews/import/outlook/src/rtfDecoder.cpp @@ -0,0 +1,520 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <stack> +#include <map> +#include <sstream> +#include "Windows.h" +#include "rtfDecoder.h" + +#define SIZEOF(x) (sizeof(x)/sizeof((x)[0])) +#define IS_DIGIT(i) ((i) >= '0' && (i) <= '9') +#define IS_ALPHA(VAL) (((VAL) >= 'a' && (VAL) <= 'z') || ((VAL) >= 'A' && (VAL) <= 'Z')) + +inline int HexToInt(char ch) +{ + switch (ch) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + return ch-'0'; + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + return ch-'A'+10; + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + return ch-'a'+10; + default: + return 0; + } +} + +inline int CharsetToCP(int charset) +{ + // We don't know the Code page for the commented out charsets. + switch (charset) { + case 0: return 1252; // ANSI + case 1: return 0; // Default +//case 2: return 42; // Symbol + case 2: return 1252; // Symbol + case 77: return 10000; // Mac Roman + case 78: return 10001; // Mac Shift Jis + case 79: return 10003; // Mac Hangul + case 80: return 10008; // Mac GB2312 + case 81: return 10002; // Mac Big5 +//case 82: Mac Johab (old) + case 83: return 10005; // Mac Hebrew + case 84: return 10004; // Mac Arabic + case 85: return 10006; // Mac Greek + case 86: return 10081; // Mac Turkish + case 87: return 10021; // Mac Thai + case 88: return 10029; // Mac East Europe + case 89: return 10007; // Mac Russian + case 128: return 932; // Shift JIS + case 129: return 949; // Hangul + case 130: return 1361; // Johab + case 134: return 936; // GB2312 + case 136: return 950; // Big5 + case 161: return 1253; // Greek + case 162: return 1254; // Turkish + case 163: return 1258; // Vietnamese + case 177: return 1255; // Hebrew + case 178: return 1256; // Arabic +//case 179: Arabic Traditional (old) +//case 180: Arabic user (old) +//case 181: Hebrew user (old) + case 186: return 1257; // Baltic + case 204: return 1251; // Russian + case 222: return 874; // Thai + case 238: return 1250; // Eastern European + case 254: return 437; // PC 437 + case 255: return 850; // OEM + default: return CP_ACP; + } +} + +struct FontInfo { + enum Options {has_fcharset = 0x0001, + has_cpg = 0x0002}; + unsigned int options; + int fcharset; + unsigned int cpg; + FontInfo() : options(0), fcharset(0), cpg(0xFFFFFFFF) {} + unsigned int Codepage() + { + if (options & has_cpg) + return cpg; + else if (options & has_fcharset) + return CharsetToCP(fcharset); + else return 0xFFFFFFFF; + } +}; +typedef std::map<int, FontInfo> Fonttbl; + +struct LocalState { + bool fonttbl; // When fonts are being defined + int f; // Index of the font being defined/used; defines the codepage if no \cpg + unsigned int uc; // ucN keyword value; its default is 1 + unsigned int codepage;// defined by \cpg +}; +typedef std::stack<LocalState> StateStack; + +struct GlobalState { + enum Pcdata_state { pcdsno, pcdsin, pcdsfinished }; + std::istream& stream; + Fonttbl fonttbl; + StateStack stack; + unsigned int codepage; // defined by \ansi, \mac, \pc, \pca, and \ansicpgN + int deff; + std::stringstream pcdata_a; + unsigned int pcdata_a_codepage; + Pcdata_state pcdata_a_state; + + GlobalState(std::istream& s) + : stream(s), codepage(CP_ACP), deff(-1), pcdata_a_state(pcdsno) + { + LocalState st; + st.fonttbl = false; + st.f = -1; + st.uc = 1; + st.codepage = 0xFFFFFFFF; + stack.push(st); + } + unsigned int GetCurrentCP() + { + if (stack.top().codepage != 0xFFFFFFFF) // \cpg in use + return stack.top().codepage; + // \cpg not used; use font settings + int f = (stack.top().f != -1) ? stack.top().f : deff; + if (f != -1) { + Fonttbl::iterator iter = fonttbl.find(f); + if (iter != fonttbl.end()) { + unsigned int cp = iter->second.Codepage(); + if (cp != 0xFFFFFFFF) + return cp; + } + } + return codepage; // No overrides; use the top-level legacy setting + } +}; + +struct Keyword { + char name[33]; + bool hasVal; + int val; +}; + +class Lexem { +public: + enum Type {ltGroupBegin, ltGroupEnd, ltKeyword, ltPCDATA_A, ltPCDATA_W, + ltBDATA, ltEOF, ltError}; + Lexem(Type t=ltError) : m_type(t) {} + Lexem(Lexem& from) // Move pointers when copying + { + switch (m_type = from.m_type) { + case ltKeyword: + m_keyword = from.m_keyword; + break; + case ltPCDATA_A: + m_pcdata_a = from.m_pcdata_a; + break; + case ltPCDATA_W: + m_pcdata_w = from.m_pcdata_w; + break; + case ltBDATA: + m_bdata = from.m_bdata; + from.m_type = ltError; + break; + } + } + ~Lexem() { Clear(); } + Lexem& operator = (Lexem& from) + { + if (&from != this) { + Clear(); + switch (m_type = from.m_type) { + case ltKeyword: + m_keyword = from.m_keyword; + break; + case ltPCDATA_A: + m_pcdata_a = from.m_pcdata_a; + break; + case ltPCDATA_W: + m_pcdata_w = from.m_pcdata_w; + break; + case ltBDATA: + m_bdata = from.m_bdata; + from.m_type = ltError; + break; + } + } + return *this; + } + Type type() const { return m_type; } + void SetPCDATA_A(char chdata) + { + Clear(); + m_pcdata_a = chdata; + m_type = ltPCDATA_A; + } + void SetPCDATA_W(wchar_t chdata) + { + Clear(); + m_pcdata_w = chdata; + m_type = ltPCDATA_W; + } + void SetBDATA(const char* data, int sz) + { + char* tmp = new char[sz]; // to allow getting the data from itself + if (tmp) { + memcpy(tmp, data, sz); + Clear(); + m_bdata.data = tmp; + m_bdata.sz = sz; + m_type = ltBDATA; + } + else m_type = ltError; + } + void SetKeyword(const Keyword& src) + { + Clear(); + m_type = ltKeyword; + m_keyword = src; + } + void SetKeyword(const char* name, bool hasVal=false, int val=0) + { + char tmp[SIZEOF(m_keyword.name)]; + strncpy(tmp, name, SIZEOF(m_keyword.name)-1); // to allow copy drom itself + tmp[SIZEOF(m_keyword.name)-1]=0; + Clear(); + m_type = ltKeyword; + memcpy(m_keyword.name, tmp, SIZEOF(m_keyword.name)); + m_keyword.hasVal=hasVal; + m_keyword.val=val; + } + const char* KeywordName() const { + return (m_type == ltKeyword) ? m_keyword.name : 0; } + const int* KeywordVal() const { + return ((m_type == ltKeyword) && m_keyword.hasVal) ? &m_keyword.val : 0; } + char pcdata_a() const { return (m_type == ltPCDATA_A) ? m_pcdata_a : 0; } + wchar_t pcdata_w() const { return (m_type == ltPCDATA_W) ? m_pcdata_w : 0; } + const char* bdata() const { return (m_type == ltBDATA) ? m_bdata.data : 0; } + int bdata_sz() const { return (m_type == ltBDATA) ? m_bdata.sz : 0; } + static Lexem eof; + static Lexem groupBegin; + static Lexem groupEnd; + static Lexem error; +private: + struct BDATA { + size_t sz; + char* data; + }; + + Type m_type; + union { + Keyword m_keyword; + char m_pcdata_a; + wchar_t m_pcdata_w; + BDATA m_bdata; + }; + // This function leaves the object in the broken state. Must be followed + // by a correct initialization. + void Clear() + { + switch (m_type) { + case ltBDATA: + delete[] m_bdata.data; + break; + } +// m_type = ltError; + } +}; + +Lexem Lexem::eof(ltEOF); +Lexem Lexem::groupBegin(ltGroupBegin); +Lexem Lexem::groupEnd(ltGroupEnd); +Lexem Lexem::error(ltError); + +// This function moves pos. When calling the function, pos must be next to the +// backslash; pos must be in the same sequence and before end! +Keyword GetKeyword(std::istream& stream) +{ + Keyword keyword = {"", false, 0}; + char ch; + if (stream.get(ch).eof()) + return keyword; + // Control word; maybe delimiter and value + if (IS_ALPHA(ch)) { + int i = 0; + do { + // We take up to 32 characters into account, skipping over extra + // characters (allowing for some non-conformant implementation). + if (i < 32) + keyword.name[i++] = ch; + } while (!stream.get(ch).eof() && IS_ALPHA(ch)); + keyword.name[i] = 0; // NULL-terminating + if (!stream.eof() && (IS_DIGIT(ch) || (ch == '-'))) { // Value begin + keyword.hasVal = true; + bool negative = (ch == '-'); + if (negative) stream.get(ch); + i = 0; + while (!stream.eof() && IS_DIGIT(ch)) { + // We take into account only 10 digits, skip other. Older specs stated + // that we must be ready for an arbitrary number of digits. + if (i++ < 10) + keyword.val = keyword.val*10 + (ch - '0'); + stream.get(ch); + } + if (negative) keyword.val = -keyword.val; + } + // End of control word; the space is just a delimiter - skip it + if (!stream.eof() && !(ch == ' ')) + stream.unget(); + } + else { // Control symbol + keyword.name[0] = ch, keyword.name[1] = 0; + } + return keyword; +} + +Lexem GetLexem(std::istream& stream) +{ + Lexem result; + // We always stay at the beginning of the next lexem or a crlf + // If it's a brace then it's group begin/end + // If it's a backslash -> Preprocess + // - if it's a \u or \' -> make UTF16 character + // - else it's a keyword -> Process (e.g., remember the codepage) + // - (if the keyword is \bin then the following is #BDATA) + // If it's some other character -> Preprocess + // - if it's 0x09 -> it's the keyword \tab + // - else it's a PCDATA + char ch; + while (!stream.get(ch).eof() && ((ch == '\n') || (ch == '\r'))); // Skip crlf + if (stream.eof()) + result = Lexem::eof; + else { + switch (ch) { + case '{': // Group begin + case '}': // Group end + result = (ch == '{') ? Lexem::groupBegin : Lexem::groupEnd; + break; + case '\\': // Keyword + result.SetKeyword(GetKeyword(stream)); + break; + case '\t': // tab + result.SetKeyword("tab"); + break; + default: // PSDATA? + result.SetPCDATA_A(ch); + break; + } + } + return result; +} + +void PreprocessLexem(/*inout*/Lexem& lexem, std::istream& stream, int uc) +{ + if (lexem.type() == Lexem::ltKeyword) { + if (lexem.KeywordName()[0] == 0) // Empty keyword - maybe eof? + lexem = Lexem::error; + else if (eq(lexem.KeywordName(), "u")) { + // Unicode character - get the UTF16 and skip the uc characters + if (const int* val = lexem.KeywordVal()) { + lexem.SetPCDATA_W(*val); + stream.ignore(uc); + } + else lexem = Lexem::error; + } + else if (eq(lexem.KeywordName(), "'")) { + // 8-bit character (\'hh) -> use current codepage + char ch, ch1; + if (!stream.get(ch).eof()) ch1 = HexToInt(ch); + if (!stream.get(ch).eof()) (ch1 <<= 4) += HexToInt(ch); + lexem.SetPCDATA_A(ch1); + } + else if (eq(lexem.KeywordName(), "\\") || eq(lexem.KeywordName(), "{") || + eq(lexem.KeywordName(), "}")) // escaped characters + lexem.SetPCDATA_A(lexem.KeywordName()[0]); + else if (eq(lexem.KeywordName(), "bin")) { + if (const int* i = lexem.KeywordVal()) { + char* data = new char[*i]; + if (data) { + stream.read(data, *i); + if (stream.fail()) + lexem = Lexem::error; + else + lexem.SetBDATA(data, *i); + delete[] data; + } + else lexem = Lexem::error; + } + else lexem = Lexem::error; + } + else if (eq(lexem.KeywordName(), "\n") || eq(lexem.KeywordName(), "\r")) { + // escaped cr or lf + lexem.SetKeyword("par"); + } + } +} + +void UpdateState(const Lexem& lexem, /*inout*/GlobalState& globalState) +{ + switch (globalState.pcdata_a_state) { + case GlobalState::pcdsfinished: // Last time we finished the pcdata + globalState.pcdata_a_state = GlobalState::pcdsno; + break; + case GlobalState::pcdsin: + // to be reset later if still in the pcdata + globalState.pcdata_a_state = GlobalState::pcdsfinished; + break; + } + + switch (lexem.type()) { + case Lexem::ltGroupBegin: + globalState.stack.push(globalState.stack.top()); + break; + case Lexem::ltGroupEnd: + globalState.stack.pop(); + break; + case Lexem::ltKeyword: + { + const int* val = lexem.KeywordVal(); + if (eq(lexem.KeywordName(), "ansi")) globalState.codepage = CP_ACP; + else if (eq(lexem.KeywordName(), "mac")) globalState.codepage = CP_MACCP; + else if (eq(lexem.KeywordName(), "pc")) globalState.codepage = 437; + else if (eq(lexem.KeywordName(), "pca")) globalState.codepage = 850; + else if (eq(lexem.KeywordName(), "ansicpg") && val) + globalState.codepage = static_cast<unsigned int>(*val); + else if (eq(lexem.KeywordName(), "deff") && val) + globalState.deff = *val; + else if (eq(lexem.KeywordName(), "fonttbl")) globalState.stack.top().fonttbl = true; + else if (eq(lexem.KeywordName(), "f") && val) { + globalState.stack.top().f = *val; + } + else if (eq(lexem.KeywordName(), "fcharset") && + globalState.stack.top().fonttbl && + (globalState.stack.top().f != -1) && val) { + FontInfo& f = globalState.fonttbl[globalState.stack.top().f]; + f.options |= FontInfo::has_fcharset; + f.fcharset = *val; + } + else if (eq(lexem.KeywordName(), "cpg") && val) { + if (globalState.stack.top().fonttbl && (globalState.stack.top().f != -1)) { // Defining a font + FontInfo& f = globalState.fonttbl[globalState.stack.top().f]; + f.options |= FontInfo::has_cpg; + f.cpg = *val; + } + else { // Overriding the codepage for the block - may be in filenames + globalState.stack.top().codepage = *val; + } + } + else if (eq(lexem.KeywordName(), "plain")) + globalState.stack.top().f = -1; + else if (eq(lexem.KeywordName(), "uc") && val) + globalState.stack.top().uc = *val; + } + break; + case Lexem::ltPCDATA_A: + if (globalState.pcdata_a_state == GlobalState::pcdsno) // Beginning of the pcdata + globalState.pcdata_a_codepage = globalState.GetCurrentCP(); // to use later to convert to utf16 + globalState.pcdata_a_state = GlobalState::pcdsin; + globalState.pcdata_a << lexem.pcdata_a(); + break; + } +} + +void DecodeRTF(std::istream& rtf, CRTFDecoder& decoder) +{ + // Check if this is the rtf + Lexem lexem = GetLexem(rtf); + if (lexem.type() != Lexem::ltGroupBegin) + return; + decoder.BeginGroup(); + lexem = GetLexem(rtf); + if ((lexem.type() != Lexem::ltKeyword) || !eq(lexem.KeywordName(), "rtf") || + !lexem.KeywordVal() || (*lexem.KeywordVal() != 1)) + return; + decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal()); + + GlobalState state(rtf); + // Level is the count of elements in the stack + + while (!state.stream.eof() && (state.stack.size()>0)) { // Don't go past the global group + lexem = GetLexem(state.stream); + PreprocessLexem(lexem, state.stream, state.stack.top().uc); + UpdateState(lexem, state); + + if (state.pcdata_a_state == GlobalState::pcdsfinished) { + std::string s = state.pcdata_a.str(); + int sz = ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), 0, 0); + if (sz) { + wchar_t* data = new wchar_t[sz]; + ::MultiByteToWideChar(state.pcdata_a_codepage, 0, s.c_str(), s.size(), data, sz); + decoder.PCDATA(data, sz); + delete[] data; + } + state.pcdata_a.str(""); // reset + } + + switch (lexem.type()) { + case Lexem::ltGroupBegin: + decoder.BeginGroup(); + break; + case Lexem::ltGroupEnd: + decoder.EndGroup(); + break; + case Lexem::ltKeyword: + decoder.Keyword(lexem.KeywordName(), lexem.KeywordVal()); + break; + case Lexem::ltPCDATA_W: + { + wchar_t ch = lexem.pcdata_w(); + decoder.PCDATA(&ch, 1); + } + break; + case Lexem::ltBDATA: + decoder.BDATA(lexem.bdata(), lexem.bdata_sz()); + break; + case Lexem::ltError: + break; // Just silently skip the erroneous data - basic error recovery + } + } // while +} // DecodeRTF |