diff --git a/src/support/docstream.cpp b/src/support/docstream.cpp
index ed2ddfe88b..60787a654a 100644
--- a/src/support/docstream.cpp
+++ b/src/support/docstream.cpp
@@ -242,33 +242,7 @@ protected:
 	}
 	virtual int do_max_length() const throw()
 	{
-		// FIXME: this information should be transferred to lib/encodings
-		// UTF8 uses at most 4 bytes to represent one UCS4 code point
-		// (see RFC 3629). RFC 2279 specifies 6 bytes, but that
-		// information is outdated, and RFC 2279 has been superseded by
-		// RFC 3629.
-		// The CJK encodings use (different) multibyte representation as well.
-		// All other encodings encode one UCS4 code point in one byte
-		// (and can therefore only encode a subset of UCS4)
-		// Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
-		// Furthermore, all encodings that use shifting (like SJIS) do not work with
-		// iconv_codecvt_facet.
-		if (encoding_ == "UTF-8" ||
-		    encoding_ == "GB" ||
-		    encoding_ == "EUC-TW")
-			return 4;
-		else if (encoding_ == "EUC-JP")
-			return 3;
-		else if (encoding_ == "ISO-2022-JP")
-			return 8;
-		else if (encoding_ == "BIG5" ||
-		         encoding_ == "EUC-KR" ||
-		         encoding_ == "EUC-CN" ||
-		         encoding_ == "SJIS" ||
-		         encoding_ == "GBK")
-			return 2;
-		else
-			return 1;
+		return lyx::max_encoded_bytes(encoding_);
 	}
 private:
 	/// Do the actual conversion. The interface is equivalent to that of
diff --git a/src/support/unicode.cpp b/src/support/unicode.cpp
index 39090769d2..7f601a86c5 100644
--- a/src/support/unicode.cpp
+++ b/src/support/unicode.cpp
@@ -197,6 +197,18 @@ int IconvProcessor::convert(char const * buf, size_t buflen,
 }
 
 
+std::string IconvProcessor::from() const
+{
+	return pimpl_->fromcode_;
+}
+
+
+std::string IconvProcessor::to() const
+{
+	return pimpl_->tocode_;
+}
+
+
 namespace {
 
 
@@ -210,17 +222,21 @@ iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
 	char const * inbuf = reinterpret_cast<char const *>(buf);
 	size_t inbytesleft = buflen * sizeof(InType);
 
-	size_t const outsize = 32768;
-	static char out[outsize];
-	char * outbuf = out;
+	static std::vector<char> outbuf(32768);
+	// The number of UCS4 code points in buf is at most inbytesleft.
+	// The output encoding will use at most
+	// max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
+	size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
+	if (outbuf.size() < maxoutbufsize)
+		outbuf.resize(maxoutbufsize);
 
-	int bytes = processor.convert(inbuf, inbytesleft, outbuf, outsize);
+	int bytes = processor.convert(inbuf, inbytesleft, outbuf.data(), outbuf.size());
 	if (bytes <= 0)
 		// Conversion failed
 		// FIXME Maybe throw an exception and handle that in the caller?
 		return vector<RetType>();
 
-	RetType const * tmp = reinterpret_cast<RetType const *>(out);
+	RetType const * tmp = reinterpret_cast<RetType const *>(outbuf.data());
 	return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
 }
 
@@ -345,4 +361,35 @@ void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
 		out.clear();
 }
 
+int max_encoded_bytes(std::string const & encoding)
+{
+	// FIXME: this information should be transferred to lib/encodings
+	// UTF8 uses at most 4 bytes to represent one UCS4 code point
+	// (see RFC 3629). RFC 2279 specifies 6 bytes, but that
+	// information is outdated, and RFC 2279 has been superseded by
+	// RFC 3629.
+	// The CJK encodings use (different) multibyte representation as well.
+	// All other encodings encode one UCS4 code point in one byte
+	// (and can therefore only encode a subset of UCS4)
+	// Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
+	// Furthermore, all encodings that use shifting (like SJIS) do not work with
+	// iconv_codecvt_facet.
+	if (encoding == "UTF-8" ||
+	    encoding == "GB" ||
+	    encoding == "EUC-TW")
+		return 4;
+	else if (encoding == "EUC-JP")
+		return 3;
+	else if (encoding == "ISO-2022-JP")
+		return 8;
+	else if (encoding == "BIG5" ||
+	         encoding == "EUC-KR" ||
+	         encoding == "EUC-CN" ||
+	         encoding == "SJIS" ||
+	         encoding == "GBK")
+		return 2;
+	else
+		return 1;
+}
+
 } // namespace lyx
diff --git a/src/support/unicode.h b/src/support/unicode.h
index 9a7deb96ec..afe0bc5858 100644
--- a/src/support/unicode.h
+++ b/src/support/unicode.h
@@ -36,6 +36,11 @@ public:
 	int convert(char const * in_buffer, size_t in_size,
 			char * out_buffer, size_t max_out_size);
 
+	/// source encoding
+	std::string from() const;
+	/// target encoding
+	std::string to() const;
+
 private:
 	/// open iconv.
 	/// \return true if the processor is ready to use.
@@ -88,6 +93,8 @@ void ucs4_to_multibytes(char_type ucs4, std::vector<char> & out,
 
 extern char const * ucs4_codeset;
 
+/// How many bytes does one UCS4 code point use at most in encoding \p encoding?
+int max_encoded_bytes(std::string const & encoding);
 
 } // namespace lyx
 