/** * \file unicode.cpp * This file is part of LyX, the document processor. * Licence details can be found in the file COPYING. * * \author Lars Gullik Bjønnes * * Full author contact details are available in file CREDITS. * * A collection of unicode conversion functions, using iconv. */ #include #include "support/unicode.h" #include "support/debug.h" #include #include #include #include #include #include #include using namespace std; namespace { #ifdef WORDS_BIGENDIAN char const * utf16_codeset = "UTF16-BE"; #else char const * utf16_codeset = "UTF16-LE"; #endif } namespace lyx { #ifdef WORDS_BIGENDIAN char const * ucs4_codeset = "UCS-4BE"; #else char const * ucs4_codeset = "UCS-4LE"; #endif static const iconv_t invalid_cd = (iconv_t)(-1); struct IconvProcessor::Impl { Impl(string const & to, string const & from) : cd(invalid_cd), tocode_(to), fromcode_(from) {} ~Impl() { if (cd != invalid_cd && iconv_close(cd) == -1) LYXERR0("Error returned from iconv_close(" << errno << ")"); } iconv_t cd; string tocode_; string fromcode_; }; IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode) : pimpl_(new IconvProcessor::Impl(tocode, fromcode)) { } IconvProcessor::IconvProcessor(IconvProcessor const & other) : pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_)) { } IconvProcessor::~IconvProcessor() { delete pimpl_; } void IconvProcessor::operator=(IconvProcessor const & other) { if (&other != this) pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_); } bool IconvProcessor::init() { if (pimpl_->cd != invalid_cd) return true; pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str()); if (pimpl_->cd != invalid_cd) return true; lyxerr << "Error returned from iconv_open" << endl; switch (errno) { case EINVAL: lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_ << " to " << pimpl_->tocode_ << " is not supported by the implementation." << endl; break; default: lyxerr << "\tSome other error: " << errno << endl; break; } return false; } int IconvProcessor::convert(char const * buf, size_t buflen, char * outbuf, size_t maxoutsize) { if (buflen == 0) return 0; if (pimpl_->cd == invalid_cd) { if (!init()) return -1; } char ICONV_CONST * inbuf = const_cast(buf); size_t inbytesleft = buflen; size_t outbytesleft = maxoutsize; int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); // flush out remaining data. This is needed because iconv sometimes // holds back chars in the stream, waiting for a combination character // (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124) iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft); //lyxerr << dec; //lyxerr << "Inbytesleft: " << inbytesleft << endl; //lyxerr << "Outbytesleft: " << outbytesleft << endl; if (res != -1) // Everything went well. return maxoutsize - outbytesleft; // There are some errors in the conversion lyxerr << "Error returned from iconv" << endl; switch (errno) { case E2BIG: lyxerr << "E2BIG There is not sufficient room at *outbuf." << endl; break; case EILSEQ: lyxerr << "EILSEQ An invalid multibyte sequence" << " has been encountered in the input.\n" << "When converting from " << pimpl_->fromcode_ << " to " << pimpl_->tocode_ << ".\n"; lyxerr << "Input:" << hex; for (size_t i = 0; i < buflen; ++i) { // char may be signed, avoid output of // something like 0xffffffc2 boost::uint32_t const b = *reinterpret_cast(buf + i); lyxerr << " 0x" << (unsigned int)b; } lyxerr << dec << endl; break; case EINVAL: lyxerr << "EINVAL An incomplete multibyte sequence" << " has been encountered in the input.\n" << "When converting from " << pimpl_->fromcode_ << " to " << pimpl_->tocode_ << ".\n"; lyxerr << "Input:" << hex; for (size_t i = 0; i < buflen; ++i) { // char may be signed, avoid output of // something like 0xffffffc2 boost::uint32_t const b = *reinterpret_cast(buf + i); lyxerr << " 0x" << (unsigned int)b; } lyxerr << dec << endl; break; default: lyxerr << "\tSome other error: " << errno << endl; break; } // We got an error so we close down the conversion engine if (iconv_close(pimpl_->cd) == -1) { lyxerr << "Error returned from iconv_close(" << errno << ")" << endl; } pimpl_->cd = invalid_cd; return -1; } namespace { template vector iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen) { if (buflen == 0) return vector(); char const * inbuf = reinterpret_cast(buf); size_t inbytesleft = buflen * sizeof(InType); size_t const outsize = 32768; static char out[outsize]; char * outbuf = out; int bytes = processor.convert(inbuf, inbytesleft, outbuf, outsize); if (bytes <= 0) // Conversion failed // FIXME Maybe throw an exception and handle that in the caller? return vector(); RetType const * tmp = reinterpret_cast(out); return vector(tmp, tmp + bytes / sizeof(RetType)); } } // anon namespace vector utf8_to_ucs4(vector const & utf8str) { if (utf8str.empty()) return vector(); return utf8_to_ucs4(&utf8str[0], utf8str.size()); } vector utf8_to_ucs4(char const * utf8str, size_t ls) { static IconvProcessor processor(ucs4_codeset, "UTF-8"); return iconv_convert(processor, utf8str, ls); } vector utf16_to_ucs4(unsigned short const * s, size_t ls) { static IconvProcessor processor(ucs4_codeset, utf16_codeset); return iconv_convert(processor, s, ls); } vector ucs4_to_utf16(char_type const * s, size_t ls) { static IconvProcessor processor(utf16_codeset, ucs4_codeset); return iconv_convert(processor, s, ls); } vector ucs4_to_utf8(char_type c) { static IconvProcessor processor("UTF-8", ucs4_codeset); return iconv_convert(processor, &c, 1); } vector ucs4_to_utf8(vector const & ucs4str) { if (ucs4str.empty()) return vector(); return ucs4_to_utf8(&ucs4str[0], ucs4str.size()); } vector ucs4_to_utf8(char_type const * ucs4str, size_t ls) { static IconvProcessor processor("UTF-8", ucs4_codeset); return iconv_convert(processor, ucs4str, ls); } vector eightbit_to_ucs4(char const * s, size_t ls, string const & encoding) { static map processors; if (processors.find(encoding) == processors.end()) { IconvProcessor processor(ucs4_codeset, encoding.c_str()); processors.insert(make_pair(encoding, processor)); } return iconv_convert(processors[encoding], s, ls); } vector ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding) { static map processors; if (processors.find(encoding) == processors.end()) { IconvProcessor processor(encoding.c_str(), ucs4_codeset); processors.insert(make_pair(encoding, processor)); } return iconv_convert(processors[encoding], ucs4str, ls); } char ucs4_to_eightbit(char_type ucs4, string const & encoding) { static map processors; map::iterator it = processors.find(encoding); if (it == processors.end()) { IconvProcessor processor(encoding.c_str(), ucs4_codeset); it = processors.insert(make_pair(encoding, processor)).first; } char out; int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1); if (bytes > 0) return out; return 0; } void ucs4_to_multibytes(char_type ucs4, vector & out, string const & encoding) { static map processors; map::iterator it = processors.find(encoding); if (it == processors.end()) { IconvProcessor processor(encoding.c_str(), ucs4_codeset); it = processors.insert(make_pair(encoding, processor)).first; } out.resize(4); int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4); if (bytes > 0) out.resize(bytes); else out.clear(); } } // namespace lyx