lyx_mirror/src/support/unicode.cpp

/**
 * \file unicode.cpp
 * This file is part of LyX, the document processor.
 * Licence details can be found in the file COPYING.
 *
 * \author Lars Gullik Bjønnes
 *
 * Full author contact details are available in file CREDITS.
 *
 * A collection of unicode conversion functions, using iconv.
 */

#include <config.h>

#include "support/unicode.h"
#include "support/debug.h"
#include "support/mutex.h"

#include <iconv.h>

#include <boost/cstdint.hpp>

#include <cerrno>
#include <iomanip>
#include <map>
#include <ostream>
#include <string>


using namespace std;

namespace {

#ifdef WORDS_BIGENDIAN
	char const * utf16_codeset = "UTF16-BE";
#else
	char const * utf16_codeset = "UTF16-LE";
#endif

}


namespace lyx {

#ifdef WORDS_BIGENDIAN
	char const * ucs4_codeset = "UCS-4BE";
#else
	char const * ucs4_codeset = "UCS-4LE";
#endif

static const iconv_t invalid_cd = (iconv_t)(-1);


struct IconvProcessor::Impl
{
	Impl(string const & to, string const & from)
		: cd(invalid_cd), tocode_(to), fromcode_(from)
	{}

	~Impl()
	{
		if (cd != invalid_cd && iconv_close(cd) == -1)
				LYXERR0("Error returned from iconv_close(" << errno << ")");
	}

	iconv_t cd;
	string tocode_;
	string fromcode_;

	Mutex mutex_; // iconv() is not thread save, see #7240
};


IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode)
	: pimpl_(new IconvProcessor::Impl(tocode, fromcode))
{
}


IconvProcessor::IconvProcessor(IconvProcessor const & other)
	: pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_))
{
}


IconvProcessor::~IconvProcessor()
{
	delete pimpl_;
}


void IconvProcessor::operator=(IconvProcessor const & other)
{
	if (&other != this)
		pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_);
}


bool IconvProcessor::init()
{
	if (pimpl_->cd != invalid_cd)
		return true;

	pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str());
	if (pimpl_->cd != invalid_cd)
		return true;

	lyxerr << "Error returned from iconv_open" << endl;
	switch (errno) {
		case EINVAL:
			lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_
				<< " to " << pimpl_->tocode_
				<< " is not supported by the implementation."
				<< endl;
			break;
		default:
			lyxerr << "\tSome other error: " << errno << endl;
			break;
	}
	return false;
}


int IconvProcessor::convert(char const * buf, size_t buflen,
		char * outbuf, size_t maxoutsize)
{
	Mutex::Locker lock(&pimpl_->mutex_);

	if (buflen == 0)
		return 0;

	if (pimpl_->cd == invalid_cd) {
		if (!init())
			return -1;
	}

	char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
	size_t inbytesleft = buflen;
	size_t outbytesleft = maxoutsize;

	int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);

	// flush out remaining data. This is needed because iconv sometimes
	// holds back chars in the stream, waiting for a combination character
	// (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124)
	iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft);

	//lyxerr << dec;
	//lyxerr << "Inbytesleft: " << inbytesleft << endl;
	//lyxerr << "Outbytesleft: " << outbytesleft << endl;

	if (res != -1)
		// Everything went well.
		return maxoutsize - outbytesleft;

	// There are some errors in the conversion
	lyxerr << "Error returned from iconv" << endl;
	switch (errno) {
		case E2BIG:
			lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
			break;
		case EILSEQ:
			lyxerr << "EILSEQ An invalid multibyte sequence"
				<< " has been encountered in the input.\n"
				<< "When converting from " << pimpl_->fromcode_
				<< " to " << pimpl_->tocode_ << ".\n";
			lyxerr << "Input:" << hex;
			for (size_t i = 0; i < buflen; ++i) {
				// char may be signed, avoid output of
				// something like 0xffffffc2
				boost::uint32_t const b =
					*reinterpret_cast<unsigned char const *>(buf + i);
				lyxerr << " 0x" << (unsigned int)b;
			}
			lyxerr << dec << endl;
			break;
		case EINVAL:
			lyxerr << "EINVAL An incomplete multibyte sequence"
				<< " has been encountered in the input.\n"
				<< "When converting from " << pimpl_->fromcode_
				<< " to " << pimpl_->tocode_ << ".\n";
			lyxerr << "Input:" << hex;
			for (size_t i = 0; i < buflen; ++i) {
				// char may be signed, avoid output of
				// something like 0xffffffc2
				boost::uint32_t const b =
					*reinterpret_cast<unsigned char const *>(buf + i);
				lyxerr << " 0x" << (unsigned int)b;
			}
			lyxerr << dec << endl;
			break;
		default:
			lyxerr << "\tSome other error: " << errno << endl;
			break;
	}
	// We got an error so we close down the conversion engine
	if (iconv_close(pimpl_->cd) == -1) {
		lyxerr << "Error returned from iconv_close("
			<< errno << ")" << endl;
	}
	pimpl_->cd = invalid_cd;
	return -1;
}


std::string IconvProcessor::from() const
{
	return pimpl_->fromcode_;
}


std::string IconvProcessor::to() const
{
	return pimpl_->tocode_;
}


namespace {


template<typename RetType, typename InType>
vector<RetType>
iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
{
	if (buflen == 0)
		return vector<RetType>();

	char const * inbuf = reinterpret_cast<char const *>(buf);
	size_t inbytesleft = buflen * sizeof(InType);

	static std::vector<char> outbuf(32768);
	// The number of UCS4 code points in buf is at most inbytesleft.
	// The output encoding will use at most
	// max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
	size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
	if (outbuf.size() < maxoutbufsize)
		outbuf.resize(maxoutbufsize);

	int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size());
	if (bytes <= 0)
		// Conversion failed
		// FIXME Maybe throw an exception and handle that in the caller?
		return vector<RetType>();

	RetType const * tmp = reinterpret_cast<RetType const *>(&outbuf[0]);
	return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
}

} // anon namespace


vector<char_type> utf8_to_ucs4(vector<char> const & utf8str)
{
	if (utf8str.empty())
		return vector<char_type>();

	return utf8_to_ucs4(&utf8str[0], utf8str.size());
}


vector<char_type>
utf8_to_ucs4(char const * utf8str, size_t ls)
{
	static IconvProcessor processor(ucs4_codeset, "UTF-8");
	return iconv_convert<char_type>(processor, utf8str, ls);
}


vector<char_type>
utf16_to_ucs4(unsigned short const * s, size_t ls)
{
	static IconvProcessor processor(ucs4_codeset, utf16_codeset);
	return iconv_convert<char_type>(processor, s, ls);
}


vector<unsigned short>
ucs4_to_utf16(char_type const * s, size_t ls)
{
	static IconvProcessor processor(utf16_codeset, ucs4_codeset);
	return iconv_convert<unsigned short>(processor, s, ls);
}


vector<char>
ucs4_to_utf8(char_type c)
{
	static IconvProcessor processor("UTF-8", ucs4_codeset);
	return iconv_convert<char>(processor, &c, 1);
}


vector<char>
ucs4_to_utf8(vector<char_type> const & ucs4str)
{
	if (ucs4str.empty())
		return vector<char>();

	return ucs4_to_utf8(&ucs4str[0], ucs4str.size());
}


vector<char>
ucs4_to_utf8(char_type const * ucs4str, size_t ls)
{
	static IconvProcessor processor("UTF-8", ucs4_codeset);
	return iconv_convert<char>(processor, ucs4str, ls);
}


vector<char_type>
eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
{
	static map<string, IconvProcessor> processors;
	if (processors.find(encoding) == processors.end()) {
		IconvProcessor processor(ucs4_codeset, encoding.c_str());
		processors.insert(make_pair(encoding, processor));
	}
	return iconv_convert<char_type>(processors[encoding], s, ls);
}


vector<char>
ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
{
	static map<string, IconvProcessor> processors;
	if (processors.find(encoding) == processors.end()) {
		IconvProcessor processor(encoding.c_str(), ucs4_codeset);
		processors.insert(make_pair(encoding, processor));
	}
	return iconv_convert<char>(processors[encoding], ucs4str, ls);
}


char ucs4_to_eightbit(char_type ucs4, string const & encoding)
{
	static map<string, IconvProcessor> processors;
	map<string, IconvProcessor>::iterator it = processors.find(encoding);
	if (it == processors.end()) {
		IconvProcessor processor(encoding.c_str(), ucs4_codeset);
		it = processors.insert(make_pair(encoding, processor)).first;
	}

	char out;
	int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1);
	if (bytes > 0)
		return out;
	return 0;
}


void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
	string const & encoding)
{
	static map<string, IconvProcessor> processors;
	map<string, IconvProcessor>::iterator it = processors.find(encoding);
	if (it == processors.end()) {
		IconvProcessor processor(encoding.c_str(), ucs4_codeset);
		it = processors.insert(make_pair(encoding, processor)).first;
	}

	out.resize(4);
	int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4);
	if (bytes > 0)
		out.resize(bytes);
	else
		out.clear();
}

int max_encoded_bytes(std::string const & encoding)
{
	// FIXME: this information should be transferred to lib/encodings
	// UTF8 uses at most 4 bytes to represent one UCS4 code point
	// (see RFC 3629). RFC 2279 specifies 6 bytes, but that
	// information is outdated, and RFC 2279 has been superseded by
	// RFC 3629.
	// The CJK encodings use (different) multibyte representation as well.
	// All other encodings encode one UCS4 code point in one byte
	// (and can therefore only encode a subset of UCS4)
	// Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings).
	// Furthermore, all encodings that use shifting (like SJIS) do not work with
	// iconv_codecvt_facet.
	if (encoding == "UTF-8" ||
	    encoding == "GB" ||
	    encoding == "EUC-TW")
		return 4;
	else if (encoding == "EUC-JP")
		return 3;
	else if (encoding == "ISO-2022-JP")
		return 8;
	else if (encoding == "BIG5" ||
	         encoding == "EUC-KR" ||
	         encoding == "EUC-CN" ||
	         encoding == "SJIS" ||
	         encoding == "GBK")
		return 2;
	else
		return 1;
}

} // namespace lyx