lyx_mirror/src/support/unicode.cpp

378 lines
9.7 KiB
C++
Raw Normal View History

/**
Rename files in src/support, step one. src/support/package.h src/support/Package.h Package src/support/package.C.in src/support/Package.C.in Package src/support/path.h src/support/Path.h Path src/support/fs_extras.h src/support/fs_extras.h NOCLASSES src/support/RandomAccessList.h src/support/RandomAccessList.h RandomAccessList src/support/lyxmanip.h src/support/lyxmanip.h NOCLASSES src/support/rename.C src/support/rename.cpp NOCLASSES src/support/abort.C src/support/abort.cpp NOCLASSES src/support/lyxlib.h src/support/lyxlib.h NOCLASSES src/support/ExceptionMessage.h src/support/ExceptionMessage.h ExceptionMessage src/support/copy.C src/support/copy.cpp NOCLASSES src/support/limited_stack.h src/support/limited_stack.h limited_stack src/support/filefilterlist.C src/support/FileFilterList.cpp ['FileFilterList', 'Filter'] src/support/cow_ptr.h src/support/cow_ptr.h cow_ptr src/support/os_unix.C src/support/os_unix.cpp NOCLASSES src/support/socktools.h src/support/socktools.h NOCLASSES src/support/forkedcontr.h src/support/ForkedcallsController.h ForkedcallsController src/support/os.h src/support/os.h NOCLASSES src/support/FileMonitor.h src/support/FileMonitor.h FileMonitor src/support/copied_ptr.h src/support/copied_ptr.h copied_ptr src/support/translator.h src/support/Translator.h Translator src/support/filetools.C src/support/filetools.cpp NOCLASSES src/support/unlink.C src/support/unlink.cpp NOCLASSES src/support/os_win32.C src/support/os_win32.cpp GetFolderPath src/support/lstrings.C src/support/lstrings.cpp NOCLASSES src/support/qstring_helpers.C src/support/qstring_helpers.cpp NOCLASSES src/support/getcwd.C src/support/getcwd.cpp NOCLASSES src/support/systemcall.C src/support/Systemcall.cpp Systemcall src/support/lyxalgo.h src/support/lyxalgo.h NOCLASSES src/support/filefilterlist.h src/support/FileFilterList.h ['FileFilterList', 'Filter'] src/support/unicode.C src/support/unicode.cpp IconvProcessor src/support/userinfo.C src/support/userinfo.cpp NOCLASSES 
src/support/lyxtime.C src/support/lyxtime.cpp NOCLASSES src/support/kill.C src/support/kill.cpp NOCLASSES src/support/docstring.C src/support/docstring.cpp to_local8bit_failure src/support/os_cygwin.C src/support/os_cygwin.cpp NOCLASSES src/support/lyxsum.C src/support/lyxsum.cpp NOCLASSES src/support/environment.C src/support/environment.cpp NOCLASSES src/support/filetools.h src/support/filetools.h NOCLASSES src/support/textutils.C src/support/textutils.cpp NOCLASSES src/support/mkdir.C src/support/mkdir.cpp NOCLASSES src/support/forkedcall.C src/support/Forkedcall.cpp ['ForkedProcess', 'Forkedcall'] src/support/tempname.C src/support/tempname.cpp NOCLASSES src/support/os_win32.h src/support/os_win32.h GetFolderPath src/support/types.h src/support/types.h NOCLASSES src/support/lstrings.h src/support/lstrings.h NOCLASSES src/support/forkedcallqueue.C src/support/ForkedCallQueue.cpp ForkedCallQueue src/support/qstring_helpers.h src/support/qstring_helpers.h NOCLASSES src/support/convert.C src/support/convert.cpp NOCLASSES src/support/filename.C src/support/FileName.cpp ['FileName', 'DocFileName'] src/support/tests/convert.C src/support/tests/convert.cpp NOCLASSES src/support/tests/filetools.C src/support/tests/filetools.cpp NOCLASSES src/support/tests/lstrings.C src/support/tests/lstrings.cpp NOCLASSES src/support/tests/boost.C src/support/tests/boost.cpp NOCLASSES src/support/docstream.C src/support/docstream.cpp ['iconv_codecvt_facet_exception', 'idocfstream', 'odocfstream'] src/support/std_istream.h src/support/std_istream.h NOCLASSES src/support/systemcall.h src/support/Systemcall.h Systemcall src/support/chdir.C src/support/chdir.cpp NOCLASSES src/support/std_ostream.h src/support/std_ostream.h NOCLASSES src/support/unicode.h src/support/unicode.h IconvProcessor src/support/path.C src/support/Path.cpp Path src/support/fs_extras.C src/support/fs_extras.cpp NOCLASSES src/support/userinfo.h src/support/userinfo.h NOCLASSES src/support/lyxtime.h 
src/support/lyxtime.h NOCLASSES src/support/docstring.h src/support/docstring.h to_local8bit_failure src/support/debugstream.h src/support/debugstream.h basic_debugstream src/support/environment.h src/support/environment.h NOCLASSES src/support/textutils.h src/support/textutils.h NOCLASSES src/support/forkedcall.h src/support/Forkedcall.h ['ForkedProcess', 'Forkedcall'] src/support/socktools.C src/support/socktools.cpp NOCLASSES src/support/forkedcallqueue.h src/support/ForkedCallQueue.h ForkedCallQueue src/support/forkedcontr.C src/support/ForkedcallsController.cpp ForkedcallsController src/support/os.C src/support/os.cpp NOCLASSES src/support/convert.h src/support/convert.h NOCLASSES src/support/filename.h src/support/FileName.h ['FileName', 'DocFileName'] src/support/docstream.h src/support/docstream.h ['iconv_codecvt_facet_exception', 'idocfstream', 'odocfstream'] src/support/FileMonitor.C src/support/FileMonitor.cpp FileMonitor git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@18024 a592a061-630c-0410-9148-cb99ea01b6c8
2007-04-26 05:12:52 +00:00
* \file unicode.cpp
* This file is part of LyX, the document processor.
* Licence details can be found in the file COPYING.
*
* \author Lars Gullik Bjønnes
*
* Full author contact details are available in file CREDITS.
*
* A collection of unicode conversion functions, using iconv.
*/
#include <config.h>
#include "support/unicode.h"
#include "support/debug.h"
#include <QThreadStorage>
#include <iconv.h>
#include <cerrno>
#include <cstdint>
#include <map>
#include <ostream>
//Needed in MSVC
#include <string>
using namespace std;
namespace {

/// Name passed to iconv for UTF-16; the variant is selected at compile
/// time by WORDS_BIGENDIAN.
char const * utf16_codeset =
#ifdef WORDS_BIGENDIAN
	"UTF16-BE";
#else
	"UTF16-LE";
#endif

}
namespace lyx {
/// Name passed to iconv for UCS-4; the variant is selected at compile
/// time by WORDS_BIGENDIAN.
char const * ucs4_codeset =
#ifdef WORDS_BIGENDIAN
	"UCS-4BE";
#else
	"UCS-4LE";
#endif
/// Holds an open iconv conversion descriptor and closes it on destruction.
struct IconvProcessor::Handler {
	// assumes cd is valid
	explicit Handler(iconv_t const cd) : cd(cd) {}

	// The descriptor must be closed exactly once, so copying is forbidden.
	Handler(Handler const &) = delete;
	Handler & operator=(Handler const &) = delete;

	~Handler() {
		if (iconv_close(cd) == -1) {
			// Pick up errno before any stream output gets a chance
			// to modify it.
			int const err = errno;
			LYXERR0("Error returned from iconv_close(" << err << ')');
		}
	}

	/// The descriptor returned by iconv_open().
	iconv_t const cd;
};
IconvProcessor::IconvProcessor(string tocode, string fromcode)
: tocode_(std::move(tocode)), fromcode_(std::move(fromcode))
{}
/**
 * Open the iconv descriptor for converting fromcode_ to tocode_.
 *
 * \return true if a descriptor is already open or could be opened,
 * false (after logging the reason to lyxerr) otherwise.
 */
bool IconvProcessor::init()
{
	if (h_)
		return true;

	iconv_t cd = iconv_open(tocode_.c_str(), fromcode_.c_str());
	if (cd != (iconv_t)(-1)) {
		h_ = make_unique<Handler>(cd);
		return true;
	}

	// Save errno first: writing to lyxerr below may change it.
	int const err = errno;

	lyxerr << "Error returned from iconv_open" << endl;
	switch (err) {
	case EINVAL:
		lyxerr << "EINVAL The conversion from " << fromcode_ << " to "
		       << tocode_ << " is not supported by the implementation."
		       << endl;
		break;
	default:
		lyxerr << "\tSome other error: " << err << endl;
		break;
	}
	return false;
}
/**
 * Convert \p buflen bytes at \p buf from fromcode_ to tocode_, writing at
 * most \p maxoutsize bytes to \p outbuf.
 *
 * \return the number of bytes written to \p outbuf, 0 if \p buflen is 0,
 * or -1 if the converter could not be opened or iconv() reported an error.
 * After a conversion error the descriptor is released, so the next call
 * opens a fresh one through init().
 */
int IconvProcessor::convert(char const * buf, size_t buflen,
		char * outbuf, size_t maxoutsize)
{
	if (buflen == 0)
		return 0;

	if (!h_ && !init())
		return -1;

	char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
	size_t inbytesleft = buflen;
	size_t outbytesleft = maxoutsize;

	// iconv() returns a size_t and reports failure as (size_t)(-1).
	size_t const res = iconv(h_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
	// Keep the failure reason now: both the flush call and the log
	// output below may overwrite errno.
	int const err = errno;

	// flush out remaining data. This is needed because iconv sometimes
	// holds back chars in the stream, waiting for a combination character
	// (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124)
	iconv(h_->cd, nullptr, nullptr, &outbuf, &outbytesleft);

	if (res != static_cast<size_t>(-1))
		// Everything went well.
		return static_cast<int>(maxoutsize - outbytesleft);

	// There are some errors in the conversion
	lyxerr << "Error returned from iconv" << endl;
	switch (err) {
	case E2BIG:
		lyxerr << "E2BIG  There is not sufficient room at *outbuf." << endl;
		break;
	case EILSEQ:
	case EINVAL:
		lyxerr << (err == EINVAL
			   ? "EINVAL An incomplete "
			   : "EILSEQ An invalid ")
		       << "multibyte sequence has been encountered in the input.\n"
		       << "When converting from " << fromcode_
		       << " to " << tocode_ << ".\n";
		lyxerr << "Input:" << hex;
		for (size_t i = 0; i < buflen; ++i) {
			// char may be signed, avoid output of
			// something like 0xffffffc2
			uint32_t const b =
				*reinterpret_cast<unsigned char const *>(buf + i);
			lyxerr << " 0x" << static_cast<unsigned int>(b);
		}
		lyxerr << dec << endl;
		break;
	default:
		lyxerr << "\tSome other error: " << err << endl;
		break;
	}

	// We got an error so we close down the conversion engine
	h_.reset();
	return -1;
}
namespace {
template<typename RetType, typename InType>
vector<RetType>
iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen)
{
if (buflen == 0)
return vector<RetType>();
char const * inbuf = reinterpret_cast<char const *>(buf);
size_t inbytesleft = buflen * sizeof(InType);
static QThreadStorage<std::vector<char> *> static_outbuf;
if (!static_outbuf.hasLocalData())
static_outbuf.setLocalData(new std::vector<char>(32768));
std::vector<char> & outbuf = *static_outbuf.localData();
// The number of UCS4 code points in buf is at most inbytesleft.
// The output encoding will use at most
// max_encoded_bytes(pimpl_->tocode_) per UCS4 code point.
size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft;
if (outbuf.size() < maxoutbufsize)
outbuf.resize(maxoutbufsize);
int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size());
if (bytes <= 0)
// Conversion failed
// FIXME Maybe throw an exception and handle that in the caller?
return vector<RetType>();
RetType const * tmp = reinterpret_cast<RetType const *>(&outbuf[0]);
return vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
}
Bulk cleanup/fix incorrect annotation at the end of namespaces. This commit does a bulk fix of incorrect annotations (comments) at the end of namespaces. The commit was generated by initially running clang-format, and then from the diff of the result extracting the hunks corresponding to fixes of namespace comments. The changes being applied and all the results have been manually reviewed. The source code successfully builds on macOS. Further details on the steps below, in case they're of interest to someone else in the future. 1. Checkout a fresh and up to date version of src/ git pull && git checkout -- src && git status src 2. Ensure there's a suitable .clang-format in place, i.e. with options to fix the comment at the end of namespaces, including: FixNamespaceComments: true SpacesBeforeTrailingComments: 1 and that clang-format is >= 5.0.0, by doing e.g.: clang-format -dump-config | grep Comments: clang-format --version 3. Apply clang-format to the source: clang-format -i $(find src -name "*.cpp" -or -name "*.h") 4. Create and filter out hunks related to fixing the namespace git diff -U0 src > tmp.patch grepdiff '^} // namespace' --output-matching=hunk tmp.patch > fix_namespace.patch 5. Filter out hunks corresponding to simple fixes into to a separate patch: pcregrep -M -e '^diff[^\n]+\nindex[^\n]+\n--- [^\n]+\n\+\+\+ [^\n]+\n' \ -e '^@@ -[0-9]+ \+[0-9]+ @@[^\n]*\n-\}[^\n]*\n\+\}[^\n]*\n' \ fix_namespace.patch > fix_namespace_simple.patch 6. Manually review the simple patch and then apply it, after first restoring the source. git checkout -- src patch -p1 < fix_namespace_simple.path 7. Manually review the (simple) changes and then stage the changes git diff src git add src 8. Again apply clang-format and filter out hunks related to any remaining fixes to the namespace, this time filter with more context. 
There will be fewer hunks as all the simple cases have already been handled: clang-format -i $(find src -name "*.cpp" -or -name "*.h") git diff src > tmp.patch grepdiff '^} // namespace' --output-matching=hunk tmp.patch > fix_namespace2.patch 9. Manually review/edit the resulting patch file to remove hunks for files which need to be dealt with manually, noting the file names and line numbers. Then restore files to as before applying clang-format and apply the patch: git checkout src patch -p1 < fix_namespace2.patch 10. Manually fix the files noted in the previous step. Stage files, review changes and commit.
2017-07-23 11:11:54 +00:00
} // namespace
/// Return the calling thread's UTF-8 to UCS-4 converter, creating it on
/// first use.
IconvProcessor & utf8ToUcs4()
{
	static QThreadStorage<IconvProcessor *> per_thread;
	if (per_thread.hasLocalData())
		return *per_thread.localData();

	per_thread.setLocalData(new IconvProcessor(ucs4_codeset, "UTF-8"));
	return *per_thread.localData();
}
/// Convert the UTF-8 bytes in \p utf8str to UCS-4. An empty input gives
/// an empty result without touching the converter.
vector<char_type> utf8_to_ucs4(vector<char> const & utf8str)
{
	return utf8str.empty()
		? vector<char_type>()
		: utf8_to_ucs4(&utf8str[0], utf8str.size());
}
/// Convert \p ls bytes of UTF-8 starting at \p utf8str to UCS-4.
vector<char_type>
utf8_to_ucs4(char const * utf8str, size_t ls)
{
	IconvProcessor & converter = utf8ToUcs4();
	return iconv_convert<char_type>(converter, utf8str, ls);
}
/// Convert \p ls UTF-16 code units starting at \p s to UCS-4, using a
/// converter kept in thread-local storage.
vector<char_type>
utf16_to_ucs4(unsigned short const * s, size_t ls)
{
	static QThreadStorage<IconvProcessor *> per_thread;
	if (!per_thread.hasLocalData()) {
		IconvProcessor * const fresh =
			new IconvProcessor(ucs4_codeset, utf16_codeset);
		per_thread.setLocalData(fresh);
	}
	IconvProcessor & converter = *per_thread.localData();
	return iconv_convert<char_type>(converter, s, ls);
}
/// Convert \p ls UCS-4 code points starting at \p s to UTF-16 code units,
/// using a converter kept in thread-local storage.
vector<unsigned short>
ucs4_to_utf16(char_type const * s, size_t ls)
{
	static QThreadStorage<IconvProcessor *> per_thread;
	if (!per_thread.hasLocalData()) {
		IconvProcessor * const fresh =
			new IconvProcessor(utf16_codeset, ucs4_codeset);
		per_thread.setLocalData(fresh);
	}
	IconvProcessor & converter = *per_thread.localData();
	return iconv_convert<unsigned short>(converter, s, ls);
}
/// Return the calling thread's UCS-4 to UTF-8 converter, creating it on
/// first use.
IconvProcessor & ucs4ToUtf8()
{
	static QThreadStorage<IconvProcessor *> per_thread;
	if (per_thread.hasLocalData())
		return *per_thread.localData();

	per_thread.setLocalData(new IconvProcessor("UTF-8", ucs4_codeset));
	return *per_thread.localData();
}
namespace {
IconvProcessor & getProc(map<string, IconvProcessor> & processors,
string const & encoding, bool to)
{
string const & fromcode = to ? ucs4_codeset : encoding;
string const & tocode = to ? encoding : ucs4_codeset;
map<string, IconvProcessor>::iterator const it = processors.find(encoding);
if (it == processors.end()) {
IconvProcessor p(fromcode, tocode);
return processors.insert(make_pair(encoding, std::move(p))).first->second;
}
return it->second;
}
Bulk cleanup/fix incorrect annotation at the end of namespaces. This commit does a bulk fix of incorrect annotations (comments) at the end of namespaces. The commit was generated by initially running clang-format, and then from the diff of the result extracting the hunks corresponding to fixes of namespace comments. The changes being applied and all the results have been manually reviewed. The source code successfully builds on macOS. Further details on the steps below, in case they're of interest to someone else in the future. 1. Checkout a fresh and up to date version of src/ git pull && git checkout -- src && git status src 2. Ensure there's a suitable .clang-format in place, i.e. with options to fix the comment at the end of namespaces, including: FixNamespaceComments: true SpacesBeforeTrailingComments: 1 and that clang-format is >= 5.0.0, by doing e.g.: clang-format -dump-config | grep Comments: clang-format --version 3. Apply clang-format to the source: clang-format -i $(find src -name "*.cpp" -or -name "*.h") 4. Create and filter out hunks related to fixing the namespace git diff -U0 src > tmp.patch grepdiff '^} // namespace' --output-matching=hunk tmp.patch > fix_namespace.patch 5. Filter out hunks corresponding to simple fixes into to a separate patch: pcregrep -M -e '^diff[^\n]+\nindex[^\n]+\n--- [^\n]+\n\+\+\+ [^\n]+\n' \ -e '^@@ -[0-9]+ \+[0-9]+ @@[^\n]*\n-\}[^\n]*\n\+\}[^\n]*\n' \ fix_namespace.patch > fix_namespace_simple.patch 6. Manually review the simple patch and then apply it, after first restoring the source. git checkout -- src patch -p1 < fix_namespace_simple.path 7. Manually review the (simple) changes and then stage the changes git diff src git add src 8. Again apply clang-format and filter out hunks related to any remaining fixes to the namespace, this time filter with more context. 
There will be fewer hunks as all the simple cases have already been handled: clang-format -i $(find src -name "*.cpp" -or -name "*.h") git diff src > tmp.patch grepdiff '^} // namespace' --output-matching=hunk tmp.patch > fix_namespace2.patch 9. Manually review/edit the resulting patch file to remove hunks for files which need to be dealt with manually, noting the file names and line numbers. Then restore files to as before applying clang-format and apply the patch: git checkout src patch -p1 < fix_namespace2.patch 10. Manually fix the files noted in the previous step. Stage files, review changes and commit.
2017-07-23 11:11:54 +00:00
} // namespace
/// Convert the single UCS-4 code point \p c to its UTF-8 byte sequence.
vector<char>
ucs4_to_utf8(char_type c)
{
	char_type const * const single = &c;
	return iconv_convert<char>(ucs4ToUtf8(), single, 1);
}
/// Convert the UCS-4 code points in \p ucs4str to UTF-8. An empty input
/// gives an empty result without touching the converter.
vector<char>
ucs4_to_utf8(vector<char_type> const & ucs4str)
{
	return ucs4str.empty()
		? vector<char>()
		: ucs4_to_utf8(&ucs4str[0], ucs4str.size());
}
/// Convert \p ls UCS-4 code points starting at \p ucs4str to UTF-8.
vector<char>
ucs4_to_utf8(char_type const * ucs4str, size_t ls)
{
	IconvProcessor & converter = ucs4ToUtf8();
	return iconv_convert<char>(converter, ucs4str, ls);
}
/// Convert \p ls bytes at \p s, encoded in \p encoding, to UCS-4.
/// Converters are cached per encoding in thread-local storage.
vector<char_type>
eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
{
	typedef map<string, IconvProcessor> ProcessorMap;

	static QThreadStorage<ProcessorMap *> per_thread;
	if (!per_thread.hasLocalData())
		per_thread.setLocalData(new ProcessorMap);

	IconvProcessor & converter =
		getProc(*per_thread.localData(), encoding, true);
	return iconv_convert<char_type>(converter, s, ls);
}
namespace {
map<string, IconvProcessor> & ucs4To8bitProcessors()
{
static QThreadStorage<map<string, IconvProcessor> *> processors;
if (!processors.hasLocalData())
processors.setLocalData(new map<string, IconvProcessor>);
return *processors.localData();
}
Bulk cleanup/fix incorrect annotation at the end of namespaces. This commit does a bulk fix of incorrect annotations (comments) at the end of namespaces. The commit was generated by initially running clang-format, and then from the diff of the result extracting the hunks corresponding to fixes of namespace comments. The changes being applied and all the results have been manually reviewed. The source code successfully builds on macOS. Further details on the steps below, in case they're of interest to someone else in the future. 1. Checkout a fresh and up to date version of src/ git pull && git checkout -- src && git status src 2. Ensure there's a suitable .clang-format in place, i.e. with options to fix the comment at the end of namespaces, including: FixNamespaceComments: true SpacesBeforeTrailingComments: 1 and that clang-format is >= 5.0.0, by doing e.g.: clang-format -dump-config | grep Comments: clang-format --version 3. Apply clang-format to the source: clang-format -i $(find src -name "*.cpp" -or -name "*.h") 4. Create and filter out hunks related to fixing the namespace git diff -U0 src > tmp.patch grepdiff '^} // namespace' --output-matching=hunk tmp.patch > fix_namespace.patch 5. Filter out hunks corresponding to simple fixes into to a separate patch: pcregrep -M -e '^diff[^\n]+\nindex[^\n]+\n--- [^\n]+\n\+\+\+ [^\n]+\n' \ -e '^@@ -[0-9]+ \+[0-9]+ @@[^\n]*\n-\}[^\n]*\n\+\}[^\n]*\n' \ fix_namespace.patch > fix_namespace_simple.patch 6. Manually review the simple patch and then apply it, after first restoring the source. git checkout -- src patch -p1 < fix_namespace_simple.path 7. Manually review the (simple) changes and then stage the changes git diff src git add src 8. Again apply clang-format and filter out hunks related to any remaining fixes to the namespace, this time filter with more context. 
There will be fewer hunks as all the simple cases have already been handled: clang-format -i $(find src -name "*.cpp" -or -name "*.h") git diff src > tmp.patch grepdiff '^} // namespace' --output-matching=hunk tmp.patch > fix_namespace2.patch 9. Manually review/edit the resulting patch file to remove hunks for files which need to be dealt with manually, noting the file names and line numbers. Then restore files to as before applying clang-format and apply the patch: git checkout src patch -p1 < fix_namespace2.patch 10. Manually fix the files noted in the previous step. Stage files, review changes and commit.
2017-07-23 11:11:54 +00:00
} // namespace
/// Convert \p ls UCS-4 code points starting at \p ucs4str to \p encoding.
vector<char>
ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
{
	IconvProcessor & converter =
		getProc(ucs4To8bitProcessors(), encoding, false);
	return iconv_convert<char>(converter, ucs4str, ls);
}
/// Convert the single code point \p ucs4 to one byte in \p encoding.
/// Returns 0 when the conversion fails or produces no output.
char ucs4_to_eightbit(char_type ucs4, string const & encoding)
{
	IconvProcessor & converter =
		getProc(ucs4To8bitProcessors(), encoding, false);

	char result;
	char const * const raw = reinterpret_cast<char const *>(&ucs4);
	// Room for exactly one output byte.
	if (converter.convert(raw, 4, &result, 1) > 0)
		return result;
	return 0;
}
/// Convert the single code point \p ucs4 to \p encoding and store the
/// resulting bytes (at most 4) in \p out. \p out is left empty when the
/// conversion fails or produces no output.
void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
	string const & encoding)
{
	typedef map<string, IconvProcessor> ProcessorMap;

	static QThreadStorage<ProcessorMap *> per_thread;
	if (!per_thread.hasLocalData())
		per_thread.setLocalData(new ProcessorMap);

	IconvProcessor & converter =
		getProc(*per_thread.localData(), encoding, false);

	out.resize(4);
	char const * const raw = reinterpret_cast<char const *>(&ucs4);
	int const written = converter.convert(raw, 4, out.data(), 4);
	if (written <= 0) {
		out.clear();
		return;
	}
	out.resize(written);
}
/**
 * Return the maximum number of bytes that \p encoding uses to encode one
 * UCS4 code point. Encodings not listed here are reported as 1.
 */
int max_encoded_bytes(std::string const & encoding)
{
	// FIXME: this information should be transferred to lib/encodings
	// UTF8 uses at most 4 bytes to represent one UCS4 code point
	// (see RFC 3629). RFC 2279 specifies 6 bytes, but that
	// information is outdated, and RFC 2279 has been superseded by
	// RFC 3629.
	// The CJK encodings use (different) multibyte representation as well.
	// All other encodings encode one UCS4 code point in one byte
	// (and can therefore only encode a subset of UCS4)
	// Furthermore, all encodings that use shifting (like SJIS) do not work with
	// iconv_codecvt_facet.
	if (encoding == "UTF-8" ||
	    encoding == "GB18030" ||
	    encoding == "EUC-TW")
		return 4;
	else if (encoding == "EUC-JP")
		return 3;
	else if (encoding == "ISO-2022-JP")
		return 8;
	else if (encoding == "BIG5" ||
		 encoding == "EUC-KR" ||
		 encoding == "EUC-CN" ||
		 encoding == "SJIS" ||
		 encoding == "GBK")
		return 2;
	else
		return 1;
}
} // namespace lyx