2016-06-02 21:27:52 +00:00
|
|
|
// -*- C++ -*-
|
2006-08-13 22:54:59 +00:00
|
|
|
/**
|
|
|
|
* \file unicode.h
|
|
|
|
* This file is part of LyX, the document processor.
|
|
|
|
* Licence details can be found in the file COPYING.
|
|
|
|
*
|
2008-11-14 15:58:50 +00:00
|
|
|
* \author Lars Gullik Bjønnes
|
2006-08-13 22:54:59 +00:00
|
|
|
*
|
|
|
|
* Full author contact details are available in file CREDITS.
|
|
|
|
*
|
|
|
|
* A collection of unicode conversion functions, using iconv.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef LYX_SUPPORT_UNICODE_H
|
|
|
|
#define LYX_SUPPORT_UNICODE_H
|
|
|
|
|
2007-10-31 22:40:34 +00:00
|
|
|
#include "support/strfwd.h"
|
2016-06-02 21:27:52 +00:00
|
|
|
#include "support/unique_ptr.h"
|
2006-09-13 17:11:39 +00:00
|
|
|
|
2011-02-07 22:47:02 +00:00
|
|
|
#include <cstddef>
|
2016-06-02 21:27:52 +00:00
|
|
|
#include <string>
|
2006-08-13 22:54:59 +00:00
|
|
|
#include <vector>
|
|
|
|
|
2006-10-21 00:16:43 +00:00
|
|
|
|
|
|
|
namespace lyx {
|
|
|
|
|
2013-10-10 19:20:44 +00:00
|
|
|
/**
|
|
|
|
* Wrapper for iconv(3).
|
|
|
|
*
|
|
|
|
* According to the POSIX standard, all specified functions are thread-safe,
|
|
|
|
* with some exceptions. The iconv() function is not listed as an exception:
|
|
|
|
* http://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xsh_chap02.html#tag_22_02_09_09
|
|
|
|
* http://man7.org/linux/man-pages/man7/pthreads.7.html
|
|
|
|
*
|
|
|
|
* Therefore, you can use as many instances of this class in parallel as you
|
|
|
|
* like. However, you need to ensure that each instance is only used by one
|
|
|
|
* thread at any given time. If this condition is not met you get nasty
|
|
|
|
* mixtures of different thread data as in bug 7240.
|
|
|
|
*
|
|
|
|
* From a performance point of view it is best to use one static instance
|
|
|
|
* per thread for each in/out encoding pair. This can e.g. be achieved by
|
|
|
|
* using helpers for thread-local storage such as QThreadStorage or
|
|
|
|
* boost::thread_specific_ptr. A single static instance protected by a mutex
|
|
|
|
* would work as well, and might be preferable for exotic encoding pairs.
|
|
|
|
* Creating local IconvProcessor instances should be avoided because of the
|
|
|
|
* overhead in iconv_open().
|
|
|
|
*/
|
2006-10-29 21:59:59 +00:00
|
|
|
class IconvProcessor
|
|
|
|
{
|
2016-06-02 21:27:52 +00:00
|
|
|
/// open iconv.
|
|
|
|
/// \return true if the processor is ready to use.
|
|
|
|
bool init();
|
|
|
|
std::string const tocode_;
|
|
|
|
std::string const fromcode_;
|
|
|
|
struct Handler;
|
|
|
|
unique_ptr<Handler> h_;
|
2006-10-29 21:59:59 +00:00
|
|
|
public:
|
2016-06-02 21:27:52 +00:00
|
|
|
IconvProcessor(std::string tocode, std::string fromcode);
|
2006-10-29 21:59:59 +00:00
|
|
|
/// convert any data from \c fromcode to \c tocode unicode format.
|
|
|
|
/// \return the number of bytes of the converted output buffer.
|
2007-11-29 07:24:55 +00:00
|
|
|
int convert(char const * in_buffer, size_t in_size,
|
|
|
|
char * out_buffer, size_t max_out_size);
|
2009-05-19 15:27:54 +00:00
|
|
|
/// target encoding
|
2016-06-02 21:27:52 +00:00
|
|
|
std::string to() const { return tocode_; }
|
2019-06-12 13:03:18 +00:00
|
|
|
// required by g++ 4.7
|
|
|
|
IconvProcessor(IconvProcessor &&) = default;
|
2006-10-29 21:59:59 +00:00
|
|
|
};
|
|
|
|
|
2013-10-10 19:20:44 +00:00
|
|
|
/// Get the global IconvProcessor instance of the current thread for
|
|
|
|
/// utf8->ucs4 conversions
|
|
|
|
IconvProcessor & utf8ToUcs4();
|
|
|
|
|
2006-09-10 18:34:24 +00:00
|
|
|
// A single codepoint conversion for utf8_to_ucs4 does not make
|
|
|
|
// sense, so that function is left out.
|
|
|
|
|
2007-05-08 14:09:27 +00:00
|
|
|
/// Convert the UTF-8 encoded bytes in \p utf8str to UCS-4.
/// \return the converted text as a vector of char_type.
std::vector<char_type> utf8_to_ucs4(std::vector<char> const & utf8str);
|
2006-08-13 22:54:59 +00:00
|
|
|
|
2007-05-08 14:09:27 +00:00
|
|
|
/// Convert the UTF-8 encoded buffer \p utf8str of length \p ls to UCS-4.
/// NOTE(review): \p ls is presumably the number of chars in \p utf8str --
/// confirm against the definition.
/// \return the converted text as a vector of char_type.
std::vector<char_type> utf8_to_ucs4(char const * utf8str, size_t ls);
|
2006-09-10 18:34:24 +00:00
|
|
|
|
2006-12-04 13:50:46 +00:00
|
|
|
// utf16_to_ucs4
|
2006-09-10 18:34:24 +00:00
|
|
|
|
2006-12-04 13:50:46 +00:00
|
|
|
/// Convert the UTF-16 data in \p s (passed as unsigned short units) to UCS-4.
/// NOTE(review): \p ls is presumably the number of elements in \p s rather
/// than a byte count -- confirm against the definition.
/// \return the converted text as a vector of char_type.
std::vector<char_type> utf16_to_ucs4(unsigned short const * s, size_t ls);
|
2006-09-10 18:34:24 +00:00
|
|
|
|
2006-12-04 13:50:46 +00:00
|
|
|
// ucs4_to_utf16
|
2006-08-13 22:54:59 +00:00
|
|
|
|
2006-12-04 13:50:46 +00:00
|
|
|
/// Convert the UCS-4 data in \p s to UTF-16.
/// NOTE(review): \p ls is presumably the number of elements in \p s rather
/// than a byte count -- confirm against the definition.
/// \return the converted text as a vector of unsigned short units.
std::vector<unsigned short> ucs4_to_utf16(char_type const * s, size_t ls);
|
2006-08-13 22:54:59 +00:00
|
|
|
|
2013-10-10 19:20:44 +00:00
|
|
|
/// Get the global IconvProcessor instance of the current thread for
|
|
|
|
/// ucs4->utf8 conversions
|
|
|
|
IconvProcessor & ucs4ToUtf8();
|
|
|
|
|
2006-09-10 18:34:24 +00:00
|
|
|
// ucs4_to_utf8
|
|
|
|
|
2007-05-08 14:09:27 +00:00
|
|
|
/// Convert the single UCS-4 character \p c to UTF-8.
/// \return the UTF-8 encoded bytes.
std::vector<char> ucs4_to_utf8(char_type c);
|
2006-08-13 22:54:59 +00:00
|
|
|
|
2007-05-08 14:09:27 +00:00
|
|
|
/// Convert the UCS-4 characters in \p ucs4str to UTF-8.
/// \return the UTF-8 encoded bytes.
std::vector<char> ucs4_to_utf8(std::vector<char_type> const & ucs4str);
|
2006-08-13 22:54:59 +00:00
|
|
|
|
2007-05-08 14:09:27 +00:00
|
|
|
/// Convert the UCS-4 buffer \p ucs4str of length \p ls to UTF-8.
/// NOTE(review): \p ls is presumably the number of char_type elements in
/// \p ucs4str rather than a byte count -- confirm against the definition.
/// \return the UTF-8 encoded bytes.
std::vector<char> ucs4_to_utf8(char_type const * ucs4str, size_t ls);
|
2006-08-13 22:54:59 +00:00
|
|
|
|
2006-10-26 15:01:45 +00:00
|
|
|
/// convert \p s from encoding \p encoding to ucs4.
|
|
|
|
/// \p encoding must be a valid iconv 8bit encoding
|
2007-05-08 14:09:27 +00:00
|
|
|
std::vector<char_type>
|
2006-10-26 15:01:45 +00:00
|
|
|
eightbit_to_ucs4(char const * s, size_t ls, std::string const & encoding);
|
|
|
|
|
|
|
|
/// convert \p s from ucs4 to encoding \p encoding.
|
|
|
|
/// \p encoding must be a valid iconv 8bit encoding
|
2007-11-03 20:52:09 +00:00
|
|
|
std::vector<char> ucs4_to_eightbit(char_type const * ucs4str,
|
|
|
|
size_t ls, std::string const & encoding);
|
2006-10-26 15:01:45 +00:00
|
|
|
|
2007-05-14 09:41:00 +00:00
|
|
|
/// convert ucs4 character \p c to encoding \p encoding.
|
|
|
|
/// \p encoding must be a valid iconv 8bit encoding
|
|
|
|
char ucs4_to_eightbit(char_type c, std::string const & encoding);
|
|
|
|
|
|
|
|
///
|
|
|
|
void ucs4_to_multibytes(char_type ucs4, std::vector<char> & out,
|
|
|
|
std::string const & encoding);
|
|
|
|
|
2006-10-17 11:58:21 +00:00
|
|
|
/// Declared here, defined elsewhere.
/// NOTE(review): presumably the iconv codeset name used for ucs4 data --
/// confirm against the definition.
extern char const * ucs4_codeset;
|
|
|
|
|
2009-05-19 15:27:54 +00:00
|
|
|
/// How many bytes does one UCS4 code point use at most in encoding \p encoding?
/// \param encoding name of the encoding to query.
/// \return the maximum number of bytes per code point.
int max_encoded_bytes(std::string const & encoding);
|
2006-10-21 00:16:43 +00:00
|
|
|
|
|
|
|
} // namespace lyx
|
|
|
|
|
2006-08-13 22:54:59 +00:00
|
|
|
#endif
|