Georg's patch for bug 3043:

Introduces new encodings for the CJK-languages.
The CJK-languages can now be selected from LyX's menu.
For remaining small problems, see the thread "Help needed for bug 3043" on the devel-list.

git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@18216 a592a061-630c-0410-9148-cb99ea01b6c8
This commit is contained in:
Uwe Stöhr 2007-05-06 20:26:02 +00:00
parent 54d2810b01
commit 80b94c6bf5
15 changed files with 411 additions and 135 deletions

View File

@ -1,6 +1,18 @@
LyX file-format changes LyX file-format changes
----------------------- -----------------------
2007-05-06 Uwe Stöhr <uwestoehr@web.de>
* format incremented to 268: add support for the CJK encodings
- all encodings supported by CJK.sty are now in lib/encodings
- lib/encodings has got two new flags:
- encoding package:
"inputenc" for those languages that use the inputenc-package and
"CJK" for the CJK encodings
- "fixed"/"variable" to distinguish between fixed-width and variable-width encodings
- lib/languages now allows setting an empty language for babel;
in this case babel isn't called.
2007-05-04 Jürgen Spitzmüller <j.spitzmueller@gmx.de> 2007-05-04 Jürgen Spitzmüller <j.spitzmueller@gmx.de>
* format incremented to 267: add plain utf8 encoding (for XeTeX). * format incremented to 267: add plain utf8 encoding (for XeTeX).

View File

@ -1,134 +1,189 @@
# FIXME: Have a look at the encodings known by the inputenc package and add # FIXME: Have a look at the encodings known by the inputenc package and add
# missing ones. Caution: File format change! # missing ones. Caution: File format change!
# Note that you can only add single byte encodings to this file without # Note that you can only add singlebyte encodings to this file.
# changing some C++ code. # LyX does not support the output of multibyte encodings (e.g. utf16).
# The only multibyte encoding that is currently supported is utf8, and this # It does support singlebyte encodings with variable width (e.g. utf8).
# support is hardcoded to the iconv name "UTF-8". # These are marked with the "variable" keyword.
# Fixed width encodings are marked with the "fixed" keyword.
# Order of names: LyX name LaTeX name iconv name # Syntax: Encoding <LyX name> <LaTeX name> <iconv name> fixed|variable <package> End
Encoding utf8 utf8 UTF-8 # encodings used by inputenc.sty
Encoding utf8 utf8 UTF-8 variable inputenc
End End
# This one is used by many CJK packages. utf8 is supposed to be the successor, # This one is used by many CJK packages. utf8 is supposed to be the successor,
# but does not have all features of utf8x yet. # but does not have all features of utf8x yet.
Encoding utf8x utf8x UTF-8 Encoding utf8x utf8x UTF-8 variable inputenc
End End
# A plain utf8 encoding that does not use the inputenc package. # A plain utf8 encoding that does not use the inputenc package.
# Such an encoding is required for XeTeX. # Such an encoding is required for XeTeX.
Encoding utf8-plain utf8-plain UTF-8 Encoding utf8-plain utf8-plain UTF-8 variable none
End End
# This encoding is used to typeset Armenian using the armtex package # This encoding is used to typeset Armenian using the armtex package
Encoding armscii8 armscii8 ARMSCII-8 Encoding armscii8 armscii8 ARMSCII-8 fixed inputenc
End End
Encoding iso8859-1 latin1 ISO-8859-1 Encoding iso8859-1 latin1 ISO-8859-1 fixed inputenc
End End
Encoding iso8859-2 latin2 ISO-8859-2 Encoding iso8859-2 latin2 ISO-8859-2 fixed inputenc
End End
Encoding iso8859-3 latin3 ISO-8859-3 Encoding iso8859-3 latin3 ISO-8859-3 fixed inputenc
End End
Encoding iso8859-4 latin4 ISO-8859-4 Encoding iso8859-4 latin4 ISO-8859-4 fixed inputenc
End End
Encoding iso8859-5 iso88595 ISO-8859-5 Encoding iso8859-5 iso88595 ISO-8859-5 fixed inputenc
End End
# Not standard, see http://tug.ctan.org/tex-archive/language/arabic/arabi/arabi/texmf/latex/arabi/ # Not standard, see http://tug.ctan.org/tex-archive/language/arabic/arabi/arabi/texmf/latex/arabi/
Encoding iso8859-6 8859-6 ISO-8859-6 Encoding iso8859-6 8859-6 ISO-8859-6 fixed inputenc
End End
Encoding iso8859-7 iso-8859-7 ISO-8859-7 Encoding iso8859-7 iso-8859-7 ISO-8859-7 fixed inputenc
End End
Encoding iso8859-8 8859-8 ISO-8859-8 Encoding iso8859-8 8859-8 ISO-8859-8 fixed inputenc
End End
Encoding iso8859-9 latin5 ISO-8859-9 Encoding iso8859-9 latin5 ISO-8859-9 fixed inputenc
End End
# Not standard, see http://www.vtex.lt/tex/littex/index.html # Not standard, see http://www.vtex.lt/tex/littex/index.html
Encoding iso8859-13 l7xenc ISO-8859-13 Encoding iso8859-13 l7xenc ISO-8859-13 fixed inputenc
End End
Encoding iso8859-15 latin9 ISO-8859-15 Encoding iso8859-15 latin9 ISO-8859-15 fixed inputenc
End End
Encoding iso8859-16 latin10 ISO-8859-16 Encoding iso8859-16 latin10 ISO-8859-16 fixed inputenc
End End
Encoding cp437 cp437 CP437 Encoding cp437 cp437 CP437 fixed inputenc
End End
# cp437, but on position 225 is sz instead of beta # cp437, but on position 225 is sz instead of beta
Encoding cp437de cp437de CP437 Encoding cp437de cp437de CP437 fixed inputenc
End End
Encoding cp850 cp850 CP850 Encoding cp850 cp850 CP850 fixed inputenc
End End
Encoding cp852 cp852 CP852 Encoding cp852 cp852 CP852 fixed inputenc
End End
Encoding cp855 cp855 CP855 Encoding cp855 cp855 CP855 fixed inputenc
End End
Encoding cp858 cp858 CP858 Encoding cp858 cp858 CP858 fixed inputenc
End End
Encoding cp862 cp862 CP862 Encoding cp862 cp862 CP862 fixed inputenc
End End
Encoding cp865 cp865 CP865 Encoding cp865 cp865 CP865 fixed inputenc
End End
Encoding cp866 cp866 CP866 Encoding cp866 cp866 CP866 fixed inputenc
End End
Encoding cp1250 cp1250 CP1250 Encoding cp1250 cp1250 CP1250 fixed inputenc
End End
Encoding cp1251 cp1251 CP1251 Encoding cp1251 cp1251 CP1251 fixed inputenc
End End
Encoding cp1252 cp1252 CP1252 Encoding cp1252 cp1252 CP1252 fixed inputenc
End End
Encoding cp1255 cp1255 CP1255 Encoding cp1255 cp1255 CP1255 fixed inputenc
End End
# Not standard, see http://tug.ctan.org/tex-archive/language/arabic/arabi/arabi/texmf/latex/arabi/ # Not standard, see http://tug.ctan.org/tex-archive/language/arabic/arabi/arabi/texmf/latex/arabi/
Encoding cp1256 cp1256 CP1256 Encoding cp1256 cp1256 CP1256 fixed inputenc
End End
Encoding cp1257 cp1257 CP1257 Encoding cp1257 cp1257 CP1257 fixed inputenc
End End
Encoding koi8 koi8-r KOI8-R Encoding koi8 koi8-r KOI8-R fixed inputenc
End End
Encoding koi8-u koi8-u KOI8-U Encoding koi8-u koi8-u KOI8-U fixed inputenc
End End
Encoding pt154 pt154 PT154 fixed inputenc
End
Encoding pt254 pt254 PT254 fixed inputenc
End
# encodings used by CJK.sty
# The following encodings that are supported by the CJK package are not
# included here, because they are not widely used and lack proper iconv support:
# Bg5+, GBt
# See the NOTES file of libiconv for details.
# The following encodings can't be handled directly, because the code points
# of TeX control characters like {, } and \ can occur in the second byte:
# For traditional chinese
#Encoding big5 Bg5 BIG5 variable CJK
#End
# For japanese
#Encoding shift-jis SJIS SJIS variable CJK
#End
# The following encodings need hardcoded support of the encodable unicode
# range, but are known by iconv:
# For simplified chinese
Encoding euc-cn GB EUC-CN variable CJK
End
# For simplified chinese
Encoding gbk GBK GBK variable CJK
End
# For japanese
Encoding jis JIS JIS variable CJK
End
# For korean
Encoding euc-kr KS EUC-KR variable CJK
End
# The CJK package has yet another name for utf8...
Encoding utf8-cjk UTF8 UTF-8 variable CJK
End
# For traditional chinese
Encoding euc-tw EUC-TW EUC-TW variable CJK
End
# For japanese
Encoding euc-jp EUC-JP EUC-JP variable CJK
End
# This one needs hardcoded support, since the inputenc package does not know # This one needs hardcoded support, since the inputenc package does not know
# tis620-0, and thailatex sets up babel directly to use tis620-0, so the value # tis620-0, and thailatex sets up babel directly to use tis620-0, so the
# for inputenc is never output to .tex files (but needed for the hardcoded # LaTeX name is never output to .tex files (but needed for the hardcoded
# tis620-0 support). # tis620-0 support).
Encoding tis620-0 tis620-0 TIS620-0 Encoding tis620-0 tis620-0 TIS620-0 fixed none
End End
Encoding pt154 pt154 PT154
End
Encoding pt254 pt254 PT254
End
# Pure 7bit ASCII encoding (partially hardcoded in LyX) # Pure 7bit ASCII encoding (partially hardcoded in LyX)
Encoding ascii ascii ascii Encoding ascii ascii ascii fixed none
End End

View File

@ -2,10 +2,7 @@
afrikaans afrikaans "Afrikaans" false iso8859-15 af_ZA "" afrikaans afrikaans "Afrikaans" false iso8859-15 af_ZA ""
american american "American" false iso8859-15 en_US "" american american "American" false iso8859-15 en_US ""
arabic arabic "Arabic" true cp1256 ar_SA "" arabic arabic "Arabic" true cp1256 ar_SA ""
# loading babel with the option "english" as armtex overwrites this later armenian "" "Armenian" false armscii8 hy_AM ""
# armtex provides the armscii8 encoding and translates words instead of babel
# the english option can later be omitted when the patch for bug 3043 is in.
armenian english "Armenian" false armscii8 hy_AM ""
austrian austrian "Austrian" false iso8859-15 de_AT "" austrian austrian "Austrian" false iso8859-15 de_AT ""
naustrian naustrian "Austrian (new spelling)" false iso8859-15 de_AT "" naustrian naustrian "Austrian (new spelling)" false iso8859-15 de_AT ""
bahasa bahasa "Bahasa" false iso8859-15 in_ID "" bahasa bahasa "Bahasa" false iso8859-15 in_ID ""
@ -18,6 +15,8 @@ bulgarian bulgarian "Bulgarian" false cp1251 bg_BG ""
canadian canadian "Canadian" false iso8859-15 en_CA "" canadian canadian "Canadian" false iso8859-15 en_CA ""
canadien canadien "French Canadian" false iso8859-15 fr_CA "" canadien canadien "French Canadian" false iso8859-15 fr_CA ""
catalan catalan "Catalan" false iso8859-15 ca_ES "" catalan catalan "Catalan" false iso8859-15 ca_ES ""
chinese-simplified "" "Chinese (simplified)" false euc-cn zh ""
chinese-traditional "" "Chinese (traditional)" false utf8-cjk zh ""
croatian croatian "Croatian" false iso8859-2 hr_HR "" croatian croatian "Croatian" false iso8859-2 hr_HR ""
czech czech "Czech" false iso8859-2 cs_CZ "" czech czech "Czech" false iso8859-2 cs_CZ ""
danish danish "Danish" false iso8859-15 da_DK "" danish danish "Danish" false iso8859-15 da_DK ""
@ -39,7 +38,9 @@ hebrew hebrew "Hebrew" true cp1255 he_IL ""
#hungarian hungarian "Hungarian" false iso8859-2 hu_HU "" #hungarian hungarian "Hungarian" false iso8859-2 hu_HU ""
irish irish "Irish" false iso8859-15 ga_IE "" irish irish "Irish" false iso8859-15 ga_IE ""
italian italian "Italian" false iso8859-15 it_IT "" italian italian "Italian" false iso8859-15 it_IT ""
japanese "" "Japanese" false euc-jp ja ""
kazakh kazakh "Kazakh" false pt154 kk_KZ "" kazakh kazakh "Kazakh" false pt154 kk_KZ ""
korean "" "Korean" false euc-kr ko ""
#lsorbian lsorbian "Lower Sorbian" false iso8859-2 dsb_DE "" #lsorbian lsorbian "Lower Sorbian" false iso8859-2 dsb_DE ""
lithuanian lithuanian "Lithuanian" false iso8859-13 lt_LT "" lithuanian lithuanian "Lithuanian" false iso8859-13 lt_LT ""
latvian latvian "Latvian" false iso8859-13 lv_LV "" latvian latvian "Latvian" false iso8859-13 lv_LV ""

View File

@ -74,7 +74,7 @@ format_relation = [("0_06", [200], generate_minor_versions("0.6" , 4)),
("1_2", [220], generate_minor_versions("1.2" , 4)), ("1_2", [220], generate_minor_versions("1.2" , 4)),
("1_3", [221], generate_minor_versions("1.3" , 7)), ("1_3", [221], generate_minor_versions("1.3" , 7)),
("1_4", range(222,246), generate_minor_versions("1.4" , 4)), ("1_4", range(222,246), generate_minor_versions("1.4" , 4)),
("1_5", range(246,268), generate_minor_versions("1.5" , 0))] ("1_5", range(246,269), generate_minor_versions("1.5" , 0))]
def formats_list(): def formats_list():

View File

@ -1325,6 +1325,28 @@ def revert_armenian(document):
document.preamble.append('\\usepackage{armtex}') document.preamble.append('\\usepackage{armtex}')
def revert_CJK(document):
" Set CJK encodings to default and languages chinese, japanese and korean to english. "
# NOTE(review): leading indentation is not visible in this view, so the
# exact nesting of the statements below should be confirmed against the
# real lyx2lyx source before relying on these comments.
# Encoding names that are treated as CJK encodings when they appear as the
# value of the \inputencoding header line.
encodings = ["Bg5", "Bg5+", "GB", "GBt", "GBK", "JIS",
"KS", "SJIS", "UTF8", "EUC-TW", "EUC-JP"]
i = find_token(document.header, "\\inputencoding", 0)
if i == -1:
# No \inputencoding line was found in the header: append one set to "auto".
document.header.append("\\inputencoding auto")
else:
inputenc = get_value(document.header, "\\inputencoding", i)
if inputenc in encodings:
# A CJK encoding is replaced by "default".
document.header[i] = "\\inputencoding default"
# Re-read the header value so document.inputencoding matches the header.
document.inputencoding = get_value(document.header, "\\inputencoding", 0)
# The four CJK document languages are replaced by english, both on the
# document object and in the \language header line (when one is present).
if document.language == "chinese-simplified" or \
document.language == "chinese-traditional" or \
document.language == "japanese" or document.language == "korean":
document.language = "english"
i = find_token(document.header, "\\language", 0)
if i != -1:
document.header[i] = "\\language english"
## ##
# Conversion hub # Conversion hub
# #
@ -1351,9 +1373,11 @@ convert = [[246, []],
[264, [convert_cv_textclass]], [264, [convert_cv_textclass]],
[265, [convert_tableborder]], [265, [convert_tableborder]],
[266, []], [266, []],
[267, []]] [267, []],
[268, []]]
revert = [[266, [revert_utf8plain]], revert = [[267, [revert_CJK]],
[266, [revert_utf8plain]],
[265, [revert_armenian]], [265, [revert_armenian]],
[264, [revert_tableborder]], [264, [revert_tableborder]],
[263, [revert_cv_textclass]], [263, [revert_cv_textclass]],
@ -1380,3 +1404,4 @@ revert = [[266, [revert_utf8plain]],
if __name__ == "__main__": if __name__ == "__main__":
pass pass

View File

@ -141,7 +141,7 @@ using std::string;
namespace { namespace {
int const LYX_FORMAT = 267; int const LYX_FORMAT = 268;
} // namespace anon } // namespace anon
@ -974,6 +974,16 @@ void Buffer::writeLaTeXSource(odocstream & os,
texrow().newline(); texrow().newline();
} }
Encoding const & encoding = params().encoding();
if (encoding.package() == Encoding::CJK) {
// Open a CJK environment, since in contrast to the encodings
// handled by inputenc the document encoding is not set in
// the preamble if it is handled by CJK.sty.
os << "\\begin{CJK}{" << from_ascii(encoding.latexName())
<< "}{}\n";
texrow().newline();
}
// if we are doing a real file with body, even if this is the // if we are doing a real file with body, even if this is the
// child of some other buffer, let's cut the link here. // child of some other buffer, let's cut the link here.
// This happens for example if only a child document is printed. // This happens for example if only a child document is printed.
@ -994,6 +1004,14 @@ void Buffer::writeLaTeXSource(odocstream & os,
os << endl; os << endl;
texrow().newline(); texrow().newline();
if (encoding.package() == Encoding::CJK) {
// Close the open CJK environment.
// latexParagraphs will have opened one even if the last text
// was not CJK.
os << "\\end{CJK}\n";
texrow().newline();
}
if (!lyxrc.language_auto_end && if (!lyxrc.language_auto_end &&
!params().language->babel().empty()) { !params().language->babel().empty()) {
os << from_utf8(subst(lyxrc.language_command_end, os << from_utf8(subst(lyxrc.language_command_end,

View File

@ -893,16 +893,15 @@ bool BufferParams::writeLaTeX(odocstream & os, LaTeXFeatures & features,
if (inputenc == "auto") { if (inputenc == "auto") {
string const doc_encoding = string const doc_encoding =
language->encoding()->latexName(); language->encoding()->latexName();
Encoding::Package const package =
language->encoding()->package();
// Create a list with all the input encodings used // Create a list with all the input encodings used
// in the document // in the document
std::set<string> encodings = std::set<string> encodings =
features.getEncodingSet(doc_encoding); features.getEncodingSet(doc_encoding);
// thailatex does not use the inputenc package, but sets up if (!encodings.empty() || package == Encoding::inputenc) {
// babel directly for tis620-0 encoding, therefore we must
// not request inputenc for tis620-0 encoding
if (!encodings.empty() || doc_encoding != "tis620-0") {
os << "\\usepackage["; os << "\\usepackage[";
std::set<string>::const_iterator it = encodings.begin(); std::set<string>::const_iterator it = encodings.begin();
std::set<string>::const_iterator const end = encodings.end(); std::set<string>::const_iterator const end = encodings.end();
@ -912,7 +911,7 @@ bool BufferParams::writeLaTeX(odocstream & os, LaTeXFeatures & features,
} }
for (; it != end; ++it) for (; it != end; ++it)
os << ',' << from_ascii(*it); os << ',' << from_ascii(*it);
if (doc_encoding != "tis620-0") { if (package == Encoding::inputenc) {
if (!encodings.empty()) if (!encodings.empty())
os << ','; os << ',';
os << from_ascii(doc_encoding); os << from_ascii(doc_encoding);
@ -920,12 +919,24 @@ bool BufferParams::writeLaTeX(odocstream & os, LaTeXFeatures & features,
os << "]{inputenc}\n"; os << "]{inputenc}\n";
texrow.newline(); texrow.newline();
} }
// utf8-plain is for XeTeX users (inputenc not desired) if (package == Encoding::CJK) {
} else if (inputenc != "default" && inputenc != "tis620-0" && os << "\\usepackage{CJK}\n";
inputenc != "ascii" && inputenc != "utf8-plain") { texrow.newline();
}
} else if (inputenc != "default") {
switch (language->encoding()->package()) {
case Encoding::none:
break;
case Encoding::inputenc:
os << "\\usepackage[" << from_ascii(inputenc) os << "\\usepackage[" << from_ascii(inputenc)
<< "]{inputenc}\n"; << "]{inputenc}\n";
texrow.newline(); texrow.newline();
break;
case Encoding::CJK:
os << "\\usepackage{CJK}\n";
texrow.newline();
break;
}
} }
// The encoding "armscii8" is only available when the package "armtex" is loaded. // The encoding "armscii8" is only available when the package "armtex" is loaded.

View File

@ -123,25 +123,42 @@ struct CharInfo {
typedef std::map<char_type, CharInfo> CharInfoMap; typedef std::map<char_type, CharInfo> CharInfoMap;
CharInfoMap unicodesymbols; CharInfoMap unicodesymbols;
/// One past the highest code point in UCS4 encoding (there are (1<<20) + (1<<16) code points)
char_type const max_ucs4 = 0x110000;
} // namespace anon } // namespace anon
Encoding::Encoding(string const & n, string const & l, string const & i) Encoding::Encoding(string const & n, string const & l, string const & i,
: Name_(n), LatexName_(l), iconvName_(i) bool f, Encoding::Package p)
: Name_(n), LatexName_(l), iconvName_(i), fixedwidth_(f), package_(p)
{ {
if (n == "ascii") if (n == "ascii") {
// ASCII can encode 128 code points and nothing else // ASCII can encode 128 code points and nothing else
start_encodable_ = 128; start_encodable_ = 128;
else if (i == "UTF-8") complete_ = true;
// UTF8 can encode all 1<<20 + 1<<16 UCS4 code points } else if (i == "UTF-8") {
start_encodable_ = 0x110000; // UTF8 can encode all UCS4 code points
else { start_encodable_ = max_ucs4;
complete_ = true;
} else {
complete_ = false;
}
}
void Encoding::init() const
{
start_encodable_ = 0; start_encodable_ = 0;
// temporarily switch off lyxerr, since we will generate iconv errors // temporarily switch off lyxerr, since we will generate iconv errors
lyxerr.disable(); lyxerr.disable();
if (fixedwidth_) {
// We do not need to check all UCS4 code points, it is enough
// if we check all 256 code points of this encoding.
for (unsigned short j = 0; j < 256; ++j) { for (unsigned short j = 0; j < 256; ++j) {
char const c = j; char const c = j;
std::vector<char_type> const ucs4 = eightbit_to_ucs4(&c, 1, i); std::vector<char_type> const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_);
if (ucs4.size() == 1) { if (ucs4.size() == 1) {
char_type const c = ucs4[0]; char_type const c = ucs4[0];
CharInfoMap::const_iterator const it = unicodesymbols.find(c); CharInfoMap::const_iterator const it = unicodesymbols.find(c);
@ -149,6 +166,20 @@ Encoding::Encoding(string const & n, string const & l, string const & i)
encodable_.insert(c); encodable_.insert(c);
} }
} }
} else {
// We do not know how many code points this encoding has, and
// they do not have a direct representation as a single byte,
// therefore we need to check all UCS4 code points.
// This is expensive!
for (char_type c = 0; c < max_ucs4; ++c) {
std::vector<char> const eightbit = ucs4_to_eightbit(&c, 1, iconvName_);
if (!eightbit.empty()) {
CharInfoMap::const_iterator const it = unicodesymbols.find(c);
if (it == unicodesymbols.end() || !it->second.force)
encodable_.insert(c);
}
}
}
lyxerr.enable(); lyxerr.enable();
CharSet::iterator it = encodable_.find(start_encodable_); CharSet::iterator it = encodable_.find(start_encodable_);
while (it != encodable_.end()) { while (it != encodable_.end()) {
@ -156,12 +187,15 @@ Encoding::Encoding(string const & n, string const & l, string const & i)
++start_encodable_; ++start_encodable_;
it = encodable_.find(start_encodable_); it = encodable_.find(start_encodable_);
} }
} complete_ = true;
} }
docstring const Encoding::latexChar(char_type c) const docstring const Encoding::latexChar(char_type c) const
{ {
// validate() should have been called before
//BOOST_ASSERT(complete_);
if (c < start_encodable_) if (c < start_encodable_)
return docstring(1, c); return docstring(1, c);
if (encodable_.find(c) == encodable_.end()) { if (encodable_.find(c) == encodable_.end()) {
@ -360,8 +394,32 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
string const latexname = lex.getString(); string const latexname = lex.getString();
lex.next(); lex.next();
string const iconvname = lex.getString(); string const iconvname = lex.getString();
lex.next();
string const width = lex.getString();
bool fixedwidth;
if (width == "fixed")
fixedwidth = true;
else if (width == "variable")
fixedwidth = false;
else
lex.printError("Encodings::read: "
"Unknown width: `$$Token'");
lex.next();
string const p = lex.getString();
Encoding::Package package;
if (p == "none")
package = Encoding::none;
else if (p == "inputenc")
package = Encoding::inputenc;
else if (p == "CJK")
package = Encoding::CJK;
else
lex.printError("Encodings::read: "
"Unknown package: `$$Token'");
LYXERR(Debug::INFO) << "Reading encoding " << name << endl; LYXERR(Debug::INFO) << "Reading encoding " << name << endl;
encodinglist[name] = Encoding(name, latexname, iconvname); encodinglist[name] = Encoding(name, latexname,
iconvname, fixedwidth,
package);
if (lex.lex() != et_end) if (lex.lex() != et_end)
lex.printError("Encodings::read: " lex.printError("Encodings::read: "
"missing end"); "missing end");

View File

@ -28,11 +28,19 @@ class LaTeXFeatures;
/// ///
class Encoding { class Encoding {
public: public:
/// Which LaTeX package handles this encoding?
enum Package {
none,
inputenc,
CJK
};
/// ///
Encoding() {} Encoding() {}
/// ///
Encoding(std::string const & n, std::string const & l, Encoding(std::string const & n, std::string const & l,
std::string const & i); std::string const & i, bool f, Package p);
///
void init() const;
/// ///
std::string const & name() const { return Name_; } std::string const & name() const { return Name_; }
/// ///
@ -48,6 +56,8 @@ public:
* character is returned. * character is returned.
*/ */
docstring const latexChar(char_type c) const; docstring const latexChar(char_type c) const;
/// Which LaTeX package handles this encoding?
Package package() const { return package_; }
private: private:
/// ///
std::string Name_; std::string Name_;
@ -55,15 +65,27 @@ private:
std::string LatexName_; std::string LatexName_;
/// ///
std::string iconvName_; std::string iconvName_;
/// Is this a fixed width encoding?
bool fixedwidth_;
/// ///
typedef std::set<char_type> CharSet; typedef std::set<char_type> CharSet;
/// Set of UCS4 characters that we can encode (for singlebyte /// Set of UCS4 characters that we can encode (for singlebyte
/// encodings only) /// encodings only)
CharSet encodable_; mutable CharSet encodable_;
/// All code points below this are encodable. This helps us to avoid /// All code points below this are encodable. This helps us to avoid
/// lookup of ASCII characters in encodable_ and gives about 1 sec /// lookup of ASCII characters in encodable_ and gives about 1 sec
/// speedup on export of the Userguide. /// speedup on export of the Userguide.
char_type start_encodable_; mutable char_type start_encodable_;
/// Which LaTeX package handles this encoding?
Package package_;
/**
* If this is true the stored information about the encoding covers
* all encodable characters. We set this to false initially so that
* we only need to query iconv for the actually used encodings.
* This is needed especially for the multibyte encodings, if we
* complete all encoding info on startup it takes 2-3 minutes.
*/
mutable bool complete_;
}; };
class Encodings { class Encodings {

View File

@ -18,11 +18,14 @@
#include "BufferParams.h" // stateText #include "BufferParams.h" // stateText
#include "debug.h" #include "debug.h"
#include "Encoding.h"
#include "gettext.h" #include "gettext.h"
#include "Language.h" #include "Language.h"
#include "Color.h" #include "Color.h"
#include "Lexer.h" #include "Lexer.h"
#include "LyXRC.h" #include "LyXRC.h"
#include "output_latex.h"
#include "OutputParams.h"
#include "support/lstrings.h" #include "support/lstrings.h"
@ -163,37 +166,37 @@ bool operator==(Font::FontBits const & lhs,
Font::Font() Font::Font()
: bits(sane), lang(default_language) : bits(sane), lang(default_language), open_encoding_(false)
{} {}
Font::Font(Font::FONT_INIT1) Font::Font(Font::FONT_INIT1)
: bits(inherit), lang(default_language) : bits(inherit), lang(default_language), open_encoding_(false)
{} {}
Font::Font(Font::FONT_INIT2) Font::Font(Font::FONT_INIT2)
: bits(ignore), lang(ignore_language) : bits(ignore), lang(ignore_language), open_encoding_(false)
{} {}
Font::Font(Font::FONT_INIT3) Font::Font(Font::FONT_INIT3)
: bits(sane), lang(default_language) : bits(sane), lang(default_language), open_encoding_(false)
{} {}
Font::Font(Font::FONT_INIT1, Language const * l) Font::Font(Font::FONT_INIT1, Language const * l)
: bits(inherit), lang(l) : bits(inherit), lang(l), open_encoding_(false)
{} {}
Font::Font(Font::FONT_INIT2, Language const * l) Font::Font(Font::FONT_INIT2, Language const * l)
: bits(ignore), lang(l) : bits(ignore), lang(l), open_encoding_(false)
{} {}
Font::Font(Font::FONT_INIT3, Language const * l) Font::Font(Font::FONT_INIT3, Language const * l)
: bits(sane), lang(l) : bits(sane), lang(l), open_encoding_(false)
{} {}
@ -736,7 +739,9 @@ void Font::lyxWriteChanges(Font const & orgfont,
/// Writes the head of the LaTeX needed to impose this font /// Writes the head of the LaTeX needed to impose this font
// Returns number of chars written. // Returns number of chars written.
int Font::latexWriteStartChanges(odocstream & os, Font const & base, int Font::latexWriteStartChanges(odocstream & os, BufferParams const & bparams,
OutputParams const & runparams,
Font const & base,
Font const & prev) const Font const & prev) const
{ {
bool env = false; bool env = false;
@ -758,6 +763,20 @@ int Font::latexWriteStartChanges(odocstream & os, Font const & base,
"$$lang", language()->babel()); "$$lang", language()->babel());
os << from_ascii(tmp); os << from_ascii(tmp);
count += tmp.length(); count += tmp.length();
} else {
os << '{';
count += 1;
}
}
if (language()->encoding()->package() == Encoding::CJK) {
int const c = switchEncoding(os, bparams,
runparams.moving_arg, *(runparams.encoding),
*(language()->encoding()));
if (c > 0) {
open_encoding_ = true;
count += c;
runparams.encoding = language()->encoding();
} }
} }
@ -832,7 +851,9 @@ int Font::latexWriteStartChanges(odocstream & os, Font const & base,
/// Writes ending block of LaTeX needed to close use of this font /// Writes ending block of LaTeX needed to close use of this font
// Returns number of chars written // Returns number of chars written
// This one corresponds to latexWriteStartChanges(). (Asger) // This one corresponds to latexWriteStartChanges(). (Asger)
int Font::latexWriteEndChanges(odocstream & os, Font const & base, int Font::latexWriteEndChanges(odocstream & os, BufferParams const & bparams,
OutputParams const & runparams,
Font const & base,
Font const & next) const Font const & next) const
{ {
int count = 0; int count = 0;
@ -893,6 +914,19 @@ int Font::latexWriteEndChanges(odocstream & os, Font const & base,
count += 6; count += 6;
} }
if (open_encoding_) {
// We need to close the encoding even if it does not change
// to do correct environment nesting
Encoding const * const ascii = encodings.getFromLyXName("ascii");
int const c = switchEncoding(os, bparams,
runparams.moving_arg, *(runparams.encoding),
*ascii);
BOOST_ASSERT(c > 0);
count += c;
runparams.encoding = ascii;
open_encoding_ = false;
}
if (language() != base.language() && language() != next.language()) { if (language() != base.language() && language() != next.language()) {
os << '}'; os << '}';
++count; ++count;

View File

@ -25,6 +25,7 @@ namespace lyx {
class Lexer; class Lexer;
class BufferParams; class BufferParams;
class Language; class Language;
class OutputParams;
/// ///
@ -296,14 +297,18 @@ public:
to this font. Returns number of chars written. Base is the to this font. Returns number of chars written. Base is the
font state active now. font state active now.
*/ */
int latexWriteStartChanges(odocstream &, Font const & base, int latexWriteStartChanges(odocstream &, BufferParams const & bparams,
OutputParams const & runparams,
Font const & base,
Font const & prev) const; Font const & prev) const;
/** Writes the tail of the LaTeX needed to change to this font. /** Writes the tail of the LaTeX needed to change to this font.
Returns number of chars written. Base is the font state we want Returns number of chars written. Base is the font state we want
to achieve. to achieve.
*/ */
int latexWriteEndChanges(odocstream &, Font const & base, int latexWriteEndChanges(odocstream &, BufferParams const & bparams,
OutputParams const & runparams,
Font const & base,
Font const & next) const; Font const & next) const;
@ -348,6 +353,8 @@ private:
/// Updates a misc setting according to request /// Updates a misc setting according to request
Font::FONT_MISC_STATE setMisc(Font::FONT_MISC_STATE newfont, Font::FONT_MISC_STATE setMisc(Font::FONT_MISC_STATE newfont,
Font::FONT_MISC_STATE org); Font::FONT_MISC_STATE org);
/// Did latexWriteStartChanges open an encoding environment?
mutable bool open_encoding_;
}; };

View File

@ -344,26 +344,29 @@ string LaTeXFeatures::getLanguages() const
{ {
ostringstream languages; ostringstream languages;
for (LanguageList::const_iterator cit = LanguageList::const_iterator const begin = UsedLanguages_.begin();
UsedLanguages_.begin(); for (LanguageList::const_iterator cit = begin;
cit != UsedLanguages_.end(); cit != UsedLanguages_.end();
++cit) ++cit) {
languages << (*cit)->babel() << ','; if (cit != begin)
languages << ',';
languages << (*cit)->babel();
}
return languages.str(); return languages.str();
} }
set<string> LaTeXFeatures::getEncodingSet(string const & doc_encoding) const set<string> LaTeXFeatures::getEncodingSet(string const & doc_encoding) const
{ {
// This does only find encodings of languages supported by babel, but
// that does not matter since we don't have a language with an
// encoding supported by inputenc but without babel support.
set<string> encodings; set<string> encodings;
LanguageList::const_iterator it = UsedLanguages_.begin(); LanguageList::const_iterator it = UsedLanguages_.begin();
LanguageList::const_iterator end = UsedLanguages_.end(); LanguageList::const_iterator end = UsedLanguages_.end();
for (; it != end; ++it) for (; it != end; ++it)
// thailatex does not use the inputenc package, but sets up
// babel directly for tis620-0 encoding, therefore we must
// not add tis620-0 to the encoding set.
if ((*it)->encoding()->latexName() != doc_encoding && if ((*it)->encoding()->latexName() != doc_encoding &&
(*it)->encoding()->name() != "tis620-0") (*it)->encoding()->package() == Encoding::inputenc)
encodings.insert((*it)->encoding()->latexName()); encodings.insert((*it)->encoding()->latexName());
return encodings; return encodings;
} }

View File

@ -118,7 +118,7 @@ private:
static PackagesList packages_; static PackagesList packages_;
/// ///
typedef std::set<Language const *> LanguageList; typedef std::set<Language const *> LanguageList;
/// /// used languages (only those that are supported by babel)
LanguageList UsedLanguages_; LanguageList UsedLanguages_;
/// ///
typedef std::set<std::string> UsedFloats; typedef std::set<std::string> UsedFloats;

View File

@ -702,7 +702,8 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf,
} else { } else {
if (open_font) { if (open_font) {
column += running_font.latexWriteEndChanges( column += running_font.latexWriteEndChanges(
os, basefont, basefont); os, bparams, runparams,
basefont, basefont);
open_font = false; open_font = false;
} }
@ -757,7 +758,8 @@ void Paragraph::Pimpl::simpleTeXSpecialChars(Buffer const & buf,
// some insets cannot be inside a font change command // some insets cannot be inside a font change command
if (open_font && inset->noFontChange()) { if (open_font && inset->noFontChange()) {
column += running_font.latexWriteEndChanges( column += running_font.latexWriteEndChanges(
os, basefont, basefont); os, bparams, runparams,
basefont, basefont);
open_font = false; open_font = false;
basefont = owner_->getLayoutFont(bparams, outerfont); basefont = owner_->getLayoutFont(bparams, outerfont);
running_font = basefont; running_font = basefont;
@ -991,7 +993,7 @@ void Paragraph::Pimpl::validate(LaTeXFeatures & features,
{ {
features.useLanguage(language); features.useLanguage(language);
LYXERR(Debug::LATEX) << "Found language " LYXERR(Debug::LATEX) << "Found language "
<< language->babel() << endl; << language->lang() << endl;
} }
} }
@ -1975,7 +1977,8 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf,
if (body_pos > 0) { if (body_pos > 0) {
if (open_font) { if (open_font) {
column += running_font.latexWriteEndChanges( column += running_font.latexWriteEndChanges(
os, basefont, basefont); os, bparams, runparams,
basefont, basefont);
open_font = false; open_font = false;
} }
basefont = getLayoutFont(bparams, outerfont); basefont = getLayoutFont(bparams, outerfont);
@ -2027,13 +2030,15 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf,
font.language() != running_font.language())) font.language() != running_font.language()))
{ {
column += running_font.latexWriteEndChanges( column += running_font.latexWriteEndChanges(
os, basefont, os, bparams, runparams, basefont,
(i == body_pos-1) ? basefont : font); (i == body_pos-1) ? basefont : font);
running_font = basefont; running_font = basefont;
open_font = false; open_font = false;
} }
// Switch file encoding if necessary // Switch file encoding if necessary
if (runparams.encoding->package() == Encoding::inputenc &&
font.language()->encoding()->package() == Encoding::inputenc) {
int const count = switchEncoding(os, bparams, int const count = switchEncoding(os, bparams,
runparams.moving_arg, *(runparams.encoding), runparams.moving_arg, *(runparams.encoding),
*(font.language()->encoding())); *(font.language()->encoding()));
@ -2041,13 +2046,15 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf,
column += count; column += count;
runparams.encoding = font.language()->encoding(); runparams.encoding = font.language()->encoding();
} }
}
// Do we need to change font? // Do we need to change font?
if ((font != running_font || if ((font != running_font ||
font.language() != running_font.language()) && font.language() != running_font.language()) &&
i != body_pos - 1) i != body_pos - 1)
{ {
column += font.latexWriteStartChanges(os, basefont, column += font.latexWriteStartChanges(os, bparams,
runparams, basefont,
last_font); last_font);
running_font = font; running_font = font;
open_font = true; open_font = true;
@ -2087,11 +2094,12 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf,
#ifdef FIXED_LANGUAGE_END_DETECTION #ifdef FIXED_LANGUAGE_END_DETECTION
if (next_) { if (next_) {
running_font running_font
.latexWriteEndChanges(os, basefont, .latexWriteEndChanges(os, bparams, runparams,
basefont,
next_->getFont(bparams, 0, outerfont)); next_->getFont(bparams, 0, outerfont));
} else { } else {
running_font.latexWriteEndChanges(os, basefont, running_font.latexWriteEndChanges(os, bparams,
basefont); runparams, basefont, basefont);
} }
#else #else
#ifdef WITH_WARNINGS #ifdef WITH_WARNINGS
@ -2099,7 +2107,8 @@ bool Paragraph::simpleTeXOnePar(Buffer const & buf,
//#warning there as we start another \selectlanguage with the next paragraph if //#warning there as we start another \selectlanguage with the next paragraph if
//#warning we are in need of this. This should be fixed sometime (Jug) //#warning we are in need of this. This should be fixed sometime (Jug)
#endif #endif
running_font.latexWriteEndChanges(os, basefont, basefont); running_font.latexWriteEndChanges(os, bparams, runparams,
basefont, basefont);
#endif #endif
} }

View File

@ -297,7 +297,8 @@ TeXOnePar(Buffer const & buf,
} }
// Switch file encoding if necessary // Switch file encoding if necessary
if (bparams.inputenc == "auto") { if (bparams.inputenc == "auto" &&
runparams.encoding->package() == Encoding::inputenc) {
// Look ahead for future encoding changes. // Look ahead for future encoding changes.
// We try to output them at the beginning of the paragraph, // We try to output them at the beginning of the paragraph,
// since the \inputencoding command is not allowed e.g. in // since the \inputencoding command is not allowed e.g. in
@ -313,7 +314,8 @@ TeXOnePar(Buffer const & buf,
// encoding to that required by the language of c. // encoding to that required by the language of c.
Encoding const * const encoding = Encoding const * const encoding =
pit->getFontSettings(bparams, i).language()->encoding(); pit->getFontSettings(bparams, i).language()->encoding();
if (switchEncoding(os, bparams, false, if (encoding->package() == Encoding::inputenc &&
switchEncoding(os, bparams, false,
*(runparams.encoding), *encoding) > 0) { *(runparams.encoding), *encoding) > 0) {
runparams.encoding = encoding; runparams.encoding = encoding;
os << '\n'; os << '\n';
@ -601,25 +603,44 @@ int switchEncoding(odocstream & os, BufferParams const & bparams,
bool moving_arg, Encoding const & oldEnc, bool moving_arg, Encoding const & oldEnc,
Encoding const & newEnc) Encoding const & newEnc)
{ {
// FIXME thailatex does not support the inputenc package, so we // FIXME We ignore encoding switches from/to encodings that
// ignore switches from/to tis620-0 encoding here. This does of // support neither the inputenc package nor the CJK package here.
// course only work as long as the non-thai text contains ASCII // This does of course only work in special cases (e.g. a switch from
// only, but it is the best we can do. // tis620-0 to latin1 where the text in latin1 contains ASCII only),
// Since the \inputencoding command does not work inside sections // but it is the best we can do.
// we ignore the encoding switch also in moving arguments.
if (((bparams.inputenc == "auto" && !moving_arg) || if (((bparams.inputenc == "auto" && !moving_arg) ||
bparams.inputenc == "default") && bparams.inputenc == "default") &&
oldEnc.name() != newEnc.name() && oldEnc.name() != newEnc.name() &&
oldEnc.name() != "ascii" && newEnc.name() != "ascii" && oldEnc.package() != Encoding::none &&
oldEnc.name() != "tis620-0" && newEnc.name() != "tis620-0") { newEnc.package() != Encoding::none) {
LYXERR(Debug::LATEX) << "Changing LaTeX encoding from " LYXERR(Debug::LATEX) << "Changing LaTeX encoding from "
<< oldEnc.name() << " to " << oldEnc.name() << " to "
<< newEnc.name() << endl; << newEnc.name() << endl;
os << setEncoding(newEnc.iconvName()); os << setEncoding(newEnc.iconvName());
if (bparams.inputenc != "default") { if (bparams.inputenc != "default") {
docstring const inputenc(from_ascii(newEnc.latexName())); docstring const inputenc(from_ascii(newEnc.latexName()));
switch (newEnc.package()) {
case Encoding::none:
break;
case Encoding::inputenc: {
int count = inputenc.length();
if (oldEnc.package() == Encoding::CJK) {
os << "\\end{CJK}";
count += 9;
}
os << "\\inputencoding{" << inputenc << '}'; os << "\\inputencoding{" << inputenc << '}';
return 16 + inputenc.length(); return count + 16;
}
case Encoding::CJK: {
int count = inputenc.length();
if (oldEnc.package() == Encoding::CJK) {
os << "\\end{CJK}";
count += 9;
}
os << "\\begin{CJK}{" << inputenc << "}{}";
return count + 15;
}
}
} }
} }
return 0; return 0;