Fix plaintext output of dashes (bug #3647)

Previously, consecutive dashes in .lyx files were combined to endash and emdash
in some cases, and in other cases they were output as is. This made the code
complicated, and resulted in inconsistencies (bug #3647).
Now, a dash in a .lyx file is always a dash in the output, for all flavours.
The special handling is moved to the input side, so that you still get an
endash if you type two hyphens. If needed, this can be changed or made
customizable without the need to update the file format again. Many thanks
for the fruitful mailing list discussion, which contributed significantly to
the final version.
This commit is contained in:
Georg Baum 2015-02-24 21:58:27 +01:00
parent fc9e6326b9
commit 8aa37c43a1
24 changed files with 169 additions and 89 deletions

View File

@ -11,6 +11,11 @@ adjustments are made to tex2lyx and bugs are fixed in lyx2lyx.
-----------------------
2015-02-24 Georg Baum <Georg.Baum@post.rwth-aachen.de>
* Format incremented to 481
"--" and "---" are not treated as endash and emdash anymore, since
we have unicode symbols for that now (bug 3647).
2015-01-09 Jürgen Spitzmüller <spitz@lyx.org>
* Format incremented to 480:
Add self-defined Question* and Question lemma types to

View File

@ -85,7 +85,7 @@ format_relation = [("0_06", [200], minor_versions("0.6" , 4)),
("1_6", range(277,346), minor_versions("1.6" , 10)),
("2_0", range(346,414), minor_versions("2.0", 8)),
("2_1", range(414,475), minor_versions("2.1", 0)),
("2_2", range(475,481), minor_versions("2.2", 0))
("2_2", range(475,482), minor_versions("2.2", 0))
]
####################################################################

View File

@ -480,6 +480,71 @@ def revert_question_env(document):
i = j
# lyx2lyx conversion step, listed for format 481 in the convert table.
# Walks document.body and rewrites every "--" / "---" found outside of
# ERT, Formula and IPA insets into the tokens \twohyphens / \threehyphens.
# The line is split right after each inserted token: the text following
# the hyphens is moved to a new body line of its own.
# Only documents whose backend is "latex" are touched.
def convert_dashes(document):
"convert -- and --- to \\twohyphens and \\threehyphens"
# Other backends are left alone.
if document.backend != "latex":
return
i = 0
while i < len(document.body):
words = document.body[i].split()
if len(words) > 1 and words[0] == "\\begin_inset" and \
words[1] in ["ERT", "Formula", "IPA"]:
# must not replace anything in math
# filtering out IPA makes Text::readParToken() simpler
# skip ERT as well since it is not needed there
j = find_end_of_inset(document.body, i)
if j == -1:
# No matching end found: warn and step over just the opening line.
document.warning("Malformed LyX document: Can't find end of " + words[1] + " inset at line " + str(i))
i += 1
else:
# Jump to the line index returned by find_end_of_inset, skipping the inset body.
i = j
continue
while True:
# Locate the first pair of hyphens on the current line; stop when none is left.
j = document.body[i].find("--")
if j == -1:
break
front = document.body[i][:j]
back = document.body[i][j+2:]
# We can have an arbitrary number of consecutive hyphens.
# These must be split into the corresponding number of two and three hyphens
# We must match what LaTeX does: First try emdash, then endash, then single hyphen
if back.find("-") == 0:
# A third hyphen follows directly: consume it and emit the three-hyphen token.
back = back[1:]
if len(back) > 0:
# The remainder becomes a new line after the current one.
document.body.insert(i+1, back)
document.body[i] = front + "\\threehyphens"
else:
if len(back) > 0:
# The remainder becomes a new line after the current one.
document.body.insert(i+1, back)
document.body[i] = front + "\\twohyphens"
# NOTE(review): indentation is lost in this view; presumably this belongs to
# the outer loop and advances to the next line (including a freshly inserted
# remainder line) once the inner loop has finished -- confirm against the file.
i += 1
# lyx2lyx reversion step, listed under 480 in the revert table.
# Replaces the tokens \twohyphens / \threehyphens in document.body by the
# literal strings "--" / "---" and, where possible, glues the following
# line back onto the line that contained the token.
def revert_dashes(document):
"convert \\twohyphens and \\threehyphens to -- and ---"
i = 0
while i < len(document.body):
# Tracks whether the current line contained at least one of the two tokens.
replaced = False
if document.body[i].find("\\twohyphens") >= 0:
document.body[i] = document.body[i].replace("\\twohyphens", "--")
replaced = True
if document.body[i].find("\\threehyphens") >= 0:
document.body[i] = document.body[i].replace("\\threehyphens", "---")
replaced = True
# Merge the next line into this one only if something was replaced here,
# the next line does not start with a backslash (unless it starts with one
# of the two hyphen tokens), and the combined length stays within 80
# characters.
if replaced and i+1 < len(document.body) and \
(document.body[i+1].find("\\") != 0 or \
document.body[i+1].find("\\twohyphens") == 0 or
document.body[i+1].find("\\threehyphens") == 0) and \
len(document.body[i]) + len(document.body[i+1]) <= 80:
document.body[i] = document.body[i] + document.body[i+1]
# Drop the line that was just appended; i is not advanced here, so the
# merged line is examined again on the next pass.
document.body[i+1:i+2] = []
else:
# NOTE(review): indentation is lost in this view; presumably this else
# pairs with the merge condition above -- confirm against the file.
i += 1
##
# Conversion hub
@ -495,10 +560,12 @@ convert = [
[477, []],
[478, []],
[479, []],
[480, []]
[480, []],
[481, [convert_dashes]]
]
revert = [
[480, [revert_dashes]],
[479, [revert_question_env]],
[478, [revert_beamer_lemma]],
[477, [revert_xarrow]],

View File

@ -364,12 +364,6 @@ public:
pos_type i,
unsigned int & column);
///
bool latexSpecialTypewriter(
char_type const c,
otexstream & os,
pos_type i,
unsigned int & column);
///
bool latexSpecialPhrase(
otexstream & os,
pos_type & i,
@ -1216,12 +1210,6 @@ void Paragraph::Private::latexSpecialChar(otexstream & os,
&& lyxrc.fontenc == "T1" && latexSpecialT1(c, os, i, column))
return;
// \tt font needs special treatment
if (!runparams.inIPA
&& running_font.fontInfo().family() == TYPEWRITER_FAMILY
&& latexSpecialTypewriter(c, os, i, column))
return;
// Otherwise, we use what LaTeX provides us.
switch (c) {
case '\\':
@ -1242,6 +1230,14 @@ void Paragraph::Private::latexSpecialChar(otexstream & os,
break;
case '-':
os << '-';
if (i + 1 < end_pos && text_[i+1] == '-') {
// Prevent "--" becoming an endash and "---" becoming
// an emdash.
// Within \ttfamily, "--" is merged to "-" (no endash)
// so we avoid this rather irritating ligature as well
os << "{}";
column += 2;
}
break;
case '\"':
os << "\\char`\\\"{}";
@ -1401,28 +1397,6 @@ bool Paragraph::Private::latexSpecialT3(char_type const c, otexstream & os,
}
// Writes the LaTeX form of character c (located at position i of text_) to os
// for the characters this function handles; currently that is only '-'.
// Returns true if c was written here, false if the caller still has to
// output it. column is increased by 2 when the "-{}" form is emitted and is
// left untouched otherwise.
bool Paragraph::Private::latexSpecialTypewriter(char_type const c, otexstream & os,
pos_type i, unsigned int & column)
{
switch (c) {
case '-':
// within \ttfamily, "--" is merged to "-" (no endash)
// so we avoid this rather irritating ligature
// A hyphen directly followed by another hyphen gets an empty group
// appended; a lone hyphen is written unchanged.
if (i + 1 < int(text_.size()) && text_[i + 1] == '-') {
os << "-{}";
column += 2;
} else
os << '-';
return true;
// everything else has to be checked separately
// (depending on the encoding)
default:
return false;
}
}
/// \param end_pos
/// If [start_pos, end_pos) does not include entirely the special phrase, then
/// do not apply the macro transformation.
@ -3159,31 +3133,7 @@ docstring Paragraph::simpleLyXHTMLOnePar(Buffer const & buf,
}
} else {
char_type c = getUChar(buf.masterBuffer()->params(), i);
if (style.pass_thru || runparams.pass_thru)
xs << c;
else if (c == '-' && !runparams.inIPA &&
font.fontInfo().family() != TYPEWRITER_FAMILY) {
docstring str;
int j = i + 1;
if (j < size() && d->text_[j] == '-') {
j += 1;
if (j < size() && d->text_[j] == '-') {
str += from_ascii("&mdash;");
i += 2;
} else {
str += from_ascii("&ndash;");
i += 1;
}
}
else
str += c;
// We don't want to escape the entities. Note that
// it is safe to do this, since str can otherwise
// only be "-". E.g., it can't be "<".
xs << XHTMLStream::ESCAPE_NONE << str;
} else
xs << c;
xs << c;
}
font_old = font.fontInfo();
}
@ -3258,9 +3208,7 @@ bool Paragraph::isHardHyphenOrApostrophe(pos_type pos) const
if ((nextpos == psize || isSpace(nextpos))
&& (pos == 0 || isSpace(prevpos)))
return false;
return c == '\''
|| ((nextpos == psize || d->text_[nextpos] != '-')
&& (pos == 0 || d->text_[prevpos] != '-'));
return true;
}

View File

@ -498,6 +498,23 @@ void Text::readParToken(Paragraph & par, Lexer & lex,
inset->read(lex);
inset->setBuffer(*buf);
par.insertInset(par.size(), inset.release(), font, change);
} else if (token == "\\twohyphens" || token == "\\threehyphens") {
// Ideally, this should be done by lyx2lyx, but lyx2lyx does not know the
// running font and does not know anything about layouts (and CopyStyle).
Layout const & layout(par.layout());
FontInfo info = font.fontInfo();
info.realize(layout.resfont);
if (layout.pass_thru || info.family() == TYPEWRITER_FAMILY) {
if (token == "\\twohyphens")
par.insert(par.size(), from_ascii("--"), font, change);
else
par.insert(par.size(), from_ascii("---"), font, change);
} else {
if (token == "\\twohyphens")
par.insertChar(par.size(), 0x2013, font, change);
else
par.insertChar(par.size(), 0x2014, font, change);
}
} else if (token == "\\backslash") {
par.appendChar('\\', font, change);
} else if (token == "\\LyXTable") {
@ -1019,14 +1036,36 @@ void Text::insertChar(Cursor & cur, char_type c)
}
}
par.insertChar(cur.pos(), c, cur.current_font,
pos_type pos = cur.pos();
if (!cur.paragraph().isPassThru() && owner_->lyxCode() != IPA_CODE &&
cur.current_font.fontInfo().family() != TYPEWRITER_FAMILY &&
c == '-' && pos > 0) {
if (par.getChar(pos - 1) == '-') {
// convert "--" to endash
par.eraseChar(pos - 1, cur.buffer()->params().track_changes);
c = 0x2013;
pos--;
} else if (par.getChar(pos - 1) == 0x2013) {
// convert "---" to emdash
par.eraseChar(pos - 1, cur.buffer()->params().track_changes);
c = 0x2014;
pos--;
} else if (par.getChar(pos - 1) == 0x2014) {
// convert "----" to "-"
par.eraseChar(pos - 1, cur.buffer()->params().track_changes);
c = '-';
pos--;
}
}
par.insertChar(pos, c, cur.current_font,
cur.buffer()->params().track_changes);
cur.checkBufferStructure();
// cur.screenUpdateFlags(Update::Force);
bool boundary = cur.boundary()
|| tm.isRTLBoundary(cur.pit(), cur.pos() + 1);
setCursor(cur, cur.pit(), cur.pos() + 1, false, boundary);
|| tm.isRTLBoundary(cur.pit(), pos + 1);
setCursor(cur, cur.pit(), pos + 1, false, boundary);
charInserted(cur);
}

View File

@ -82,7 +82,8 @@ Context::Context(bool need_layout_,
: need_layout(need_layout_),
need_end_layout(false), need_end_deeper(false),
has_item(false), deeper_paragraph(false),
new_layout_allowed(true), textclass(textclass_),
new_layout_allowed(true), merging_hyphens_allowed(true),
textclass(textclass_),
layout(layout_), parent_layout(parent_layout_),
font(font_)
{
@ -240,6 +241,8 @@ void Context::dump(ostream & os, string const & desc) const
os << "deeper_paragraph ";
if (new_layout_allowed)
os << "new_layout_allowed ";
if (merging_hyphens_allowed)
os << "merging_hyphens_allowed ";
if (!extra_stuff.empty())
os << "extrastuff=[" << extra_stuff << "] ";
if (!par_extra_stuff.empty())

View File

@ -146,6 +146,8 @@ public:
* would not work.
*/
bool new_layout_allowed;
/// May -- be converted to endash and --- to emdash?
bool merging_hyphens_allowed;
/// Did we output anything yet in any context?
static bool empty;

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article
@ -3507,7 +3507,7 @@ A long table
\begin_inset Caption Standard
\begin_layout Standard
A long table -- continued
A long table continued
\end_layout
\end_inset
@ -6769,7 +6769,7 @@ fy ligature break.
\end_layout
\begin_layout Standard
There are dashes: endash in short form -- and long form , emdash is alike: --- and —. If we really want several hyphens in a row, we need to separate them: -
There are dashes: endash in short form and long form , emdash is alike: — and —. If we really want several hyphens in a row, we need to separate them: -
\begin_inset ERT
status collapsed

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass memoir

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass amsart

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass book

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass scrbook

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -1,5 +1,5 @@
#LyX file created by tex2lyx 2.2
\lyxformat 480
\lyxformat 481
\begin_document
\begin_header
\textclass article

View File

@ -2387,9 +2387,22 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
else if (t.cat() == catOther ||
t.cat() == catAlign ||
t.cat() == catParameter) {
// This translates "&" to "\\&" which may be wrong...
context.check_layout(os);
os << t.cs();
if (t.asInput() == "-" && p.next_token().asInput() == "-" &&
context.merging_hyphens_allowed &&
context.font.family != "ttfamily" &&
!context.layout->pass_thru) {
if (p.next_next_token().asInput() == "-") {
// --- is emdash
os << to_utf8(docstring(1, 0x2014));
p.get_token();
} else
// -- is endash
os << to_utf8(docstring(1, 0x2013));
p.get_token();
} else
// This translates "&" to "\\&" which may be wrong...
os << t.cs();
}
else if (p.isParagraph()) {
@ -3240,7 +3253,10 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
else if (t.cs() == "textipa") {
context.check_layout(os);
begin_inset(os, "IPA\n");
bool merging_hyphens_allowed = context.merging_hyphens_allowed;
context.merging_hyphens_allowed = false;
parse_text_in_inset(p, os, FLAG_ITEM, outer, context);
context.merging_hyphens_allowed = merging_hyphens_allowed;
end_inset(os);
preamble.registerAutomaticallyLoadedPackage("tipa");
preamble.registerAutomaticallyLoadedPackage("tipx");

View File

@ -36,8 +36,8 @@ extern char const * const lyx_version_info;
// Do not remove the comment below, so we get merge conflict in
// independent branches. Instead add your own.
#define LYX_FORMAT_LYX 480 // spitz: question and question* environments
#define LYX_FORMAT_TEX2LYX 480
#define LYX_FORMAT_LYX 481 // gb: endash and emdash
#define LYX_FORMAT_TEX2LYX 481
#if LYX_FORMAT_TEX2LYX != LYX_FORMAT_LYX
#ifndef _MSC_VER