Fix bug #5408: tex2lyx cannot handle verbatim code

- Implement catcode setting in Parser.
- Add a new Parser::verbatimStuff method that reads verbatim contents.
- Use this method to parse the "verbatim" environment.
- Use it to parse \verb too.
- Rename Parser::verbatimEnvironment to ertEnvironment.

TODO:
- Use for other verbatim-like cases (Sweave chunk, lstlisting...).
- Factor out the function that outputs ERT (including line breaks).
- Maybe implement Parser::unparse (if needed).
2025-01-13 20:09:59 +00:00 · 2013-01-25 12:48:52 +01:00 · 2013-01-25 12:48:52 +01:00 · be42f1398d
commit be42f1398d
parent e609787a0f
6 changed files with 156 additions and 97 deletions
--- a/lib/layouts/stdlayouts.inc
+++ b/lib/layouts/stdlayouts.inc
@ -79,6 +79,7 @@ Style Verbatim
 	ParbreakIsNewline	1
 	FreeSpacing		1
 	PassThru		1
+	KeepEmpty		1
 	NewLine			0
 	ParSkip			0.4
 	TopSep			0.7
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@ -22,39 +22,6 @@ namespace lyx {

 namespace {

-CatCode theCatcode[256];
-
-void catInit()
-{
-	static bool init_done = false;
-	if (init_done)
-		return;
-	init_done = true;
-
-	fill(theCatcode, theCatcode + 256, catOther);
-	fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
-	fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
-
-	theCatcode[int('\\')] = catEscape;
-	theCatcode[int('{')]  = catBegin;
-	theCatcode[int('}')]  = catEnd;
-	theCatcode[int('$')]  = catMath;
-	theCatcode[int('&')]  = catAlign;
-	theCatcode[int('\n')] = catNewline;
-	theCatcode[int('#')]  = catParameter;
-	theCatcode[int('^')]  = catSuper;
-	theCatcode[int('_')]  = catSub;
-	theCatcode[0x7f]      = catIgnore;
-	theCatcode[int(' ')]  = catSpace;
-	theCatcode[int('\t')] = catSpace;
-	theCatcode[int('\r')] = catNewline;
-	theCatcode[int('~')]  = catActive;
-	theCatcode[int('%')]  = catComment;
-
-	// This is wrong!
-	theCatcode[int('@')]  = catLetter;
-}
-
 /*!
 * Translate a line ending to '\n'.
 * \p c must have catcode catNewline, and it must be the last character read
@ -79,16 +46,8 @@ char_type getNewline(idocstream & is, char_type c)
 	return c;
 }

-CatCode catcode(char_type c)
-{
-	if (c < 256)
-		return theCatcode[(unsigned char)c];
-	return catOther;
 }

-}
-
-
 //
 // Token
 //
@ -158,7 +117,8 @@ void debugToken(std::ostream & os, Token const & t, unsigned int flags)


 Parser::Parser(idocstream & is)
-	: lineno_(0), pos_(0), iss_(0), is_(is), encoding_iconv_("UTF-8")
+	: lineno_(0), pos_(0), iss_(0), is_(is), encoding_iconv_("UTF-8"),
+	  theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES)
 {
 }

@ -166,7 +126,8 @@ Parser::Parser(idocstream & is)
 Parser::Parser(string const & s)
 	: lineno_(0), pos_(0),
 	  iss_(new idocstringstream(from_utf8(s))), is_(*iss_),
-	  encoding_iconv_("UTF-8")
+	  encoding_iconv_("UTF-8"),
+	  theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES)
 {
 }

@ -192,6 +153,57 @@ void Parser::setEncoding(std::string const & e, int const & p)
 }


+void Parser::catInit()
+{
+	if (curr_cat_ == theCatcodesType_)
+		return;
+	curr_cat_ = theCatcodesType_;
+
+	fill(theCatcode_, theCatcode_ + 256, catOther);
+	fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter);
+	fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter);
+	// This is wrong!
+	theCatcode_[int('@')]  = catLetter;
+
+	if (theCatcodesType_ == NORMAL_CATCODES) {
+		theCatcode_[int('\\')] = catEscape;
+		theCatcode_[int('{')]  = catBegin;
+		theCatcode_[int('}')]  = catEnd;
+		theCatcode_[int('$')]  = catMath;
+		theCatcode_[int('&')]  = catAlign;
+		theCatcode_[int('\n')] = catNewline;
+		theCatcode_[int('#')]  = catParameter;
+		theCatcode_[int('^')]  = catSuper;
+		theCatcode_[int('_')]  = catSub;
+		theCatcode_[0x7f]      = catIgnore;
+		theCatcode_[int(' ')]  = catSpace;
+		theCatcode_[int('\t')] = catSpace;
+		theCatcode_[int('\r')] = catNewline;
+		theCatcode_[int('~')]  = catActive;
+		theCatcode_[int('%')]  = catComment;
+	}
+}
+
+CatCode Parser::catcode(char_type c) const
+{
+	if (c < 256)
+		return theCatcode_[(unsigned char)c];
+	return catOther;
+}
+
+
+void Parser::setCatcode(char c, CatCode cat)
+{
+	theCatcode_[(unsigned char)c] = cat;
+}
+
+
+void Parser::setCatcodes(cat_type t)
+{
+	theCatcodesType_ = t;
+}
+
+
 void Parser::setEncoding(std::string const & e)
 {
 	//cerr << "setting encoding to " << e << std::endl;
@ -472,7 +484,7 @@ string Parser::getFullParentheseArg()
 }


-string const Parser::verbatimEnvironment(string const & name)
+string const Parser::ertEnvironment(string const & name)
 {
 	if (!good())
 		return string();
@ -485,7 +497,7 @@ string const Parser::verbatimEnvironment(string const & name)
 		} else if (t.asInput() == "\\begin") {
 			string const env = getArg('{', '}');
 			os << "\\begin{" << env << '}'
-			   << verbatimEnvironment(env)
+			   << ertEnvironment(env)
 			   << "\\end{" << env << '}';
 		} else if (t.asInput() == "\\end") {
 			string const end = getArg('{', '}');
@ -545,6 +557,34 @@ string const Parser::plainCommand(char left, char right, string const & name)
 }


+string const Parser::verbatimStuff(string const & end_string)
+{
+	if (!good())
+		return string();
+
+	ostringstream oss;
+	size_t match_index = 0;
+	setCatcodes(VERBATIM_CATCODES);
+	for (Token t = get_token(); good(); t = get_token()) {
+		// FIXME t.asInput() might be longer than we need ?
+		if (t.asInput() == end_string.substr(match_index,
+						     t.asInput().length())) {
+			match_index += t.asInput().length();
+			if (match_index >= end_string.length())
+				break;
+		} else if (match_index) {
+			oss << end_string.substr(0, match_index) << t.asInput();
+			match_index = 0;
+		} else
+			oss << t.asInput();
+	}
+	setCatcodes(NORMAL_CATCODES);
+	if (!good())
+		cerr << "unexpected end of input" << endl;
+	return oss.str();
+}
+
+
 void Parser::tokenize_one()
 {
 	catInit();
@ -687,16 +727,4 @@ void Parser::reset()
 }


-void Parser::setCatCode(char c, CatCode cat)
-{
-	theCatcode[(unsigned char)c] = cat;
-}
-
-
-CatCode Parser::getCatCode(char c) const
-{
-	return theCatcode[(unsigned char)c];
-}
-
-
 } // namespace lyx
--- a/src/tex2lyx/Parser.h
+++ b/src/tex2lyx/Parser.h
@ -46,6 +46,12 @@ enum CatCode {
 	catInvalid     // 15   <delete>
 };

+enum cat_type {
+	NORMAL_CATCODES,
+	VERBATIM_CATCODES,
+	UNDECIDED_CATCODES
+};
+

 enum {
 	FLAG_BRACE_LAST = 1 << 1,  //  last closing brace ends the parsing
@ -135,6 +141,13 @@ public:
 	///
 	~Parser();

+	///
+	CatCode catcode(char_type c) const;
+	///
+	void setCatcode(char c, CatCode cat);
+	/// set parser to normal or verbatim mode
+	void setCatcodes(cat_type t);
+
 	/// change the iconv encoding of the input stream
 	/// according to the latex encoding and package
 	void setEncoding(std::string const & encoding, int const & package);
@ -202,11 +215,11 @@ public:
 	/*!
 	 * \returns the contents of the environment \p name.
 	 * <tt>\begin{name}</tt> must be parsed already, <tt>\end{name}</tt>
-	 * is parsed but not returned.
+	 * is parsed but not returned. This parses nested environments properly.
 	 */
-	std::string const verbatimEnvironment(std::string const & name);
+	std::string const ertEnvironment(std::string const & name);
 	/*
-	 * The same as verbatimEnvironment(std::string const & name) but
+	 * The same as ertEnvironment(std::string const & name) but
 	 * \begin and \end commands inside the name environment are not parsed.
 	 * This function is designed to parse verbatim environments.
 	 */
@ -218,6 +231,14 @@ public:
 	 * This function is designed to parse verbatim commands.
 	 */
 	std::string const plainCommand(char left, char right, std::string const & name);
+	/*
+	 * Basically the same as plainEnvironment() but the parsing is
+	 * stopped at string \p end_string. Contrary to the other
+	 * methods, this uses proper catcode setting. This function is
+	 * designed to parse verbatim environments and commands. The
+	 * intention is to eventually replace all of its siblings.
+	 */
+	std::string const verbatimStuff(std::string const & end_string);
 	/*!
 	 * Returns the character of the current token and increments
 	 * the token position.
@ -225,7 +246,7 @@ public:
 	char getChar();
 	///
 	void error(std::string const & msg);
-	/// Parses one token from \p is 
+	/// Parses one token from \p is
 	void tokenize_one();
 	///
 	void push_back(Token const & t);
@ -256,12 +277,10 @@ public:
 	std::string verbatimOption();
 	/// resets the parser to initial state
 	void reset();
-	///
-	void setCatCode(char c, CatCode cat);
-	///
-	CatCode getCatCode(char c) const;

 private:
+	/// Setup catcode table
+	void catInit();
 	///
 	int lineno_;
 	///
@ -276,6 +295,12 @@ private:
 	idocstream & is_;
 	/// iconv name of the current encoding
 	std::string encoding_iconv_;
+	///
+	CatCode theCatcode_[256];
+	//
+	cat_type theCatcodesType_;
+	//
+	cat_type curr_cat_;
 };


--- a/src/tex2lyx/Preamble.cpp
+++ b/src/tex2lyx/Preamble.cpp
@ -1299,12 +1299,12 @@ void Preamble::parse(Parser & p, string const & forceclass,

 		else if (t.cs() == "makeatletter") {
 			// LyX takes care of this
-			p.setCatCode('@', catLetter);
+			p.setCatcode('@', catLetter);
 		}

 		else if (t.cs() == "makeatother") {
 			// LyX takes care of this
-			p.setCatCode('@', catOther);
+			p.setCatcode('@', catOther);
 		}

 		else if (t.cs() == "newcommand" || t.cs() == "newcommandx"
--- a/src/tex2lyx/table.cpp
+++ b/src/tex2lyx/table.cpp
@ -786,7 +786,7 @@ void parse_table(Parser & p, ostream & os, bool is_long_tabular,
 			// treat the nested environment as a block, don't
 			// parse &, \\ etc, because they don't belong to our
 			// table if they appear.
-			os << p.verbatimEnvironment(name);
+			os << p.ertEnvironment(name);
 			os << "\\end{" << name << '}';
 			active_environments.pop_back();
 		}
@ -1227,7 +1227,7 @@ void handle_tabular(Parser & p, ostream & os, string const & name,
 							angle = p.getArg('{', '}');
 						}
 						active_environments.push_back(env);
-						p.verbatimEnvironment(env);
+						p.ertEnvironment(env);
 						active_environments.pop_back();
 						p.skip_spaces();
 						if (!p.good() && support::isStrInt(angle))
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@ -924,7 +924,7 @@ void parse_box(Parser & p, ostream & os, unsigned outer_flags,
 		// If yes, we need to output ERT.
 		p.pushPosition();
 		if (inner_flags & FLAG_END)
-			p.verbatimEnvironment(inner_type);
+			p.ertEnvironment(inner_type);
 		else
 			p.verbatim_item();
 		p.skip_spaces(true);
@ -1435,27 +1435,36 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 	}

 	else if (name == "verbatim") {
-		os << "\n\\end_layout\n\n\\begin_layout Verbatim\n";
-		string const s = p.plainEnvironment("verbatim");
+		// FIXME: this should go in the generic code that
+		// handles environments defined in layout file that
+		// have "PassThru 1". However, the code there is
+		// already too complicated for my taste.
+		parent_context.new_paragraph(os);
+		Context context(true, parent_context.textclass,
+				   &parent_context.textclass[from_ascii("Verbatim")]);
+		context.check_layout(os);
+		string s = p.verbatimStuff("\\end{verbatim}");
+		// ignore one newline at beginning or end of string
+		if (prefixIs(s, "\n"))
+			s.erase(0,1);
+		if (suffixIs(s, "\n"))
+			s.erase(s.length(),1);
+
 		string::const_iterator it2 = s.begin();
 		for (string::const_iterator it = s.begin(), et = s.end(); it != et; ++it) {
-			if (*it == '\\')
-				os << "\\backslash ";
-			else if (*it == '\n') {
-				it2 = it + 1;
-				// avoid adding an empty paragraph at the end
-				// FIXME: if there are 2 consecutive spaces at the end ignore it
-				// because LyX will re-add a \n
-				// This hack must be removed once bug 8049 is fixed!
-				if ((it + 1 != et) && (it + 2 != et || *it2 != '\n'))
-					os << "\n\\end_layout\n\\begin_layout Verbatim\n";
-			} else
+			context.check_layout(os);
+			if (*it == '\\') {
+				os << "\n\\backslash\n";
+				context.need_end_layout = true;
+			} else if (*it == '\n') {
+				context.new_paragraph(os);
+			} else {
 				os << *it;
+				context.need_end_layout = true;
+			}
 		}
-		os << "\n\\end_layout\n\n";
+		context.new_paragraph(os);
 		p.skip_spaces();
-		// reset to Standard layout
-		os << "\n\\begin_layout Standard\n";
 	}

 	else if (name == "CJK") {
@ -1758,7 +1767,7 @@ void parse_environment(Parser & p, ostream & os, bool outer,
 		parse_arguments("\\begin{" + name + "}", arguments, p, os,
 				outer, parent_context);
 		if (contents == verbatim)
-			handle_ert(os, p.verbatimEnvironment(name),
+			handle_ert(os, p.ertEnvironment(name),
 				   parent_context);
 		else
 			parse_text_snippet(p, os, FLAG_END, outer,
@ -3819,15 +3828,11 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,

 		else if (t.cs() == "verb") {
 			context.check_layout(os);
-			char const delimiter = p.next_token().character();
-			// \verb is special: The usual escaping rules do not
-			// apply, e.g. "\verb+\+" is valid and denotes a single
-			// backslash (bug #4468). Therefore we do not allow
-			// escaping in getArg().
-			string const arg = p.getArg(delimiter, delimiter, false);
-			ostringstream oss;
-			oss << "\\verb" << delimiter << arg << delimiter;
-			handle_ert(os, oss.str(), context);
+			// set catcodes to verbatim early, just in case.
+			p.setCatcodes(VERBATIM_CATCODES);
+			string delim = p.get_token().asInput();
+			string const arg = p.verbatimStuff(delim);
+			handle_ert(os, "\\verb" + delim + arg + delim, context);
 		}

 		// Problem: \= creates a tabstop inside the tabbing environment
@ -4574,7 +4579,7 @@ string guessLanguage(Parser & p, string const & lang)
 			p.setEncoding(encoding, Encoding::CJK);
 		else
 			p.setEncoding("UTF-8");
-		string const text = p.verbatimEnvironment("CJK");
+		string const text = p.ertEnvironment("CJK");
 		p.setEncoding(encoding_old);
 		p.skip_spaces();
 		if (!where) {