Make tex2lyx encoding changes more robust

This is achieved by no longer calling Parser::tokenize_one() in Parser::good(): The status of the input can be tested without performing the actual tokenizing. Now there are only two methods that may prevent an encoding change: next_token() and next_next_token().
2024-11-27 03:36:39 +00:00 · 2013-02-17 14:53:56 +01:00 · 2013-02-17 14:53:56 +01:00 · 25fe87e55c
commit 25fe87e55c
parent 94cb22ac43
2 changed files with 68 additions and 27 deletions
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@ -118,6 +118,17 @@ void debugToken(std::ostream & os, Token const & t, unsigned int flags)
 // Wrapper
 //

+bool iparserdocstream::setEncoding(std::string const & e)
+{
+	is_ << lyx::setEncoding(e); // the switch is done in any case
+	if (s_.empty()) // no put-back characters are waiting to be re-read
+		return true;
+	cerr << "Setting encoding " << e << " too late. The encoding of `"
+	     << to_utf8(s_) << "´ is wrong." << std::endl;
+	return false; // the switch came too late for the characters in s_
+}
+
+
 void iparserdocstream::putback(char_type c)
 {
 	s_ += c;
@ -182,7 +193,7 @@ void Parser::deparse()
 }


-void Parser::setEncoding(std::string const & e, int const & p)
+bool Parser::setEncoding(std::string const & e, int const & p)
 {
 	// We may (and need to) use unsafe encodings here: Since the text is
 	// converted to unicode while reading from is_, we never see text in
@ -191,9 +202,9 @@ void Parser::setEncoding(std::string const & e, int const & p)
 	Encoding const * const enc = encodings.fromLaTeXName(e, p, true);
 	if (!enc) {
 		cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
-		return;
+		return false;
 	}
-	setEncoding(enc->iconvName());
+	return setEncoding(enc->iconvName());
 }


@ -250,11 +261,11 @@ void Parser::setCatcodes(cat_type t)
 }


-void Parser::setEncoding(std::string const & e)
+bool Parser::setEncoding(std::string const & e)
 {
 	//cerr << "setting encoding to " << e << std::endl;
-	is_.docstream() << lyx::setEncoding(e);
 	encoding_iconv_ = e;
+	return is_.setEncoding(e); // false if put-back characters are still pending
 }


@ -284,7 +295,11 @@ Token const Parser::curr_token() const
 Token const Parser::next_token()
 {
 	static const Token dummy;
-	return good() ? tokens_[pos_] : dummy;
+	if (!good()) // no buffered token and no further input
+		return dummy;
+	if (pos_ >= tokens_.size()) // good() does not tokenize, do it on demand
+		tokenize_one();
+	return pos_ < tokens_.size() ? tokens_[pos_] : dummy; // pos_ is not advanced
 }


@ -292,11 +307,14 @@ Token const Parser::next_token()
 Token const Parser::next_next_token()
 {
 	static const Token dummy;
-	// If good() has not been called after the last get_token() we need
-	// to tokenize two more tokens.
-	if (pos_ + 1 >= tokens_.size()) {
-		tokenize_one();
-		tokenize_one();
-	}
+	if (!good())
+		return dummy;
+	// Make sure that the next and the next but one token are tokenized.
+	// Both checks are needed: After next_token() exactly one token is
+	// pending, and the next but one still has to be read here.
+	if (pos_ >= tokens_.size())
+		tokenize_one();
+	if (pos_ + 1 >= tokens_.size())
+		tokenize_one();
 	return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
 }
@ -306,10 +324,16 @@ Token const Parser::next_next_token()
 Token const Parser::get_token()
 {
 	static const Token dummy;
-	// if (good()) 
-	// 	cerr << "looking at token " << tokens_[pos_] 
-	// 	     << " pos: " << pos_ << '\n';
-	return good() ? tokens_[pos_++] : dummy;
+	if (!good()) // no buffered token and no further input
+		return dummy;
+	if (pos_ >= tokens_.size()) { // good() does not tokenize, do it on demand
+		tokenize_one();
+		if (pos_ >= tokens_.size()) // tokenize_one() did not add a token
+			return dummy;
+	}
+	// cerr << "looking at token " << tokens_[pos_] 
+	//      << " pos: " << pos_ << '\n';
+	return tokens_[pos_++]; // hand out the token and make it the current one
 }


@ -408,8 +432,9 @@ bool Parser::good()
 {
 	if (pos_ < tokens_.size())
 		return true;
-	tokenize_one();
-	return pos_ < tokens_.size();
+	if (!is_.good()) // only the stream state is tested, nothing is tokenized
+		return false;
+	return is_.peek() != idocstream::traits_type::eof(); // a character is left to read
 }


--- a/src/tex2lyx/Parser.h
+++ b/src/tex2lyx/Parser.h
@ -117,15 +117,19 @@ std::ostream & operator<<(std::ostream & os, Token const & t);
 extern void debugToken(std::ostream & os, Token const & t, unsigned int flags);
 #endif

-// A docstream version that supports putback even when not buffered
+/// A docstream version that supports putback even when not buffered
 class iparserdocstream
 {
 public:
+	typedef idocstream::int_type int_type;
+
 	iparserdocstream(idocstream & is) : is_(is) {};

-	operator bool() const { return is_; };
+	/// Like std::istream::operator bool(); true while put-back chars are pending
+	operator bool() const { return s_.empty() ? !is_.fail() : true; }

-	idocstream & docstream() { return is_; };
+	/// change the encoding of the input stream to \p e (iconv name)
+	bool setEncoding(std::string const & e);

 	// add to the list of characters to read before actually reading
 	// the stream
@ -135,7 +139,14 @@ public:
 	// the stream
 	void put_almost_back(docstring s);

+	/// Like std::istream::get()
 	iparserdocstream & get(char_type &c);
+
+	/// Like std::istream::good(), but always true while put-back characters are pending
+	bool good() const { return s_.empty() ? is_.good() : true; }
+
+	/// Like std::istream::peek(), but a pending put-back character is looked at first
+	int_type peek() const { return s_.empty() ? is_.peek() : s_[0]; }
 private:
 	///
 	idocstream & is_;
@ -172,11 +183,11 @@ public:
 	 * re-reading. Useful when changing catcodes. */
 	void deparse();

-	/// change the iconv encoding of the input stream
-	/// according to the latex encoding and package
-	void setEncoding(std::string const & encoding, int const & package);
-	/// change the iconv encoding of the input stream
-	void setEncoding(std::string const & encoding);
+	/// change the encoding of the input stream according to \p encoding
+	/// (latex name) and package \p package
+	bool setEncoding(std::string const & encoding, int const & package);
+	/// change the encoding of the input stream to \p encoding (iconv name)
+	bool setEncoding(std::string const & encoding);
 	/// get the current iconv encoding of the input stream
 	std::string getEncoding() const { return encoding_iconv_; }

@ -288,9 +299,12 @@ public:
 	Token const prev_token() const;
 	/// The current token.
 	Token const curr_token() const;
-	/// The next token.
+	/// The next token. Caution: If this is called, an encoding change is
+	/// only possible again after get_token() has been called.
 	Token const next_token();
-	/// The next but one token.
+	/// The next but one token. Caution: If this is called, an encoding
+	/// change is only possible again after get_token() has been called
+	/// twice.
 	Token const next_next_token();
 	/// Make the next token current and return that.
 	Token const get_token();
@ -301,7 +315,9 @@ public:
 	bool skip_spaces(bool skip_comments = false);
 	/// puts back spaces (and comments if \p skip_comments is true)
 	void unskip_spaces(bool skip_comments = false);
-	///
+	/// Is any further input pending? This is not like
+	/// std::istream::good(), which still returns true if all available
+	/// input was read and only the next attempt to read would return EOF.
 	bool good();
 	/// resets the parser to initial state
 	void reset();