From 7d31194085d8665fd10cc2e8fe4a6330b3cd338f Mon Sep 17 00:00:00 2001
From: Enrico Forestieri <forenr@lyx.org>
Date: Thu, 5 Jun 2014 17:33:25 +0200
Subject: [PATCH] Fix bug #9146.

If LyX does not know about a given file format, the file may easily
be misrecognized as "latex", which causes bug #9146. This patch limits
the check for a latex format to non-binary files. The strategy for
deciding that a file has binary content is the same as that adopted
by the "less" program.
---
 src/Format.cpp           | 17 +++++++++++++----
 src/support/lstrings.cpp | 25 +++++++++++++++++++++++++
 src/support/lstrings.h   |  8 ++++++++
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/Format.cpp b/src/Format.cpp
index e2d23b5c65..ff7fb34613 100644
--- a/src/Format.cpp
+++ b/src/Format.cpp
@@ -241,12 +241,17 @@ string guessFormatFromContents(FileName const & fn)
 	int const max_count = 50;
 	int count = 0;
 
+	// Maximum number of binary chars allowed for latex detection
+	int const max_bin = 5;
+
 	string str;
 	string format;
 	bool firstLine = true;
 	bool backslash = false;
+	bool maybelatex = false;
+	int binchars = 0;
 	int dollars = 0;
-	while ((count++ < max_count) && format.empty()) {
+	while ((count++ < max_count) && format.empty() && binchars <= max_bin) {
 		if (ifs.eof())
 			break;
 
@@ -364,16 +369,20 @@ string guessFormatFromContents(FileName const & fn)
 		         contains(str, "$$") ||
 		         contains(str, "\\[") ||
 		         contains(str, "\\]"))
-			format = "latex";
+			maybelatex = true;
 		else {
 			if (contains(str, '\\'))
 				backslash = true;
 			dollars += count_char(str, '$');
+			if (backslash && dollars > 1)
+				// inline equation
+				maybelatex = true;
 		}
+
+		binchars += count_bin_chars(str);
 	}
 
-	if (format.empty() && backslash && dollars > 1)
-		// inline equation
+	if (format.empty() && binchars <= max_bin && maybelatex)
 		format = "latex";
 
 	if (format.empty()) {
diff --git a/src/support/lstrings.cpp b/src/support/lstrings.cpp
index a43feeba13..8508e4ef13 100644
--- a/src/support/lstrings.cpp
+++ b/src/support/lstrings.cpp
@@ -943,6 +943,31 @@ int count_char(docstring const & str, docstring::value_type chr)
 }
 
 
+int count_bin_chars(string const & str)  // returns how many chars of str fall into a "binary" Unicode category
+{
+	QString const qstr = toqstr(str).simplified();  // simplified() trims and collapses whitespace runs, so '\t', '\n', '\r' etc. are never counted
+	int count = 0;
+	QString::const_iterator cit = qstr.begin();
+	QString::const_iterator end = qstr.end();
+	for (; cit != end; ++cit)  {
+		switch (cit->category()) {
+		case QChar::Separator_Line:       // Zl
+		case QChar::Separator_Paragraph:  // Zp
+		case QChar::Other_Control:        // Cc
+		case QChar::Other_Format:         // Cf
+		case QChar::Other_Surrogate:      // Cs -- NOTE(review): QString is UTF-16, so a valid non-BMP char presumably counts twice here; confirm
+		case QChar::Other_PrivateUse:     // Co
+		case QChar::Other_NotAssigned:    // Cn
+			++count;
+			break;
+		default:  // every other category is treated as ordinary text
+			break;
+		}
+	}
+	return count;
+}
+
+
 docstring const trim(docstring const & a, char const * p)
 {
 	LASSERT(p, return a);
diff --git a/src/support/lstrings.h b/src/support/lstrings.h
index 02b5cf2f2b..0d21e954a4 100644
--- a/src/support/lstrings.h
+++ b/src/support/lstrings.h
@@ -199,6 +199,14 @@ int count_char(std::string const & str, char chr);
 /// Count all occurences of char \a chr inside \a str
 int count_char(docstring const & str, docstring::value_type chr);
 
+/** Count all occurrences of binary chars inside \a str.
+    It is assumed that \a str is UTF-8 encoded and that a binary char
+    belongs to the Unicode general categories Zl, Zp, Cc, Cf, Cs, Co, or Cn
+    (excluding white space characters such as '\t', '\n', '\v', '\f', '\r').
+    See http://www.unicode.org/Public/6.2.0/ucd/UnicodeData.txt
+*/
+int count_bin_chars(std::string const & str);
+
 /** Trims characters off the end and beginning of a string.
     \code
     trim("ccabccc", "c") == "ab".