Make binary file detection more robust.

The magic library can detect the charset used by a file. While this
detection is not foolproof, the library does seem to be infallible
as regards the binary nature of a file. So, use libmagic for the detection
and fall back to the previous method if the library is not installed or
its database cannot be loaded.
This commit is contained in:
Enrico Forestieri 2014-06-11 23:04:39 +02:00
parent 82faa66192
commit f439609304
3 changed files with 62 additions and 14 deletions

View File

@ -241,17 +241,13 @@ string guessFormatFromContents(FileName const & fn)
int const max_count = 50;
int count = 0;
// Maximum number of binary chars allowed for latex detection
int const max_bin = 5;
string str;
string format;
bool firstLine = true;
bool backslash = false;
bool maybelatex = false;
int binchars = 0;
int dollars = 0;
while ((count++ < max_count) && format.empty() && binchars <= max_bin) {
while ((count++ < max_count) && format.empty() && !maybelatex) {
if (ifs.eof())
break;
@ -378,17 +374,9 @@ string guessFormatFromContents(FileName const & fn)
// inline equation
maybelatex = true;
}
// Note that this is formally not correct, since count_bin_chars
// expects utf8, and str can be anything: plain text in any
// encoding, or really binary data. In practice it works, since
// QString::fromUtf8() drops invalid utf8 sequences, and while
// the exact number may not be correct, we still get a high
// number for truly binary files.
binchars += count_bin_chars(str);
}
if (format.empty() && binchars <= max_bin && maybelatex)
if (format.empty() && maybelatex && !isBinaryFile(fn))
format = "latex";
if (format.empty()) {

View File

@ -43,6 +43,9 @@
#include "support/regex.h"
#include <fcntl.h>
#ifdef HAVE_MAGIC_H
#include <magic.h>
#endif
#include <cerrno>
#include <cstdlib>
@ -91,6 +94,60 @@ bool isValidDVIFileName(string const & filename)
}
/// Check whether the file has binary contents.
/// When built with libmagic (HAVE_MAGIC_H), the file's MIME encoding is
/// queried and the file is reported as binary if that encoding contains
/// "binary". If libmagic is unavailable, its database cannot be loaded,
/// or the query itself fails, a heuristic is used instead: binary chars
/// are counted in the first lines of the file.
/// \param filename the file to examine.
/// \return true if the file looks binary; false otherwise, and also when
/// \p filename is empty, does not exist, or cannot be opened.
bool isBinaryFile(FileName const & filename)
{
	bool isbinary = false;
	if (filename.empty() || !filename.exists())
		return isbinary;

#ifdef HAVE_MAGIC_H
	magic_t magic_cookie = magic_open(MAGIC_MIME_ENCODING);
	if (magic_cookie) {
		bool detected = true;
		if (magic_load(magic_cookie, NULL) != 0) {
			LYXERR(Debug::FILES, "isBinaryFile: "
				"Could not load magic database - "
				<< magic_error(magic_cookie));
			detected = false;
		} else {
			char const * charset = magic_file(magic_cookie,
				filename.toFilesystemEncoding().c_str());
			if (charset) {
				isbinary = contains(charset, "binary");
			} else {
				// magic_file() returns NULL on failure; never hand
				// that pointer to contains(). Use the heuristic below.
				char const * err = magic_error(magic_cookie);
				LYXERR(Debug::FILES, "isBinaryFile: "
					"Could not detect the file encoding - "
					<< (err ? err : "unknown error"));
				detected = false;
			}
		}
		// The cookie owns the strings returned above, so close it only
		// after they have been consumed.
		magic_close(magic_cookie);
		if (detected)
			return isbinary;
	}
#endif
	// Try by looking for binary chars at the beginning of the file.
	// Note that this is formally not correct, since count_bin_chars
	// expects utf8, and the passed string can be anything: plain text
	// in any encoding, or really binary data. In practice it works,
	// since QString::fromUtf8() drops invalid utf8 sequences, and
	// while the exact number may not be correct, we still get a high
	// number for truly binary files.
	ifstream ifs(filename.toFilesystemEncoding().c_str());
	if (!ifs)
		return isbinary;

	// Maximum strings to read
	int const max_count = 50;
	// Maximum number of binary chars allowed
	int const max_bin = 5;

	int count = 0;
	int binchars = 0;
	string str;
	// Using getline() as the condition stops at end of file and also on
	// a read error, so a stale buffer is never counted again.
	while (count++ < max_count && getline(ifs, str))
		binchars += count_bin_chars(str);
	return binchars > max_bin;
}
string const latex_path(string const & original_path,
latex_path_extension extension,
latex_path_dots dots)

View File

@ -78,6 +78,9 @@ bool isValidLaTeXFileName(std::string const & filename);
*/
bool isValidDVIFileName(std::string const & filename);
/// Check whether the file has binary contents.
/// Returns false when \p filename is empty or the file does not exist.
bool isBinaryFile(FileName const & filename);
/** Returns the path of a library data file.
Search the file name.ext in the subdirectory dir of
-# user_lyxdir