commit f439609304371eb3dd7ac238f39c470743261597 Author: Enrico Forestieri <for...@lyx.org> Date: Wed Jun 11 23:04:39 2014 +0200
Make binary file detection more robust. The magic library can detect the charset used by a file. While this detection is not foolproof, the library actually seems to be infallible as regards the binary nature of a file. So, use libmagic for the detection and fall back to the previous method if the library is not installed or its database cannot be loaded. diff --git a/src/Format.cpp b/src/Format.cpp index 44e3f63..063b683 100644 --- a/src/Format.cpp +++ b/src/Format.cpp @@ -241,17 +241,13 @@ string guessFormatFromContents(FileName const & fn) int const max_count = 50; int count = 0; - // Maximum number of binary chars allowed for latex detection - int const max_bin = 5; - string str; string format; bool firstLine = true; bool backslash = false; bool maybelatex = false; - int binchars = 0; int dollars = 0; - while ((count++ < max_count) && format.empty() && binchars <= max_bin) { + while ((count++ < max_count) && format.empty() && !maybelatex) { if (ifs.eof()) break; @@ -378,17 +374,9 @@ string guessFormatFromContents(FileName const & fn) // inline equation maybelatex = true; } - - // Note that this is formally not correct, since count_bin_chars - // expects utf8, and str can be anything: plain text in any - // encoding, or really binary data. In practice it works, since - // QString::fromUtf8() drops invalid utf8 sequences, and while - // the exact number may not be correct, we still get a high - // number for truly binary files. 
- binchars += count_bin_chars(str); } - if (format.empty() && binchars <= max_bin && maybelatex) + if (format.empty() && maybelatex && !isBinaryFile(fn)) format = "latex"; if (format.empty()) { diff --git a/src/support/filetools.cpp b/src/support/filetools.cpp index b9c7e7f..d167d6a 100644 --- a/src/support/filetools.cpp +++ b/src/support/filetools.cpp @@ -43,6 +43,9 @@ #include "support/regex.h" #include <fcntl.h> +#ifdef HAVE_MAGIC_H +#include <magic.h> +#endif #include <cerrno> #include <cstdlib> @@ -91,6 +94,60 @@ bool isValidDVIFileName(string const & filename) } +bool isBinaryFile(FileName const & filename) +{ + bool isbinary = false; + if (filename.empty() || !filename.exists()) + return isbinary; + +#ifdef HAVE_MAGIC_H + magic_t magic_cookie = magic_open(MAGIC_MIME_ENCODING); + if (magic_cookie) { + bool detected = true; + if (magic_load(magic_cookie, NULL) != 0) { + LYXERR(Debug::FILES, "isBinaryFile: " + "Could not load magic database - " + << magic_error(magic_cookie)); + detected = false; + } else { + char const *charset = magic_file(magic_cookie, + filename.toFilesystemEncoding().c_str()); + isbinary = contains(charset, "binary"); + } + magic_close(magic_cookie); + if (detected) + return isbinary; + } +#endif + // Try by looking for binary chars at the beginning of the file. + // Note that this is formally not correct, since count_bin_chars + // expects utf8, and the passed string can be anything: plain text + // in any encoding, or really binary data. In practice it works, + // since QString::fromUtf8() drops invalid utf8 sequences, and + // while the exact number may not be correct, we still get a high + // number for truly binary files. 
+ + ifstream ifs(filename.toFilesystemEncoding().c_str()); + if (!ifs) + return isbinary; + + // Maximum strings to read + int const max_count = 50; + + // Maximum number of binary chars allowed + int const max_bin = 5; + + int count = 0; + int binchars = 0; + string str; + while (count++ < max_count && !ifs.eof()) { + getline(ifs, str); + binchars += count_bin_chars(str); + } + return binchars > max_bin; +} + + string const latex_path(string const & original_path, latex_path_extension extension, latex_path_dots dots) diff --git a/src/support/filetools.h b/src/support/filetools.h index fbc14f8..9d91f33 100644 --- a/src/support/filetools.h +++ b/src/support/filetools.h @@ -78,6 +78,9 @@ bool isValidLaTeXFileName(std::string const & filename); */ bool isValidDVIFileName(std::string const & filename); +/// check whether the file has binary contents +bool isBinaryFile(FileName const & filename); + /** Returns the path of a library data file. Search the file name.ext in the subdirectory dir of -# user_lyxdir