// Use MIME type detection to guess whether a file is binary. This
// has false positives and false negatives, so only check the MIME
// type after allowing perfect text files (see issue #433).
match mime {
// Treat pdf as binary.
"application/pdf" => return None,
// application/* is a mixed bag: application/json is fine, but
// application/zip is binary that often decodes as valid
// UTF-16.
//
// See
// <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
// for a list of MIME types.
"application/x-archive" => return None,
"application/x-bzip" => return None,
"application/x-bzip2" => return None,
"application/x-7zip-compressed" => return None,
"application/gzip" => return None,
"application/zip" => return None,
"application/zstd" => return None,
// Treat all image content as binary.
v if v.starts_with("image/") => return None,
// Treat all audio content as binary.
v if v.starts_with("audio/") => return None,
// Treat all video content as binary.
v if v.starts_with("video/") => return None,
// Treat all font content as binary.
v if v.starts_with("font/") => return None,
_ => {}
}
// If the input bytes are *almost* valid UTF-8, treat them as
// UTF-8. This is helpful when the user has written a small number
// of bad bytes to a file. Users would still like to be able to
// diff these files.
let utf8_string = String::from_utf8_lossy(data).to_string();
let num_utf8_invalid = utf8_string
.chars()
.take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf8_invalid <= 2 {
return Some(pijul::Encoding(encoding_rs::UTF_8));
}
// Fall back to pijul encoding detection.
pijul::change::get_encoding(&data)
}