use crate::prelude::pijul;
/// Coarse classification of a file's content, as produced by [`detect`].
///
/// Only [`Encoding::Text`] carries a text encoding; every other variant marks
/// content that `detect` treats as binary.
#[derive(Debug, Clone)]
pub enum Encoding {
/// Text content, together with the pijul encoding chosen for it.
Text(pijul::Encoding),
/// Content whose sniffed MIME type starts with `image/`.
Image,
/// Content whose sniffed MIME type starts with `audio/`.
Audio,
/// Content whose sniffed MIME type starts with `video/`.
Video,
/// Content whose sniffed MIME type starts with `font/`.
Font,
/// Any other binary content: PDFs, archives, executables, unidentified
/// data, or input for which no text encoding could be determined.
Other,
}
/// Classifies the content of `data` as text or as a category of binary data.
///
/// The checks run in this order, and the first one that matches wins:
///
/// 1. Input that is valid UTF-8 (including empty input) is
///    [`Encoding::Text`] with the UTF-8 encoding.
/// 2. The MIME type sniffed from the first 1,000 bytes maps known binary
///    formats to [`Encoding::Image`], [`Encoding::Audio`],
///    [`Encoding::Video`], [`Encoding::Font`] or [`Encoding::Other`].
/// 3. Input with at most two invalid or NUL characters among its first
///    50,000 decoded characters is still treated as UTF-8 text.
/// 4. Otherwise pijul's own encoding detection decides; when it does not
///    yield an encoding, the result is [`Encoding::Other`].
///
/// Binary data is reported through the non-text variants; this function
/// always returns an [`Encoding`], never an `Option`.
// Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
// TODO: improve encoding detection in pijul, then remove "application/octet-stream" MIME case as it tends to misclassify utf16 (see <https://github.com/Wilfred/difftastic/pull/473#discussion_r1093853923>)
pub fn detect(data: &[u8]) -> Encoding {
    // Number of leading bytes handed to MIME sniffing.
    const MAGIC_PREFIX_LEN: usize = 1000;
    // Number of leading decoded characters checked for invalid UTF-8.
    const LOSSY_CHAR_LIMIT: usize = 50_000;
    // Every character produced by lossy decoding (a valid scalar value or one
    // replacement character per invalid sequence) consumes between one and
    // four input bytes, so this many bytes always cover the first
    // `LOSSY_CHAR_LIMIT` characters.
    const LOSSY_BYTE_LIMIT: usize = LOSSY_CHAR_LIMIT * 4;
    // Number of invalid or NUL characters tolerated in "almost UTF-8" input.
    const MAX_INVALID_CHARS: usize = 2;

    if std::str::from_utf8(data).is_ok() {
        return Encoding::Text(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let magic_bytes = &data[..data.len().min(MAGIC_PREFIX_LEN)];
    let mime = tree_magic_mini::from_u8(magic_bytes);

    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
        // Treat pdf as binary.
        "application/pdf" => return Encoding::Other,

        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
        //
        // See
        // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
        // for a list of MIME types.
        "application/x-archive"
        | "application/x-bzip"
        | "application/x-bzip2"
        | "application/x-7zip-compressed"
        | "application/gzip"
        | "application/zip"
        | "application/zstd"
        | "application/octet-stream"
        | "application/x-executable" => return Encoding::Other,

        // Treat all image content as binary.
        v if v.starts_with("image/") => return Encoding::Image,

        // Treat all audio content as binary.
        v if v.starts_with("audio/") => return Encoding::Audio,

        // Treat all video content as binary.
        v if v.starts_with("video/") => return Encoding::Video,

        // Treat all font content as binary.
        v if v.starts_with("font/") => return Encoding::Font,

        _ => {}
    }

    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    //
    // Only the bytes that can contribute to the inspected characters are
    // decoded, and the decoded text is iterated in place rather than copied.
    // A multi-byte sequence cut off by the byte limit can only show up after
    // the last inspected character, so the count is the same as for a decode
    // of the whole input.
    let lossy_prefix = String::from_utf8_lossy(&data[..data.len().min(LOSSY_BYTE_LIMIT)]);
    let num_utf8_invalid = lossy_prefix
        .chars()
        .take(LOSSY_CHAR_LIMIT)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
    if num_utf8_invalid <= MAX_INVALID_CHARS {
        return Encoding::Text(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Fallback to pijul encoding detection
    pijul::change::get_encoding(data)
        .map(Encoding::Text)
        .unwrap_or(Encoding::Other)
}