tzemanovic/inflorescence: libflorescence/src/encoding.rs

use crate::prelude::pijul;

/// Coarse classification of a file's content, as produced by [`detect`].
#[derive(Debug, Clone)]
pub enum Encoding {
    /// Textual content, together with its detected text encoding.
    Text(pijul::Encoding),
    /// Content whose sniffed MIME type starts with `image/`.
    Image,
    /// Content whose sniffed MIME type starts with `audio/`.
    Audio,
    /// Content whose sniffed MIME type starts with `video/`.
    Video,
    /// Content whose sniffed MIME type starts with `font/`.
    Font,
    /// Any other content that is not treated as text (e.g. PDFs, archives,
    /// executables, or data for which no text encoding could be detected).
    Other,
}

/// Classifies `data` as text (with its encoding) or as a kind of binary content.
///
/// The checks are applied in this order:
///
/// 1. Valid UTF-8 is always [`Encoding::Text`] with the UTF-8 encoding.
/// 2. Otherwise the MIME type is sniffed from the first 1,000 bytes. Image,
///    audio, video and font types map to [`Encoding::Image`],
///    [`Encoding::Audio`], [`Encoding::Video`] and [`Encoding::Font`]; PDFs,
///    archives, executables and `application/octet-stream` map to
///    [`Encoding::Other`].
/// 3. Input that is *almost* valid UTF-8 (at most two invalid sequences or NUL
///    characters among the first 50,000 decoded characters) is treated as
///    UTF-8 text.
/// 4. Finally pijul's own encoding detection is consulted; if it reports no
///    encoding the data is classified as [`Encoding::Other`].
///
/// Binary data is therefore never reported as [`Encoding::Text`] unless one of
/// the text heuristics above accepts it.
// Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
// TODO: improve encoding detection in pijul, then remove "application/octet-stream" MIME case as it tends to misclassify utf16 (see <https://github.com/Wilfred/difftastic/pull/473#discussion_r1093853923>)
pub fn detect(data: &[u8]) -> Encoding {
    /// Number of leading bytes handed to MIME sniffing.
    const MAGIC_PREFIX_LEN: usize = 1000;
    /// Number of decoded characters inspected by the "almost UTF-8" check.
    const MAX_LOSSY_CHARS: usize = 50_000;
    /// A decoded character consumes at most four input bytes.
    const MAX_LOSSY_BYTES: usize = MAX_LOSSY_CHARS * 4;
    /// Largest number of invalid/NUL characters still accepted as UTF-8 text.
    const MAX_INVALID_CHARS: usize = 2;

    if std::str::from_utf8(data).is_ok() {
        return Encoding::Text(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Only consider a short prefix, as tree_magic_mini otherwise
    // considers the entire file, which is very slow on large files.
    let magic_bytes = &data[..data.len().min(MAGIC_PREFIX_LEN)];
    let mime = tree_magic_mini::from_u8(magic_bytes);

    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
        // Treat pdf as binary.
        "application/pdf" => return Encoding::Other,
        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
        //
        // See
        // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
        // for a list of MIME types. Note that 7-Zip archives are
        // registered as `application/x-7z-compressed`.
        "application/x-archive"
        | "application/x-bzip"
        | "application/x-bzip2"
        | "application/x-7z-compressed"
        | "application/gzip"
        | "application/zip"
        | "application/zstd"
        | "application/octet-stream"
        | "application/x-executable" => return Encoding::Other,
        // Treat all image content as binary.
        v if v.starts_with("image/") => return Encoding::Image,
        // Treat all audio content as binary.
        v if v.starts_with("audio/") => return Encoding::Audio,
        // Treat all video content as binary.
        v if v.starts_with("video/") => return Encoding::Video,
        // Treat all font content as binary.
        v if v.starts_with("font/") => return Encoding::Font,
        _ => {}
    }

    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    //
    // Every character produced by lossy decoding consumes between one
    // and four input bytes, so the first `MAX_LOSSY_CHARS` characters
    // are fully determined by the first `MAX_LOSSY_BYTES` bytes.
    // Decoding only that prefix yields the same count without decoding
    // and copying an arbitrarily large file.
    let lossy_prefix = &data[..data.len().min(MAX_LOSSY_BYTES)];
    let num_utf8_invalid = String::from_utf8_lossy(lossy_prefix)
        .chars()
        .take(MAX_LOSSY_CHARS)
        .filter(|&c| c == std::char::REPLACEMENT_CHARACTER || c == '\0')
        .count();
    if num_utf8_invalid <= MAX_INVALID_CHARS {
        return Encoding::Text(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Fallback to pijul encoding detection; no detected encoding means
    // the data is not treated as text.
    pijul::change::get_encoding(data).map_or(Encoding::Other, Encoding::Text)
}