improve file encoding detection

tzemanovic
Jan 26, 2026, 8:32 PM
6LF2U2Y6QOQ7BREI6L26RXMQNKNEBOW2YUUHEIYELDP2C7PJG5XAC

Dependencies

  • [2] 6YZAVBWU Initial commit
  • [3] UB2ITZJS refresh changed files on FS changes
  • [4] KT5UYXGK fix selection after adding file, add changed file diffs
  • [5] YBJRDOTC make all repo actions async
  • [6] V55EAIWQ add src file LRU cache
  • [7] HOJZI52Y rename flowers_ui to inflorescence
  • [8] BFN2VHZS refactor file stuff into sub-mod
  • [9] 6F7Q4ZLR avoid unused warns
  • [10] I56UGW7U make record test, fix log update
  • [11] UR4J677R nav for log changes and refactors
  • [12] 7MJOO4E2 task wrappers tooling workaround
  • [13] K2SQTVJD handle moved dirs
  • [14] YGZ3VCW4 add push
  • [15] JZXYSIYD channel selection!
  • [16] FU6P5QLG indicate when a file is a dir with appended '/'

Change contents

  • replacement in inflorescence/src/file.rs at line 197
    [13.369][8.6188:6247](),[8.6188][8.6188:6247]()
    let encoding = pijul::change::get_encoding(&data);
    [13.369]
    [8.6247]
    let encoding = detect_encoding(&data);
  • edit in inflorescence/src/file.rs at line 208
    [8.6408]
    [8.6408]
    // Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
    /// Guess the text encoding of `data`, returning `None` when it looks binary.
    ///
    /// The checks run in this order:
    /// 1. Input that is entirely valid UTF-8 is reported as UTF-8.
    /// 2. The MIME type sniffed from the first 1,000 bytes is compared against
    ///    a list of known binary types; a match returns `None`.
    /// 3. Input with at most two invalid sequences or NUL characters among its
    ///    first 50,000 lossily decoded characters is still reported as UTF-8.
    /// 4. Anything else is handed to `pijul::change::get_encoding`.
    fn detect_encoding(data: &[u8]) -> Option<pijul::Encoding> {
    // Fast path: perfectly valid UTF-8 (including empty input) is text.
    // NOTE(review): NUL bytes are valid UTF-8, so input made only of ASCII
    // and NULs returns here and never reaches the NUL count further down.
    if let Ok(_) = std::str::from_utf8(data) {
    return Some(pijul::Encoding(encoding_rs::UTF_8));
    }
    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let mut magic_bytes = data;
    if magic_bytes.len() > 1000 {
    magic_bytes = &magic_bytes[..1000];
    }
    let mime = tree_magic_mini::from_u8(magic_bytes);
  • edit in inflorescence/src/file.rs at line 225
    [8.6409]
    [11.18647]
    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    // NOTE(review): the issue number presumably refers to difftastic's
    // tracker, since this comment is borrowed from there - confirm.
    match mime {
    // Treat pdf as binary.
    "application/pdf" => return None,
    // application/* is a mix of stuff, application/json is fine
    // but application/zip is binary that often decodes as valid
    // UTF-16.
    //
    // See
    // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
    // for a list of MIME types.
    "application/x-archive" => return None,
    "application/x-bzip" => return None,
    "application/x-bzip2" => return None,
    // NOTE(review): the MDN list above spells the 7-zip type
    // `application/x-7z-compressed`; confirm that tree_magic_mini can
    // actually return the string below, otherwise this arm never matches.
    "application/x-7zip-compressed" => return None,
    "application/gzip" => return None,
    "application/zip" => return None,
    "application/zstd" => return None,
    // Treat all image content as binary.
    v if v.starts_with("image/") => return None,
    // Treat all audio content as binary.
    v if v.starts_with("audio/") => return None,
    // Treat all video content as binary.
    v if v.starts_with("video/") => return None,
    // Treat all font content as binary.
    v if v.starts_with("font/") => return None,
    // Every other MIME type falls through to the content-based checks.
    _ => {}
    }
    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    //
    // Invalid sequences are decoded to U+FFFD, so counting that character
    // counts the bad sequences; NUL characters are counted the same way.
    // NOTE(review): the whole input is decoded and then copied by
    // `to_string()` before `take(50000)` limits how many chars are
    // inspected; the copy looks avoidable.
    let utf8_string = String::from_utf8_lossy(data).to_string();
    let num_utf8_invalid = utf8_string
    .chars()
    .take(50000)
    .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
    .count();
    // Tolerate up to two bad or NUL characters in the inspected prefix.
    if num_utf8_invalid <= 2 {
    return Some(pijul::Encoding(encoding_rs::UTF_8));
    }
    // Fall back to pijul's own encoding detection.
    // NOTE(review): `data` is already `&[u8]`, so `&data` is `&&[u8]`; the
    // extra borrow appears unnecessary.
    pijul::change::get_encoding(&data)
    }
  • edit in inflorescence/Cargo.toml at line 28
    [6.6912]
    [6.6912]
    workspace = true
    [dependencies.encoding_rs]
  • edit in inflorescence/Cargo.toml at line 44
    [5.18417]
    [5.18417]
    workspace = true
    [dependencies.tree_magic_mini]
  • edit in Cargo.toml at line 44
    [4.7838]
    [2.4227]
    [workspace.dependencies.encoding_rs]
    version = "0.8"
  • edit in Cargo.toml at line 135
    [5.18594]
    [5.18594]
    [workspace.dependencies.tree_magic_mini]
    version = "3"
  • edit in Cargo.lock at line 1539
    [3.3054]
    [3.3054]
    [[package]]
    name = "fixedbitset"
    version = "0.5.7"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
  • edit in Cargo.lock at line 2581
    [10.10206]
    [7.400]
    "encoding_rs",
  • edit in Cargo.lock at line 2595
    [7.534]
    [7.534]
    "tree_magic_mini",
  • edit in Cargo.lock at line 4128
    [2.59667]
    [2.59904]
    [[package]]
    name = "petgraph"
    version = "0.8.3"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
    dependencies = [
    "fixedbitset",
    "hashbrown 0.15.4",
    "indexmap",
    ]
  • edit in Cargo.lock at line 5979
    [2.80023]
    [14.13643]
    name = "tree_magic_mini"
    version = "3.2.2"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "b8765b90061cba6c22b5831f675da109ae5561588290f9fa2317adab2714d5a6"
    dependencies = [
    "memchr",
    "nom 8.0.0",
    "petgraph",
    ]
    [[package]]