improve diffs encoding detection

tzemanovic
Jan 27, 2026, 7:30 PM
F6O6FGOJ762C5CFA4R5K4BTUHTERHT2AOWGQS4S3QYUI7QU6N4IQC

Dependencies

  • [2] 6YZAVBWU Initial commit
  • [3] YBJRDOTC make all repo actions async
  • [4] A5YBC77V record!
  • [5] V55EAIWQ add src file LRU cache
  • [6] HOJZI52Y rename flowers_ui to inflorescence
  • [7] MJDGPSHG WIP contents diff
  • [8] QMAUTRB6 refactor diff
  • [9] WI2BVQ6J rm client lib crate
  • [10] NWJD6VM6 mv libflowers libflorescence
  • [11] CALXOZXA flatten crates dir
  • [12] BFN2VHZS refactor file stuff into sub-mod
  • [13] 3SYSJKYL add app icon
  • [14] 23SFYK4Q big view refactor into a new crate
  • [15] OPXFZKEB view tests setup
  • [16] XSZZB47U refactor stuff into lib
  • [17] 3BK22XE5 add a test for hover btn and more refactors
  • [18] VCNKFNUF app init test
  • [19] 6F7Q4ZLR avoid unused warns
  • [20] I56UGW7U make record test, fix log update
  • [21] DXAYDIMQ update to latest pijul
  • [22] PTWZYQFR use nav-scrollable for repo status
  • [23] 7MJOO4E2 task wrappers tooling workaround
  • [24] WH57EHNM update tests
  • [25] IFQPVMBD error handling for repo actions
  • [26] K2SQTVJD handle moved dirs
  • [27] UPWS6J3B filter to-record changes from selection
  • [28] FU6P5QLG indicate when a file is a dir with appended '/'
  • [29] YGZ3VCW4 add push
  • [30] 6LF2U2Y6 improve file encoding detection
  • [31] G5WLRXOD add screenshots for test
  • [32] TJHMERBN tiny fixes
  • [33] OLT666N4 fix screenshot test to include status, fix failed test report
  • [34] UF5NJKAS test load repo
  • [35] SWWE2R6M display basic repo stuff
  • [36] 2SLTGWP6 add change files diffs to-record selection
  • [37] W4LFX7IH group diffs by file name
  • [38] UB2ITZJS refresh changed files on FS changes
  • [39] KWTBNTO3 diffs selection and scrolling
  • [40] UR4J677R nav for log changes and refactors
  • [41] JZXYSIYD channel selection!
  • [42] SASAN2XC use nav-scrollable
  • [43] RPCIGCNS add replacement diff details
  • [44] HPSOAD4R fix moved tracked file view
  • [45] EC3TVL4X add untracked files
  • [46] 4WO3ZJM2 show untracked files' contents
  • [47] L6KSEFQI move cursor related stuff into its module

Change contents

  • edit in libflorescence/src/repo.rs at line 12
    [4.47][9.39:62]()
    use crate::prelude::*;
  • edit in libflorescence/src/repo.rs at line 13
    [28.1310]
    [9.62]
    use crate::{encoding, prelude::*};
  • replacement in libflorescence/src/repo.rs at line 960
    [27.7664][27.7664:7686]()
    encoding,
    [27.7664]
    [27.7686]
    encoding: _,
  • replacement in libflorescence/src/repo.rs at line 968
    [25.8128][27.7860:7926]()
    Some(try_decode_contents(raw_contents, encoding))
    [25.8128]
    [27.7926]
    let encoding = encoding::detect(&raw_contents);
    Some(try_decode_contents(raw_contents, &encoding))
  • replacement in libflorescence/src/repo.rs at line 998
    [27.8750][27.8750:8772]()
    encoding,
    [27.8750]
    [27.8772]
    encoding: _,
  • replacement in libflorescence/src/repo.rs at line 1012
    [28.2348][27.9279:9351](),[27.9279][27.9279:9351]()
    let contents = try_decode_contents(raw_contents, encoding);
    [28.2348]
    [27.9351]
    let encoding = encoding::detect(&raw_contents);
    let contents = try_decode_contents(raw_contents, &encoding);
  • replacement in libflorescence/src/repo.rs at line 1026
    [27.9632][27.9632:9654]()
    encoding,
    [27.9632]
    [27.9654]
    encoding: _e,
  • edit in libflorescence/src/repo.rs at line 1029
    [27.9704]
    [27.9704]
  • edit in libflorescence/src/repo.rs at line 1032
    [28.2426]
    [27.9818]
    let encoding = encoding::detect(&raw_change_contents);
    let change_contents =
    try_decode_contents(raw_change_contents, &encoding);
  • edit in libflorescence/src/repo.rs at line 1036
    [27.9819][27.9819:9921]()
    let change_contents =
    try_decode_contents(raw_change_contents, encoding);
  • edit in libflorescence/src/repo.rs at line 1041
    [28.2599]
    [27.10045]
    let encoding = encoding::detect(&raw_replacement_contents);
  • replacement in libflorescence/src/repo.rs at line 1043
    [27.10084][27.10084:10157]()
    try_decode_contents(raw_replacement_contents, encoding);
    [27.10084]
    [27.10157]
    try_decode_contents(raw_replacement_contents, &encoding);
  • edit in libflorescence/src/lib.rs at line 2
    [16.37]
    [16.37]
    pub mod encoding;
  • file addition: encoding.rs (----------)
    [11.31]
    use crate::prelude::pijul;
    /// Detects the text encoding of `data`.
    ///
    /// Returns UTF-8 when `data` is valid UTF-8, or when it is *almost* valid
    /// UTF-8 (only a couple of invalid bytes or NULs near the start of the
    /// lossily decoded text). Otherwise the result of pijul's own encoding
    /// detection is returned.
    ///
    /// Binary data returns `None`.
    // Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
    pub fn detect(data: &[u8]) -> Option<pijul::Encoding> {
        /// Number of leading bytes handed to the MIME sniffer.
        const MAGIC_BYTES_LIMIT: usize = 1000;
        /// Number of leading decoded chars inspected for invalid sequences.
        const LOSSY_CHARS_LIMIT: usize = 50000;
        /// Maximum number of invalid chars still tolerated as UTF-8.
        const MAX_INVALID_CHARS: usize = 2;

        if std::str::from_utf8(data).is_ok() {
            return Some(pijul::Encoding(encoding_rs::UTF_8));
        }

        // Only consider the first 1,000 bytes, as tree_magic_mini
        // considers the entire file, which is very slow on large files.
        let magic_bytes = &data[..data.len().min(MAGIC_BYTES_LIMIT)];
        let mime = tree_magic_mini::from_u8(magic_bytes);

        // Use MIME type detection to guess whether a file is binary. This
        // has false positives and false negatives, so only check the MIME
        // type after allowing perfect text files (see issue #433).
        match mime {
            // Treat pdf as binary.
            "application/pdf" => return None,
            // application/* is a mix of stuff, application/json is fine
            // but application/zip is binary that often decodes as valid
            // UTF-16.
            //
            // See
            // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
            // for a list of MIME types.
            "application/x-archive"
            | "application/x-bzip"
            | "application/x-bzip2"
            | "application/x-7zip-compressed"
            | "application/gzip"
            | "application/zip"
            | "application/zstd"
            | "application/octet-stream" => return None,
            // Treat all image content as binary.
            v if v.starts_with("image/") => return None,
            // Treat all audio content as binary.
            v if v.starts_with("audio/") => return None,
            // Treat all video content as binary.
            v if v.starts_with("video/") => return None,
            // Treat all font content as binary.
            v if v.starts_with("font/") => return None,
            _ => {}
        }

        // If the input bytes are *almost* valid UTF-8, treat them as
        // UTF-8. This is helpful when the user has written a small number
        // of bad bytes to a file. Users would still like to be able to
        // diff these files.
        //
        // The lossy decode result is iterated directly; copying it into a
        // fresh `String` first is not needed just to count characters.
        let lossy = String::from_utf8_lossy(data);
        let num_utf8_invalid = lossy
            .chars()
            .take(LOSSY_CHARS_LIMIT)
            .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
            .count();
        if num_utf8_invalid <= MAX_INVALID_CHARS {
            return Some(pijul::Encoding(encoding_rs::UTF_8));
        }

        // Fallback to pijul encoding detection
        pijul::change::get_encoding(data)
    }
  • edit in libflorescence/Cargo.toml at line 38
    [2.2685]
    [21.1119]
    [dependencies.encoding_rs]
    workspace = true
  • edit in libflorescence/Cargo.toml at line 72
    [3.4655]
    [18.2236]
    workspace = true
    [dependencies.tree_magic_mini]
  • edit in inflorescence/src/file.rs at line 3
    [14.29020]
    [17.1663]
    use libflorescence::encoding;
  • replacement in inflorescence/src/file.rs at line 198
    [26.369][30.124:171]()
    let encoding = detect_encoding(&data);
    [26.369]
    [12.6247]
    let encoding = encoding::detect(&data);
  • edit in inflorescence/src/file.rs at line 207
    [12.6400][12.6400:6408](),[12.6408][30.172:403](),[30.403][32.343:386](),[32.386][30.450:818](),[30.450][30.450:818](),[30.818][12.6408:6409](),[12.6408][12.6408:6409](),[12.6409][30.819:2163]()
    }
    }
    // Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
    /// Binary data returns `None`
    fn detect_encoding(data: &[u8]) -> Option<pijul::Encoding> {
    if std::str::from_utf8(data).is_ok() {
    return Some(pijul::Encoding(encoding_rs::UTF_8));
    }
    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let mut magic_bytes = data;
    if magic_bytes.len() > 1000 {
    magic_bytes = &magic_bytes[..1000];
    }
    let mime = tree_magic_mini::from_u8(magic_bytes);
    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
    // Treat pdf as binary.
    "application/pdf" => return None,
    // application/* is a mix of stuff, application/json is fine
    // but application/zip is binary that often decodes as valid
    // UTF-16.
    //
    // See
    // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
    // for a list of MIME types.
    "application/x-archive" => return None,
    "application/x-bzip" => return None,
    "application/x-bzip2" => return None,
    "application/x-7zip-compressed" => return None,
    "application/gzip" => return None,
    "application/zip" => return None,
    "application/zstd" => return None,
    // Treat all image content as binary.
    v if v.starts_with("image/") => return None,
    // Treat all audio content as binary.
    v if v.starts_with("audio/") => return None,
    // Treat all video content as binary.
    v if v.starts_with("video/") => return None,
    // Treat all font content as binary.
    v if v.starts_with("font/") => return None,
    _ => {}
  • edit in inflorescence/src/file.rs at line 208
    [30.2169][30.2169:2774](),[30.2774][32.387:425]()
    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    let utf8_string = String::from_utf8_lossy(data).to_string();
    let num_utf8_invalid = utf8_string
    .chars()
    .take(50000)
    .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
    .count();
    if num_utf8_invalid <= 2 {
    return Some(pijul::Encoding(encoding_rs::UTF_8));
    }
    // Fallback to pijul encoding detection
    pijul::change::get_encoding(data)
  • edit in inflorescence/src/diff.rs at line 13
    [10.579]
    [7.5153]
    use tracing::error;
  • replacement in inflorescence/src/diff.rs at line 108
    [8.7376][8.7376:7487]()
    (repo::Contents::UnknownEncoding(_change), repo::Contents::UnknownEncoding(_replacement)) => {
    [8.7376]
    [8.7487]
    (_, repo::Contents::UnknownEncoding(_)) |
    (repo::Contents::UnknownEncoding(_), _) => {
  • replacement in inflorescence/src/diff.rs at line 117
    [8.7821][8.7821:7912]()
    unimplemented!("The change and replacement have different encoding!");
    [8.7821]
    [8.7912]
    error!("The change and replacement have different encoding! Change: {change_contents:?}, replacement: {replacement_contents:?}");
    without.push(DiffWithoutContents::Replacement {
    line: *line,
    change_contents: UndecodableContents::UnknownEncoding,
    replacement_contents: UndecodableContents::UnknownEncoding,
    });
  • edit in inflorescence/Cargo.toml at line 28
    [5.6912][30.2819:2863]()
    workspace = true
    [dependencies.encoding_rs]
  • edit in inflorescence/Cargo.toml at line 42
    [3.18417][30.2864:2912]()
    workspace = true
    [dependencies.tree_magic_mini]
  • edit in Cargo.lock at line 2581
    [20.10206][30.3238:3254]()
    "encoding_rs",
  • edit in Cargo.lock at line 2594
    [6.534][30.3255:3275]()
    "tree_magic_mini",
  • edit in Cargo.lock at line 2897
    [2.44952]
    [21.3965]
    "encoding_rs",
  • edit in Cargo.lock at line 2917
    [3.18816]
    [13.5953]
    "tree_magic_mini",