improve file encoding detection
Dependencies
- [2] 6YZAVBWU Initial commit
- [3] UB2ITZJS refresh changed files on FS changes
- [4] KT5UYXGK fix selection after adding file, add changed file diffs
- [5] YBJRDOTC make all repo actions async
- [6] V55EAIWQ add src file LRU cache
- [7] HOJZI52Y rename flowers_ui to inflorescence
- [8] BFN2VHZS refactor file stuff into sub-mod
- [9] 6F7Q4ZLR avoid unused warns
- [10] I56UGW7U make record test, fix log update
- [11] UR4J677R nav for log changes and refactors
- [12] 7MJOO4E2 task wrappers tooling workaround
- [13] K2SQTVJD handle moved dirs
- [14] YGZ3VCW4 add push
- [15] JZXYSIYD channel selection!
- [16] FU6P5QLG indicate when a file is a dir with appended '/'
Change contents
- replacement in inflorescence/src/file.rs at line 197
let encoding = pijul::change::get_encoding(&data);
let encoding = detect_encoding(&data);
- edit in inflorescence/src/file.rs at line 208
// Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
/// Binary data returns `None`
fn detect_encoding(data: &[u8]) -> Option<pijul::Encoding> {
    if let Ok(_) = std::str::from_utf8(data) {
        return Some(pijul::Encoding(encoding_rs::UTF_8));
    }
    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let mut magic_bytes = data;
    if magic_bytes.len() > 1000 {
        magic_bytes = &magic_bytes[..1000];
    }
    let mime = tree_magic_mini::from_u8(magic_bytes);
- edit in inflorescence/src/file.rs at line 225
    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
        // Treat pdf as binary.
        "application/pdf" => return None,
        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
        //
        // See
        // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
        // for a list of MIME types.
        "application/x-archive" => return None,
        "application/x-bzip" => return None,
        "application/x-bzip2" => return None,
        "application/x-7zip-compressed" => return None,
        "application/gzip" => return None,
        "application/zip" => return None,
        "application/zstd" => return None,
        // Treat all image content as binary.
        v if v.starts_with("image/") => return None,
        // Treat all audio content as binary.
        v if v.starts_with("audio/") => return None,
        // Treat all video content as binary.
        v if v.starts_with("video/") => return None,
        // Treat all font content as binary.
        v if v.starts_with("font/") => return None,
        _ => {}
    }
    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    let utf8_string = String::from_utf8_lossy(data).to_string();
    let num_utf8_invalid = utf8_string
        .chars()
        .take(50000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
    if num_utf8_invalid <= 2 {
        return Some(pijul::Encoding(encoding_rs::UTF_8));
    }
    // Fallback to pijul encoding detection
    pijul::change::get_encoding(&data)
}
- edit in inflorescence/Cargo.toml at line 28
workspace = true
[dependencies.encoding_rs]
- edit in inflorescence/Cargo.toml at line 44
workspace = true
[dependencies.tree_magic_mini]
- edit in Cargo.toml at line 44
[workspace.dependencies.encoding_rs]
version = "0.8"
- edit in Cargo.toml at line 135
[workspace.dependencies.tree_magic_mini]
version = "3"
- edit in Cargo.lock at line 1539
[[package]]
name = "fixedbitset"
version = "0.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
- edit in Cargo.lock at line 2581
"encoding_rs",
- edit in Cargo.lock at line 2595
"tree_magic_mini",
- edit in Cargo.lock at line 4128
[[package]]
name = "petgraph"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
dependencies = [
"fixedbitset",
"hashbrown 0.15.4",
"indexmap",
]
- edit in Cargo.lock at line 5979
name = "tree_magic_mini"
version = "3.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8765b90061cba6c22b5831f675da109ae5561588290f9fa2317adab2714d5a6"
dependencies = [
"memchr",
"nom 8.0.0",
"petgraph",
]
[[package]]