improve diffs encoding detection
Dependencies
- [2]
6YZAVBWUInitial commit - [3]
YBJRDOTCmake all repo actions async - [4]
A5YBC77Vrecord! - [5]
V55EAIWQadd src file LRU cache - [6]
HOJZI52Yrename flowers_ui to inflorescence - [7]
MJDGPSHGWIP contents diff - [8]
QMAUTRB6refactor diff - [9]
WI2BVQ6Jrm client lib crate - [10]
NWJD6VM6mv libflowers libflorescence - [11]
CALXOZXAflatten crates dir - [12]
BFN2VHZSrefactor file stuff into sub-mod - [13]
3SYSJKYLadd app icon - [14]
23SFYK4Qbig view refactor into a new crate - [15]
OPXFZKEBview tests setup - [16]
XSZZB47Urefactor stuff into lib - [17]
3BK22XE5add a test for hover btn and more refactors - [18]
VCNKFNUFapp init test - [19]
6F7Q4ZLRavoid unused warns - [20]
I56UGW7Umake record test, fix log update - [21]
DXAYDIMQupdate to latest pijul - [22]
PTWZYQFRuse nav-scrollable for repo status - [23]
7MJOO4E2task wrappers tooling workaround - [24]
WH57EHNMupdate tests - [25]
IFQPVMBDerror handling for repo actions - [26]
K2SQTVJDhandle moved dirs - [27]
UPWS6J3Bfilter to-record changes from selection - [28]
FU6P5QLGindicate when a file is a dir with appended '/' - [29]
YGZ3VCW4add push - [30]
6LF2U2Y6improve file encoding detection - [31]
G5WLRXODadd screenshots for test - [32]
TJHMERBNtiny fixes - [33]
OLT666N4fix screenshot test to include status, fix failed test report - [34]
UF5NJKAStest load repo - [35]
SWWE2R6Mdisplay basic repo stuff - [36]
2SLTGWP6add change files diffs to-record selection - [37]
W4LFX7IHgroup diffs by file name - [38]
UB2ITZJSrefresh changed files on FS changes - [39]
KWTBNTO3diffs selection and scrolling - [40]
UR4J677Rnav for log changes and refactors - [41]
JZXYSIYDchannel selection! - [42]
SASAN2XCuse nav-scrollable - [43]
RPCIGCNSadd replacement diff details - [44]
HPSOAD4Rfix moved tracked file view - [45]
EC3TVL4Xadd untracked files - [46]
4WO3ZJM2show untracked files' contents - [47]
L6KSEFQImove cursor related stuff into its module
Change contents
- edit in libflorescence/src/repo.rs at line 12
use crate::prelude::*; - edit in libflorescence/src/repo.rs at line 13
use crate::{encoding, prelude::*}; - replacement in libflorescence/src/repo.rs at line 960
encoding,encoding: _, - replacement in libflorescence/src/repo.rs at line 968
Some(try_decode_contents(raw_contents, encoding))let encoding = encoding::detect(&raw_contents);Some(try_decode_contents(raw_contents, &encoding)) - replacement in libflorescence/src/repo.rs at line 998
encoding,encoding: _, - replacement in libflorescence/src/repo.rs at line 1012
let contents = try_decode_contents(raw_contents, encoding);let encoding = encoding::detect(&raw_contents);let contents = try_decode_contents(raw_contents, &encoding); - replacement in libflorescence/src/repo.rs at line 1026
encoding,encoding: _e, - edit in libflorescence/src/repo.rs at line 1029
- edit in libflorescence/src/repo.rs at line 1032
let encoding = encoding::detect(&raw_change_contents);let change_contents =try_decode_contents(raw_change_contents, &encoding); - edit in libflorescence/src/repo.rs at line 1036
let change_contents =try_decode_contents(raw_change_contents, encoding); - edit in libflorescence/src/repo.rs at line 1041
let encoding = encoding::detect(&raw_replacement_contents); - replacement in libflorescence/src/repo.rs at line 1043
try_decode_contents(raw_replacement_contents, encoding);try_decode_contents(raw_replacement_contents, &encoding); - edit in libflorescence/src/lib.rs at line 2
pub mod encoding; - file addition: encoding.rs[11.31]
use crate::prelude::pijul;

/// Binary data returns `None`.
// Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
pub fn detect(data: &[u8]) -> Option<pijul::Encoding> {
    if std::str::from_utf8(data).is_ok() {
        return Some(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let mut magic_bytes = data;
    if magic_bytes.len() > 1000 {
        magic_bytes = &magic_bytes[..1000];
    }

    let mime = tree_magic_mini::from_u8(magic_bytes);
    dbg!(mime);

    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
        // Treat pdf as binary.
        "application/pdf" => return None,

        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
        //
        // See
        // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
        // for a list of MIME types.
        "application/x-archive"
        | "application/x-bzip"
        | "application/x-bzip2"
        | "application/x-7zip-compressed"
        | "application/gzip"
        | "application/zip"
        | "application/zstd"
        | "application/octet-stream" => return None,

        // Treat all image content as binary.
        v if v.starts_with("image/") => return None,

        // Treat all audio content as binary.
        v if v.starts_with("audio/") => return None,

        // Treat all video content as binary.
        v if v.starts_with("video/") => return None,

        // Treat all font content as binary.
        v if v.starts_with("font/") => return None,

        _ => {}
    }

    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    let utf8_string = String::from_utf8_lossy(data).to_string();
    let num_utf8_invalid = utf8_string
        .chars()
        .take(50000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
    if num_utf8_invalid <= 2 {
        return Some(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Fallback to pijul encoding detection
    pijul::change::get_encoding(data)
}
- edit in libflorescence/Cargo.toml at line 38
[dependencies.encoding_rs]workspace = true - edit in libflorescence/Cargo.toml at line 72
workspace = true[dependencies.tree_magic_mini] - edit in inflorescence/src/file.rs at line 3
use libflorescence::encoding; - replacement in inflorescence/src/file.rs at line 198
let encoding = detect_encoding(&data);let encoding = encoding::detect(&data); - edit in inflorescence/src/file.rs at line 207[12.6400]→[12.6400:6408](∅→∅),[12.6408]→[30.172:403](∅→∅),[30.403]→[32.343:386](∅→∅),[32.386]→[30.450:818](∅→∅),[30.450]→[30.450:818](∅→∅),[30.818]→[12.6408:6409](∅→∅),[12.6408]→[12.6408:6409](∅→∅),[12.6409]→[30.819:2163](∅→∅)
}
}

// Detection logic borrowed from <https://github.com/Wilfred/difftastic/blob/1436c8eac39dcea07f8c24a8128284a25b416e8d/src/files.rs#L141>.
/// Binary data returns `None`
fn detect_encoding(data: &[u8]) -> Option<pijul::Encoding> {
    if std::str::from_utf8(data).is_ok() {
        return Some(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let mut magic_bytes = data;
    if magic_bytes.len() > 1000 {
        magic_bytes = &magic_bytes[..1000];
    }

    let mime = tree_magic_mini::from_u8(magic_bytes);

    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
        // Treat pdf as binary.
        "application/pdf" => return None,

        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
        //
        // See
        // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
        // for a list of MIME types.
        "application/x-archive" => return None,
        "application/x-bzip" => return None,
        "application/x-bzip2" => return None,
        "application/x-7zip-compressed" => return None,
        "application/gzip" => return None,
        "application/zip" => return None,
        "application/zstd" => return None,

        // Treat all image content as binary.
        v if v.starts_with("image/") => return None,

        // Treat all audio content as binary.
        v if v.starts_with("audio/") => return None,

        // Treat all video content as binary.
        v if v.starts_with("video/") => return None,

        // Treat all font content as binary.
        v if v.starts_with("font/") => return None,

        _ => {}
- edit in inflorescence/src/file.rs at line 208
    // If the input bytes are *almost* valid UTF-8, treat them as
    // UTF-8. This is helpful when the user has written a small number
    // of bad bytes to a file. Users would still like to be able to
    // diff these files.
    let utf8_string = String::from_utf8_lossy(data).to_string();
    let num_utf8_invalid = utf8_string
        .chars()
        .take(50000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
    if num_utf8_invalid <= 2 {
        return Some(pijul::Encoding(encoding_rs::UTF_8));
    }

    // Fallback to pijul encoding detection
    pijul::change::get_encoding(data)
- edit in inflorescence/src/diff.rs at line 13
use tracing::error; - replacement in inflorescence/src/diff.rs at line 108
(repo::Contents::UnknownEncoding(_change), repo::Contents::UnknownEncoding(_replacement)) => {
(_, repo::Contents::UnknownEncoding(_)) |
(repo::Contents::UnknownEncoding(_), _) => {
- replacement in inflorescence/src/diff.rs at line 117
unimplemented!("The change and replacement have different encoding!");
error!("The change and replacement have different encoding! Change: {change_contents:?}, replacement: {replacement_contents:?}");
without.push(DiffWithoutContents::Replacement {
    line: *line,
    change_contents: UndecodableContents::UnknownEncoding,
    replacement_contents: UndecodableContents::UnknownEncoding,
});
- edit in inflorescence/Cargo.toml at line 28
workspace = true[dependencies.encoding_rs] - edit in inflorescence/Cargo.toml at line 42
workspace = true[dependencies.tree_magic_mini] - edit in Cargo.lock at line 2581
"encoding_rs", - edit in Cargo.lock at line 2594
"tree_magic_mini", - edit in Cargo.lock at line 2897
"encoding_rs", - edit in Cargo.lock at line 2917
"tree_magic_mini",