detect non-utf8 text files
[?]
Dec 9, 2020, 7:57 AM
6HNRL5RT76NH5YNSUN7B4FHNRZXKNLX4DROFGMO4R5P2U7JWOL2QC
Dependencies
- [2] JACZWIJ6 Version bump
- [3] WZVCLZKY address clippy lints
- [4] KJDQ2WOM Fixing the parsing of section headers in the text change format
- [5] SXEYMYF7 Fixing the bad changes in history (unfortunately, by rebooting).
- [6] VO5OQW4W Removing anyhow in libpijul
- [*] OUWD436A Version bump
- [*] SAGSYAPX Various version bumps
Change contents
- file addition: text.rs[3.248792]
use super::*;use crate::change::*;use crate::working_copy::WorkingCopy;#[test]fn add_non_utf8_file_test() -> Result<(), anyhow::Error> {env_logger::try_init().unwrap_or(());let mut buf = Vec::new();use std::io::Read;let mut fh = std::fs::File::open("src/tests/data/1252.1")?;fh.read_to_end(&mut buf)?;let mut repo = working_copy::memory::Memory::new();repo.add_file("file", buf);let env = pristine::sanakirja::Pristine::new_anon()?;let mut txn = env.mut_txn_begin();let mut channel = txn.open_or_create_channel("main")?;txn.add_file("file")?;let store = changestore::memory::Memory::new();let (h, change) = record_all(&mut repo, &store, &mut txn, &mut channel, "")?;let mut v = Vec::new();change.write(&store,Some(h),|l, _p| format!("{}:{}", l.path, l.line),true,&mut v,).unwrap();for l in std::str::from_utf8(&v).unwrap().lines() {error!("{:?}", l);}let lines = std::str::from_utf8(&v).unwrap().lines();assert_eq!(1,lines.clone().filter(|l| l.starts_with("+") && l.contains("French / Français (Windows CP 1252)")).count());assert_eq!(1,lines.filter(|l| l.starts_with("+") && l.contains("€‚ƒ„…†‡, Salut")).count());Ok(())}/// Change a non-utf-8 text file.#[test]fn change_non_utf8_file_test() -> Result<(), anyhow::Error> {env_logger::try_init().unwrap_or(());let mut buf = Vec::new();use std::io::Read;let mut fh = std::fs::File::open("src/tests/data/8859-1.1")?;fh.read_to_end(&mut buf)?;let mut repo = working_copy::memory::Memory::new();repo.add_file("file", buf);let env = pristine::sanakirja::Pristine::new_anon()?;let mut txn = env.mut_txn_begin();let mut channel = txn.open_or_create_channel("main")?;txn.add_file("file")?;let store = changestore::memory::Memory::new();record_all(&mut repo, &store, &mut txn, &mut channel, "")?;let mut buf = Vec::new();{use std::io::Read;let mut fh = std::fs::File::open("src/tests/data/8859-1.2")?;fh.read_to_end(&mut buf)?;}repo.write_file::<_, std::io::Error, _>("file", |w| {w.write_all(&buf).unwrap();Ok(())})?;let (h1, change1) = record_all(&mut 
repo, &store, &mut txn, &mut channel, "")?;// only one line was changedlet mut v = Vec::new();change1.write(&store,Some(h1),|l, _p| format!("{}:{}", l.path, l.line),true,&mut v,).unwrap();for l in std::str::from_utf8(&v).unwrap().lines() {error!("{:?}", l);}assert_eq!(1,std::str::from_utf8(&v).unwrap().lines().filter(|l| l.starts_with("-")&& l.contains("French / Français (ISO Latin-1 / ISO 8859-1)")).count());Ok(())}fn record_all<T: MutTxnT, R: WorkingCopy, P: ChangeStore>(repo: &mut R,store: &P,txn: &mut T,channel: &mut ChannelRef<T>,prefix: &str,) -> Result<(Hash, Change), anyhow::Error>whereR::Error: Send + Sync + 'static,{let mut state = Builder::new();state.record(txn, Algorithm::default(), channel, repo, store, prefix)?;let rec = state.finish();let changes = rec.actions.into_iter().map(|rec| rec.globalize(txn)).collect();let change0 = crate::change::Change::make_change(txn,&channel,changes,rec.contents,crate::change::ChangeHeader {message: "test".to_string(),authors: vec![],description: None,// Beware of changing the following line: two changes// doing the same thing will be equal. Sometimes we don't// want that, as in tests::unrecord::unrecord_double.timestamp: chrono::Utc::now(),},Vec::new(),);let hash = store.save_change(&change0)?;apply::apply_local_change(txn, channel, &change0, hash, &rec.updatables)?;Ok((hash, change0))} - edit in libpijul/src/tests/mod.rs at line 19
mod text; - file addition: data[3.248792]
- file addition: gb.2[0.4375]
- file addition: gb.1[0.4375]
- file addition: 8859-1.2[0.4375]
- file addition: 8859-1.1[0.4375]
- file addition: 1252.2[0.4375]
- file addition: 1252.1[0.4375]
- replacement in libpijul/src/record.rs at line 245
let utf8 = std::str::from_utf8(&self.rec.contents[s..e]);debug!("utf8 = {:?}", utf8);match utf8 {Err(e) => e.valid_up_to() < CHECK_UTF8,Ok(_) => false,}let mime = tree_magic_mini::from_u8(&self.rec.contents[s..e]);debug!("mime = {:?}", mime);!mime.starts_with("text/") - replacement in libpijul/src/diff/mod.rs at line 54
if (std::str::from_utf8(&d.contents_a).is_err() || std::str::from_utf8(&b).is_err())&& d.contents_a != b{let mime_a = tree_magic_mini::from_u8(&d.contents_a);let mime_b = tree_magic_mini::from_u8(&b);debug!("mimes = {:?}, {:?}", mime_a, mime_b);if (!mime_a.starts_with("text/") || !mime_b.starts_with("text/")) && d.contents_a != b { - edit in libpijul/src/change/text_changes.rs at line 4
use chardetng::EncodingDetector; - replacement in libpijul/src/change/text_changes.rs at line 1161
if let Ok(mut contents) = std::str::from_utf8(&contents) {while let Some(n) = contents.as_bytes().iter().position(|&c| c == b'\n') {let (a, b) = contents.split_at(n + 1);contents = b;write!(w, "{} {}", pref, a)?;if tree_magic_mini::from_u8(&contents).starts_with("text/") {let mut detector = EncodingDetector::new();detector.feed(&contents, true);let encoding = detector.guess(None, true);debug!("guessed encoding = {:?}", encoding.name());let (contents, encoding, malformed) = encoding.decode(&contents);debug!("final encoding = {:?}", encoding.name());if malformed {warn!("text file was malformed, should probably try binary instead") - replacement in libpijul/src/change/text_changes.rs at line 1171
if !contents.is_empty() {writeln!(w, "{} {}", pref, contents)?;for a in contents.split_terminator('\n') {writeln!(w, "{} {}", pref, a)?; - edit in libpijul/Cargo.toml at line 72
"src/tests/text.rs", - edit in libpijul/Cargo.toml at line 114
tree_magic_mini = "1.0.0"chardetng = "0.1.10"encoding_rs = "0.8.26" - edit in Cargo.lock at line 177
[[package]]name = "chardetng"version = "0.1.10"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "f866cba7596c2e70200523e399101d460514a5e59191223aa87d579e49e52025"dependencies = ["cfg-if 0.1.10","encoding_rs","memchr",] - edit in Cargo.lock at line 481
name = "fixedbitset"version = "0.2.0"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"[[package]] - edit in Cargo.lock at line 960
[[package]]name = "lexical-core"version = "0.7.4"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616"dependencies = ["arrayvec","bitflags","cfg-if 0.1.10","ryu","static_assertions",] - edit in Cargo.lock at line 1003
"chardetng", - edit in Cargo.lock at line 1008[8.867][9.5758]
"encoding_rs", - edit in Cargo.lock at line 1026
"tree_magic_mini", - edit in Cargo.lock at line 1217
][[package]]name = "nom"version = "5.1.2"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"dependencies = ["lexical-core","memchr","version_check", - edit in Cargo.lock at line 1377
[[package]]name = "petgraph"version = "0.5.1"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7"dependencies = ["fixedbitset","indexmap",] - edit in Cargo.lock at line 1853
name = "static_assertions"version = "1.1.0"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"[[package]] - edit in Cargo.lock at line 2174
][[package]]name = "tree_magic_mini"version = "1.0.0"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "92a265e0c5b89a31cb939a9d7ffce63382e450af10df56a3b9bfb7084d3c2178"dependencies = ["fnv","lazy_static","nom","petgraph",