detect non-utf8 text files

[?]
Dec 9, 2020, 7:57 AM
6HNRL5RT76NH5YNSUN7B4FHNRZXKNLX4DROFGMO4R5P2U7JWOL2QC

Dependencies

  • [2] JACZWIJ6 Version bump
  • [3] VO5OQW4W Removing anyhow in libpijul
  • [4] KJDQ2WOM Fixing the parsing of section headers in the text change format
  • [5] SXEYMYF7 Fixing the bad changes in history (unfortunately, by rebooting).
  • [6] WZVCLZKY address clippy lints
  • [*] OUWD436A Version bump
  • [*] SAGSYAPX Various version bumps

Change contents

  • file addition: text.rs (-xw-x--x--)
    [3.248792]
    use super::*;
    use crate::change::*;
    use crate::working_copy::WorkingCopy;
    /// Record the `1252.1` fixture as a new file and check that the textual
    /// rendering of the resulting change lists its lines as additions.
    #[test]
    fn add_non_utf8_file_test() -> Result<(), anyhow::Error> {
        env_logger::try_init().unwrap_or(());

        // Load the fixture and register it in an in-memory working copy.
        let fixture = std::fs::read("src/tests/data/1252.1")?;
        let mut repo = working_copy::memory::Memory::new();
        repo.add_file("file", fixture);

        // Fresh anonymous pristine with a single channel tracking the file.
        let env = pristine::sanakirja::Pristine::new_anon()?;
        let mut txn = env.mut_txn_begin();
        let mut channel = txn.open_or_create_channel("main")?;
        txn.add_file("file")?;

        let store = changestore::memory::Memory::new();
        let (hash, change) = record_all(&mut repo, &store, &mut txn, &mut channel, "")?;

        // Render the change in its text format into a byte buffer.
        let mut rendered = Vec::new();
        let written = change.write(
            &store,
            Some(hash),
            |l, _p| format!("{}:{}", l.path, l.line),
            true,
            &mut rendered,
        );
        written.unwrap();

        // The rendered change itself must be valid UTF-8.
        let text = std::str::from_utf8(&rendered).unwrap();
        for line in text.lines() {
            error!("{:?}", line);
        }

        // Number of added ("+") lines that contain `needle`.
        let count_added = |needle: &str| {
            text.lines()
                .filter(|line| line.starts_with('+') && line.contains(needle))
                .count()
        };

        assert_eq!(1, count_added("French / Français (Windows CP 1252)"));
        assert_eq!(1, count_added("€‚ƒ„…†‡, Salut"));
        Ok(())
    }
    /// Change a non-utf-8 text file.
    ///
    /// Records the `8859-1.1` fixture, overwrites the file with the contents
    /// of `8859-1.2`, records again, and checks the textual rendering of the
    /// second change for the expected deleted line.
    #[test]
    fn change_non_utf8_file_test() -> Result<(), anyhow::Error> {
        env_logger::try_init().unwrap_or(());

        // First version of the file, recorded as an addition.
        let original = std::fs::read("src/tests/data/8859-1.1")?;
        let mut repo = working_copy::memory::Memory::new();
        repo.add_file("file", original);

        let env = pristine::sanakirja::Pristine::new_anon()?;
        let mut txn = env.mut_txn_begin();
        let mut channel = txn.open_or_create_channel("main")?;
        txn.add_file("file")?;

        let store = changestore::memory::Memory::new();
        record_all(&mut repo, &store, &mut txn, &mut channel, "")?;

        // Second version: overwrite the working-copy file, then record again.
        let updated = std::fs::read("src/tests/data/8859-1.2")?;
        repo.write_file::<_, std::io::Error, _>("file", |w| {
            w.write_all(&updated).unwrap();
            Ok(())
        })?;
        let (hash, change) = record_all(&mut repo, &store, &mut txn, &mut channel, "")?;

        // only one line was changed
        let mut rendered = Vec::new();
        let written = change.write(
            &store,
            Some(hash),
            |l, _p| format!("{}:{}", l.path, l.line),
            true,
            &mut rendered,
        );
        written.unwrap();

        // The rendered change itself must be valid UTF-8.
        let text = std::str::from_utf8(&rendered).unwrap();
        for line in text.lines() {
            error!("{:?}", line);
        }

        // Exactly one deleted ("-") line should carry the old header text.
        let removed = text
            .lines()
            .filter(|line| {
                line.starts_with('-')
                    && line.contains("French / Français (ISO Latin-1 / ISO 8859-1)")
            })
            .count();
        assert_eq!(1, removed);
        Ok(())
    }
    fn record_all<T: MutTxnT, R: WorkingCopy, P: ChangeStore>(
    repo: &mut R,
    store: &P,
    txn: &mut T,
    channel: &mut ChannelRef<T>,
    prefix: &str,
    ) -> Result<(Hash, Change), anyhow::Error>
    where
    R::Error: Send + Sync + 'static,
    {
    let mut state = Builder::new();
    state.record(txn, Algorithm::default(), channel, repo, store, prefix)?;
    let rec = state.finish();
    let changes = rec
    .actions
    .into_iter()
    .map(|rec| rec.globalize(txn))
    .collect();
    let change0 = crate::change::Change::make_change(
    txn,
    &channel,
    changes,
    rec.contents,
    crate::change::ChangeHeader {
    message: "test".to_string(),
    authors: vec![],
    description: None,
    // Beware of changing the following line: two changes
    // doing the same thing will be equal. Sometimes we don't
    // want that, as in tests::unrecord::unrecord_double.
    timestamp: chrono::Utc::now(),
    },
    Vec::new(),
    );
    let hash = store.save_change(&change0)?;
    apply::apply_local_change(txn, channel, &change0, hash, &rec.updatables)?;
    Ok((hash, change0))
    }
  • edit in libpijul/src/tests/mod.rs at line 19
    [3.289715]
    [3.289715]
    mod text;
  • file addition: data (dxwrx-rx-r)
    [3.248792]
  • file addition: gb.2 (-xw-x--x--)
    [0.4375]
  • file addition: gb.1 (-xw-x--x--)
    [0.4375]
  • file addition: 8859-1.2 (-xw-x--x--)
    [0.4375]
  • file addition: 8859-1.1 (-xw-x--x--)
    [0.4375]
  • file addition: 1252.2 (-xw-x--x--)
    [0.4375]
  • file addition: 1252.1 (-xw-x--x--)
    [0.4375]
  • replacement in libpijul/src/record.rs at line 245
    [3.496931][3.496931:497193]()
    let utf8 = std::str::from_utf8(&self.rec.contents[s..e]);
    debug!("utf8 = {:?}", utf8);
    match utf8 {
    Err(e) => e.valid_up_to() < CHECK_UTF8,
    Ok(_) => false,
    }
    [3.496931]
    [3.497193]
    let mime = tree_magic_mini::from_u8(&self.rec.contents[s..e]);
    debug!("mime = {:?}", mime);
    !mime.starts_with("text/")
  • replacement in libpijul/src/diff/mod.rs at line 54
    [3.794648][3.5785:5878](),[3.5878][3.5878:5921]()
    if (std::str::from_utf8(&d.contents_a).is_err() || std::str::from_utf8(&b).is_err())
    && d.contents_a != b
    {
    [3.794648]
    [3.5921]
    let mime_a = tree_magic_mini::from_u8(&d.contents_a);
    let mime_b = tree_magic_mini::from_u8(&b);
    debug!("mimes = {:?}, {:?}", mime_a, mime_b);
    if (!mime_a.starts_with("text/") || !mime_b.starts_with("text/")) && d.contents_a != b {
  • edit in libpijul/src/change/text_changes.rs at line 4
    [3.38019]
    [3.38019]
    use chardetng::EncodingDetector;
  • replacement in libpijul/src/change/text_changes.rs at line 1161
    [3.84728][3.84728:84993]()
    if let Ok(mut contents) = std::str::from_utf8(&contents) {
    while let Some(n) = contents.as_bytes().iter().position(|&c| c == b'\n') {
    let (a, b) = contents.split_at(n + 1);
    contents = b;
    write!(w, "{} {}", pref, a)?;
    [3.84728]
    [3.84993]
    if tree_magic_mini::from_u8(&contents).starts_with("text/") {
    let mut detector = EncodingDetector::new();
    detector.feed(&contents, true);
    let encoding = detector.guess(None, true);
    debug!("guessed encoding = {:?}", encoding.name());
    let (contents, encoding, malformed) = encoding.decode(&contents);
    debug!("final encoding = {:?}", encoding.name());
    if malformed {
    warn!("text file was malformed, should probably try binary instead")
  • replacement in libpijul/src/change/text_changes.rs at line 1171
    [3.85003][3.85003:85088]()
    if !contents.is_empty() {
    writeln!(w, "{} {}", pref, contents)?;
    [3.85003]
    [3.85088]
    for a in contents.split_terminator('\n') {
    writeln!(w, "{} {}", pref, a)?;
  • edit in libpijul/Cargo.toml at line 72
    [3.1022480]
    [3.1022480]
    "src/tests/text.rs",
  • edit in libpijul/Cargo.toml at line 114
    [3.1023338]
    [2.228]
    tree_magic_mini = "1.0.0"
    chardetng = "0.1.10"
    encoding_rs = "0.8.26"
  • edit in Cargo.lock at line 177
    [3.1036080]
    [3.1036080]
    [[package]]
    name = "chardetng"
    version = "0.1.10"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "f866cba7596c2e70200523e399101d460514a5e59191223aa87d579e49e52025"
    dependencies = [
    "cfg-if 0.1.10",
    "encoding_rs",
    "memchr",
    ]
  • edit in Cargo.lock at line 481
    [3.1043055]
    [3.1043055]
    name = "fixedbitset"
    version = "0.2.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"
    [[package]]
  • edit in Cargo.lock at line 960
    [3.1054528]
    [3.1054528]
    [[package]]
    name = "lexical-core"
    version = "0.7.4"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616"
    dependencies = [
    "arrayvec",
    "bitflags",
    "cfg-if 0.1.10",
    "ryu",
    "static_assertions",
    ]
  • edit in Cargo.lock at line 1003
    [8.807]
    [8.807]
    "chardetng",
  • edit in Cargo.lock at line 1008
    [8.867]
    [9.5758]
    "encoding_rs",
  • edit in Cargo.lock at line 1026
    [3.1055861]
    [3.1055861]
    "tree_magic_mini",
  • edit in Cargo.lock at line 1217
    [3.1060297]
    [3.1060297]
    ]
    [[package]]
    name = "nom"
    version = "5.1.2"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"
    dependencies = [
    "lexical-core",
    "memchr",
    "version_check",
  • edit in Cargo.lock at line 1377
    [3.1064043]
    [3.1064043]
    [[package]]
    name = "petgraph"
    version = "0.5.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7"
    dependencies = [
    "fixedbitset",
    "indexmap",
    ]
  • edit in Cargo.lock at line 1853
    [3.1075404]
    [3.1075404]
    name = "static_assertions"
    version = "1.1.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
    [[package]]
  • edit in Cargo.lock at line 2174
    [3.1082329]
    [3.1082329]
    ]
    [[package]]
    name = "tree_magic_mini"
    version = "1.0.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "92a265e0c5b89a31cb939a9d7ffce63382e450af10df56a3b9bfb7084d3c2178"
    dependencies = [
    "fnv",
    "lazy_static",
    "nom",
    "petgraph",