Initial support for binary diffs (conflicts are not yet supported in the output)

pmeunier
Jul 1, 2021, 1:15 PM
TGA6QXGIZBDTXMOCHH23HJUGWYOCJQ32M3CQ3U7HUF3TDJO6QNIQC

Dependencies

  • [2] XR7MNOMU file encoding in updates
  • [3] Z6FWHKCA Improving the UI around zombie conflicts
  • [4] WZVCLZKY address clippy lints
  • [5] Q3GU26WD merge with changes from sanakirja v1.1.2
  • [6] 6HNRL5RT detect non-utf8 text files
  • [7] IYJZVLET Cleaning up the literate programming bits
  • [8] NA5I4WYN Fixing the inverse of conflict resolutions
  • [9] SXEYMYF7 Fixing the bad changes in history (unfortunately, by rebooting).
  • [10] GHO6DWPI Refactoring iterators
  • [11] CCLLB7OI Upgrading to Sanakirja 0.15 + version bump
  • [12] 3AMEP2Y5 More convenient interface for channels
  • [*] VO5OQW4W Removing anyhow in libpijul

Change contents

  • edit in libpijul/src/diff/mod.rs at line 7
    [2.677]
    [4.793368]
    mod bin;
  • edit in libpijul/src/diff/mod.rs at line 21
    [4.793593]
    [4.793593]
    }
    impl<'a> Default for Line<'a> {
    /// Builds the neutral `Line`: an empty byte slice with the `cyclic`,
    /// `before_end_marker` and `last` flags all set to `false`.
    fn default() -> Self {
    Line {
    l: &[],
    cyclic: false,
    before_end_marker: false,
    last: false,
    }
    }
  • edit in libpijul/src/diff/mod.rs at line 61
    [4.115839]
    [14.33372]
    }
    /// Builds one `Line` per element yielded by `d.lines()`.
    ///
    /// For every line, the byte offset of the line inside `d.contents_a` is
    /// derived from pointer arithmetic and used to fill in the flags:
    ///
    /// * `cyclic`: the start offset lies in one of the half-open ranges of
    ///   `d.cyclic_conflict_bytes` (looked up by binary search),
    /// * `before_end_marker`: the line does not end with `\n` and `d.marker`
    ///   holds a `ConflictMarker::End` at the offset one past the line's end
    ///   plus one,
    /// * `last`: the line's end offset reaches the end of `d.contents_a`.
    fn make_old_lines(d: &vertex_buffer::Diff) -> Vec<Line> {
        // Address of the first byte of the old contents; all offsets are
        // relative to it.
        let base = d.contents_a.as_ptr() as usize;
        d.lines()
            .map(|l| {
                let start = l.as_ptr() as usize - base;
                let end = start + l.len();

                // `(start, usize::MAX)` sorts after every range beginning at
                // `start`, so on `Err(n)` the candidate range is the one just
                // before the insertion point.
                let cyclic = match d
                    .cyclic_conflict_bytes
                    .binary_search(&(start, std::usize::MAX))
                {
                    Err(n) if n > 0 => {
                        let (lo, hi) = d.cyclic_conflict_bytes[n - 1];
                        lo <= start && start < hi
                    }
                    _ => false,
                };

                // Only lines without a trailing newline are checked against
                // the marker table.
                let before_end_marker = l.last() != Some(&b'\n')
                    && d.marker.get(&(end + 1)) == Some(&vertex_buffer::ConflictMarker::End);

                debug!("old = {:?}", l);
                Line {
                    l,
                    cyclic,
                    before_end_marker,
                    last: end >= d.contents_a.len(),
                }
            })
            .collect()
    }
    fn make_new_lines(b: &[u8]) -> Vec<Line> {
    split::LineSplit::from(b)
    .map(|l| {
    debug!("new: {:?}", l);
    let next_index = l.as_ptr() as usize + l.len() - b.as_ptr() as usize;
    Line {
    l,
    cyclic: false,
    before_end_marker: false,
    last: next_index >= b.len(),
    }
    })
    .collect()
  • replacement in libpijul/src/diff/mod.rs at line 128
    [2.783][2.783:815](),[2.815][4.1064:1149](),[4.5212][4.1064:1149](),[4.1149][4.5994:6021](),[4.6760][4.5994:6021](),[4.63649][4.5994:6021](),[4.116037][4.5994:6021](),[4.5994][4.5994:6021](),[4.6021][4.794898:795855](),[4.794898][4.794898:795855](),[4.795855][4.143:184](),[4.184][4.795855:796248](),[4.795855][4.795855:796248](),[4.796248][4.185:225](),[4.225][4.796248:796567](),[4.796248][4.796248:796567]()
    if encoding.is_none() {
    self.diff_binary(changes, txn, txn.graph(channel), path, inode, a, &b)?;
    return Ok(());
    }
    let lines_a: Vec<Line> = d
    .lines()
    .map(|l| {
    let old_bytes = l.as_ptr() as usize - d.contents_a.as_ptr() as usize;
    let cyclic = if let Err(n) = d
    .cyclic_conflict_bytes
    .binary_search(&(old_bytes, std::usize::MAX))
    {
    n > 0 && {
    let (a, b) = d.cyclic_conflict_bytes[n - 1];
    a <= old_bytes && old_bytes < b
    }
    } else {
    false
    };
    let before_end_marker = if l.last() != Some(&b'\n') {
    let next_index =
    l.as_ptr() as usize + l.len() - d.contents_a.as_ptr() as usize + 1;
    d.marker.get(&next_index) == Some(&vertex_buffer::ConflictMarker::End)
    } else {
    false
    };
    debug!("old = {:?}", l);
    Line {
    l,
    cyclic,
    before_end_marker,
    last: l.as_ptr() as usize + l.len() - d.contents_a.as_ptr() as usize
    >= d.contents_a.len(),
    }
    })
    .collect();
    let lines_b: Vec<Line> = split::LineSplit::from(&b[..])
    .map(|l| {
    debug!("new: {:?}", l);
    let next_index = l.as_ptr() as usize + l.len() - b.as_ptr() as usize;
    Line {
    l,
    cyclic: false,
    before_end_marker: false,
    last: next_index >= b.len(),
    }
    })
    .collect();
    [2.783]
    [3.29]
    let (lines_a, lines_b) = if encoding.is_none() {
    // self.diff_binary(changes, txn, txn.graph(channel), path, inode, a, &b)?;
    // return Ok(());
    const ROLLING_SIZE: usize = 8192;
    let (ah, old) = bin::make_old_chunks(ROLLING_SIZE, &d.contents_a);
    let (bb, new) = bin::make_new_chunks(ROLLING_SIZE, &ah, &b);
    debug!("bb = {:?}", bb);
    (old, new)
    } else {
    (make_old_lines(&d), make_new_lines(&b))
    };
  • replacement in libpijul/src/diff/mod.rs at line 265
    [4.800857][4.800857:800910]()
    debug!("bytes pos {:?} {:?}", old, chunks[old]);
    [4.800857]
    [4.800910]
    debug!(
    "bytes pos {:?} {:?}",
    old,
    Line {
    l: &(chunks[old].l)[..20.min(chunks[old].l.len())],
    ..chunks[old]
    }
    );
  • file addition: bin.rs (----------)
    [4.768883]
    use adler32::*;
    use std::collections::hash_map::Entry;
    use std::collections::HashMap;
    /// Splits the old contents `a` into consecutive chunks of at most `window`
    /// bytes and indexes the distinct chunks by their Adler-32 checksum.
    ///
    /// Returns a pair:
    ///
    /// * a map from checksum to the list of `(chunk index, chunk bytes)`
    ///   entries having that checksum. A chunk whose bytes are identical to
    ///   one already recorded is not recorded again, so only the first
    ///   occurrence of each distinct chunk is kept;
    /// * one `Line` per chunk, in order (duplicates included), with `last`
    ///   set on the final one.
    ///
    /// # Panics
    ///
    /// Panics if `window` is zero (division by zero and `slice::chunks(0)`).
    pub(super) fn make_old_chunks(
        window: usize,
        a: &[u8],
    ) -> (HashMap<u32, Vec<(usize, &[u8])>>, Vec<super::Line>) {
        let mut a_h = HashMap::with_capacity(a.len() / window + 1);
        let mut lines = Vec::with_capacity(a.len() / window + 1);
        // `enumerate` keeps the recorded position equal to the real chunk
        // index even when a duplicate chunk is skipped below; a hand-maintained
        // counter that is bypassed by `continue` would drift after the first
        // duplicate.
        'outer: for (pos, ch) in a.chunks(window).enumerate() {
            debug!("chunk {:?}", ch.len());
            lines.push(super::Line {
                l: ch,
                ..super::Line::default()
            });
            // Reading from an in-memory slice cannot fail.
            let ad = adler32(ch).expect("reading from a byte slice cannot fail");
            match a_h.entry(ad) {
                Entry::Vacant(e) => {
                    e.insert(vec![(pos, ch)]);
                }
                Entry::Occupied(mut e) => {
                    let bucket = e.get_mut();
                    // Same checksum: only skip if the bytes really are equal.
                    for (_, old) in bucket.iter() {
                        if *old == ch {
                            continue 'outer;
                        }
                    }
                    bucket.push((pos, ch));
                }
            }
        }
        if let Some(l) = lines.last_mut() {
            l.last = true
        }
        (a_h, lines)
    }
    /// Cuts the new contents `b` into chunks by sliding a rolling Adler-32
    /// window over it and looking each window up in `a_h` (the index built
    /// from the old contents).
    ///
    /// * When the current window `b[j..i]` is byte-equal to an indexed old
    ///   chunk, a `Chunk::Old` is emitted and the window jumps forward by
    ///   `window` bytes (or to the end of `b`).
    /// * Otherwise the byte at `j` is appended to the current `Chunk::New`
    ///   (starting one if needed) and the window slides by one byte.
    ///
    /// Returns the chunk list together with one `Line` per chunk borrowing the
    /// corresponding bytes of `b`; the final line has `last` set.
    ///
    /// `b` may be shorter than `window`, or empty. `window` is expected to be
    /// non-zero.
    pub(super) fn make_new_chunks<'a>(
        window: usize,
        a_h: &HashMap<u32, Vec<(usize, &[u8])>>,
        b: &'a [u8],
    ) -> (Vec<Chunk>, Vec<super::Line<'a>>) {
        // Clamp the initial window: slicing `b[..window]` would panic when the
        // new contents are shorter than one window.
        let mut i = window.min(b.len());
        let mut ad = RollingAdler32::from_buffer(&b[..i]);
        let mut bb = Vec::new();
        let mut j = 0;
        while j < b.len() {
            let h = ad.hash();
            // A checksum hit is only a candidate: confirm with a byte
            // comparison. A hit with no byte-equal candidate (a collision)
            // must fall through to the "new byte" path, otherwise neither
            // cursor advances and the loop never terminates.
            let matched = a_h
                .get(&h)
                .and_then(|v| v.iter().find(|&&(_, old)| old == &b[j..i]))
                .map(|&(old_pos, _)| old_pos);
            if let Some(old_pos) = matched {
                // We've found a match from the old version.
                bb.push(Chunk::Old {
                    start: j,
                    end: i,
                    old_pos,
                });
                // Slide the window past the matched chunk, one byte at a
                // time so the rolling checksum stays in sync.
                for _ in 0..window {
                    if j < b.len() {
                        ad.remove(i - j, b[j]);
                        j += 1;
                    }
                    if i < b.len() {
                        ad.update(b[i]);
                        i += 1;
                    }
                }
            } else {
                // Extend the trailing run of new bytes, or start a new run.
                if let Some(Chunk::New { len, .. }) = bb.last_mut() {
                    *len += 1
                } else {
                    bb.push(Chunk::New { start: j, len: 1 })
                }
                ad.remove(i - j, b[j]);
                j += 1;
                if i < b.len() {
                    ad.update(b[i]);
                    i += 1;
                }
            }
        }
        let mut lines: Vec<super::Line<'a>> = bb
            .iter()
            .map(|chunk| {
                let l = match *chunk {
                    Chunk::Old { start, end, .. } => &b[start..end],
                    Chunk::New { start, len } => &b[start..start + len],
                };
                super::Line {
                    l,
                    ..super::Line::default()
                }
            })
            .collect();
        if let Some(l) = lines.last_mut() {
            l.last = true
        }
        (bb, lines)
    }
    /// A segment of the new contents, as produced by `make_new_chunks`.
    #[derive(Debug)]
    pub(super) enum Chunk {
    /// A window of the new contents that is byte-equal to a chunk of the
    /// old contents.
    Old {
    // Start offset (inclusive) of the segment in the new contents.
    start: usize,
    // End offset (exclusive) of the segment in the new contents.
    end: usize,
    // Position recorded in the old-chunk index for the matching old chunk.
    old_pos: usize,
    },
    /// A run of bytes of the new contents for which no matching old chunk
    /// was found.
    New {
    // Start offset of the run in the new contents.
    start: usize,
    // Number of bytes in the run.
    len: usize,
    },
    }
    /*
    pub fn diff<D: diffs::Diff>(window: usize, a: &[u8], b: &[u8], d: D)
    where
    D::Error: std::fmt::Debug,
    {
    let a_h = make_old_chunks(window, a);
    let bb = make_new_chunks(window, &a_h, b);
    // Make a dummy vector (because `std::ops::Index` wants a borrow).
    let mut aa = Vec::with_capacity(a.len() / window + 1);
    for pos in 0..(a.len() + window - 1) / window {
    aa.push(pos)
    }
    diffs::myers::diff(
    &mut W {
    d,
    window,
    old_len: a.len(),
    a: &aa,
    b: &bb,
    },
    &aa,
    0,
    aa.len(),
    &bb,
    0,
    bb.len(),
    )
    .unwrap();
    }
    impl Chunk {
    fn start(&self) -> usize {
    match *self {
    Chunk::Old { start, .. } => start,
    Chunk::New { start, .. } => start,
    }
    }
    }
    impl PartialEq<usize> for Chunk {
    fn eq(&self, b: &usize) -> bool {
    if let Chunk::Old { old_pos, .. } = *self {
    old_pos == *b
    } else {
    false
    }
    }
    }
    #[derive(Debug)]
    struct W<'a, D> {
    d: D,
    window: usize,
    old_len: usize,
    a: &'a [usize],
    b: &'a [Chunk],
    }
    impl<'a, D: diffs::Diff> diffs::Diff for W<'a, D>
    where
    D::Error: std::fmt::Debug,
    {
    type Error = D::Error;
    fn equal(&mut self, old: usize, new: usize, len: usize) -> Result<(), Self::Error> {
    let old = old * self.window;
    let new = self.b[new].start();
    let len = (len * self.window).min(self.old_len - old);
    self.d.equal(old, new, len)
    }
    fn delete(&mut self, old: usize, len: usize, new: usize) -> Result<(), Self::Error> {
    let old = old * self.window;
    let new = self.b[new].start();
    let len = (len * self.window).min(self.old_len - old);
    self.d.delete(old, len, new)
    }
    fn insert(&mut self, old: usize, new: usize, new_len: usize) -> Result<(), Self::Error> {
    let old = old * self.window;
    let new = self.b[new].start();
    let mut new_len_ = 0;
    for b in &self.b[new .. new + new_len] {
    match b {
    Chunk::Old { start, .. } => {
    new_len_ += self.window.min(self.old_len - start)
    }
    Chunk::New { len, .. } => {
    new_len_ += len
    }
    }
    }
    self.d.insert(old, new, new_len_)
    }
    fn replace(
    &mut self,
    old: usize,
    old_len: usize,
    new: usize,
    new_len: usize,
    ) -> Result<(), Self::Error> {
    let old = old * self.window;
    let old_len = (old_len * self.window).min(self.old_len - old);
    let new = self.b[new].start();
    let mut new_len_ = 0;
    for b in &self.b[new .. new + new_len] {
    match b {
    Chunk::Old { start, .. } => {
    new_len_ += self.window.min(self.old_len - start)
    }
    Chunk::New { len, .. } => {
    new_len_ += len
    }
    }
    }
    self.d.replace(old, old_len, new, new_len_)
    }
    fn finish(&mut self) -> Result<(), Self::Error> {
    self.d.finish()
    }
    }
    */