Faster guessing of encoding

pmeunier
Jun 29, 2021, 10:59 AM
6CZYYOG7C7ULY5Q4WSRVVC3QVSYACV3KFEYXDCH63OXXX2QFFFBAC

Dependencies

  • [2] 52SOYOCN tree_magic instead of tree_magic_mini
  • [3] QJXNUQFJ Solving conflicts
  • [4] JRPSGFZY Compilation problem on Windows
  • [5] NYOF5766 track file encoding in the record, including change text for file adds
  • [6] PDTUHOMV fix left over conflicts
  • [7] JRENVH5D Reqwest 0.11
  • [8] EEBKW7VT Keys and identities
  • [9] W4NSLQNG make text_encoding available to all of libpijul
  • [10] 246V5TYI decode existing files
  • [11] TVVW53HZ Conflict resolution
  • [12] SXEYMYF7 Fixing the bad changes in history (unfortunately, by rebooting).
  • [13] 6HNRL5RT detect non-utf8 text files
  • [14] UM5DLRPB store new non-UTF-8 files raw and decode to display the contents
  • [15] I24UEJQL Various post-fire fixes

Change contents

  • replacement in libpijul/src/working_copy/mod.rs at line 34
    [5.351][5.159:243](),[5.159][5.159:243](),[5.243][2.0:50](),[2.50][5.407:444](),[5.407][5.407:444](),[5.444][5.113:167](),[5.167][5.342:560](),[5.513][5.342:560](),[5.342][5.342:560](),[5.560][5.168:245](),[5.245][5.636:726](),[5.636][5.636:726](),[5.726][5.246:287](),[5.287][5.788:859](),[5.586][5.788:859](),[5.788][5.788:859](),[5.859][5.288:309](),[5.309][5.891:905](),[5.891][5.891:905]()
    let mut uncoded = Vec::new();
    self.read_file(&file, &mut uncoded)?;
    let mime = tree_magic::from_u8(&uncoded);
    debug!("mime = {:?}", mime);
    let encoding = if mime.starts_with("text/") {
    let mut detector = EncodingDetector::new();
    detector.feed(&uncoded, true);
    let encoding = detector.guess(None, true);
    debug!("guessed encoding = {:?}", encoding.name());
    let (_decoded, encoding, malformed) = encoding.decode(&uncoded);
    debug!("final encoding = {:?}", encoding.name());
    if !malformed {
    Some(Encoding(encoding))
    } else {
    warn!("text file was malformed");
    None
    }
    [5.351]
    [5.905]
    let init = buffer.len();
    self.read_file(&file, buffer)?;
    let mut detector = EncodingDetector::new();
    detector.feed(&buffer[init..], true);
    let encoding = detector.guess(None, true);
    let (_decoded, encoding, malformed) = encoding.decode(&buffer[init..]);
    Ok(if !malformed {
    Some(Encoding(encoding))
  • edit in libpijul/src/working_copy/mod.rs at line 43
    [5.922]
    [5.310]
    warn!("text file was malformed");
  • replacement in libpijul/src/working_copy/mod.rs at line 45
    [5.327][5.950:961](),[5.950][5.950:961](),[5.961][5.328:365](),[5.588][5.588:609](),[5.588][5.588:609]()
    };
    buffer.append(&mut uncoded);
    Ok(encoding)
    [5.327]
    [5.938]
    })
  • edit in libpijul/src/working_copy/filesystem.rs at line 341
    [4.91][4.91:92]()
  • edit in libpijul/Cargo.toml at line 119
    [5.730][2.51:72]()
    tree_magic = "0.2.0"
  • replacement in Cargo.lock at line 39
    [5.1031102][5.17171:17188]()
    "memchr 2.4.0",
    [5.1031102]
    [5.1031113]
    "memchr",
  • replacement in Cargo.lock at line 155
    [3.927][3.927:944]()
    "memchr 2.4.0",
    [3.927]
    [3.944]
    "memchr",
  • replacement in Cargo.lock at line 211
    [3.2563][3.2563:2580]()
    "memchr 2.4.0",
    [3.2563]
    [3.2580]
    "memchr",
  • replacement in Cargo.lock at line 569
    [3.11117][3.11117:11141]()
    "redox_syscall 0.2.9",
    [3.11117]
    [3.11141]
    "redox_syscall",
  • edit in Cargo.lock at line 572
    [3.11154][3.11154:11349]()
    [[package]]
    name = "fixedbitset"
    version = "0.2.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d"
  • replacement in Cargo.lock at line 718
    [3.15179][3.15179:15196]()
    "memchr 2.4.0",
    [3.15179]
    [3.15196]
    "memchr",
  • replacement in Cargo.lock at line 964
    [3.20839][3.20839:20856]()
    "memchr 2.4.0",
    [3.20839]
    [3.20856]
    "memchr",
  • edit in Cargo.lock at line 1092
    [3.23528][3.23528:23543]()
    "tree_magic",
  • edit in Cargo.lock at line 1142
    [3.24603][3.24603:24829]()
    version = "0.3.4"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75"
    dependencies = [
    "scopeguard",
    ]
    [[package]]
    name = "lock_api"
  • edit in Cargo.lock at line 1175
    [3.25700][3.25700:25918]()
    version = "1.0.2"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
    dependencies = [
    "libc",
    ]
    [[package]]
    name = "memchr"
  • edit in Cargo.lock at line 1264
    [3.28048][3.28048:28271]()
    ]
    [[package]]
    name = "nom"
    version = "3.2.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "05aec50c70fd288702bcd93284a8444607f3292dbdf2a30de5ea5dcdbe72287b"
    dependencies = [
    "memchr 1.0.2",
  • replacement in Cargo.lock at line 1321
    [3.29660][3.29660:29677]()
    "memchr 2.4.0",
    [3.29660]
    [3.29677]
    "memchr",
  • edit in Cargo.lock at line 1396
    [3.31510][3.31510:31771]()
    version = "0.10.2"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e"
    dependencies = [
    "lock_api 0.3.4",
    "parking_lot_core 0.7.2",
    ]
    [[package]]
    name = "parking_lot"
  • replacement in Cargo.lock at line 1401
    [3.31962][3.31962:32316]()
    "lock_api 0.4.4",
    "parking_lot_core 0.8.3",
    ]
    [[package]]
    name = "parking_lot_core"
    version = "0.7.2"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3"
    dependencies = [
    "cfg-if 0.1.10",
    "cloudabi",
    "libc",
    "redox_syscall 0.1.57",
    "smallvec",
    "winapi",
    [3.31962]
    [3.32316]
    "lock_api",
    "parking_lot_core",
  • replacement in Cargo.lock at line 1414
    [3.32573][3.32573:32597]()
    "redox_syscall 0.2.9",
    [3.32573]
    [3.32597]
    "redox_syscall",
  • edit in Cargo.lock at line 1441
    [3.33261][3.33261:33501]()
    name = "petgraph"
    version = "0.5.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7"
    dependencies = [
    "fixedbitset",
    "indexmap",
    ]
    [[package]]
  • edit in Cargo.lock at line 1682
    [3.39099][3.39099:39297]()
    [[package]]
    name = "redox_syscall"
    version = "0.1.57"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
  • replacement in Cargo.lock at line 1699
    [3.39758][3.39758:39782]()
    "redox_syscall 0.2.9",
    [3.39758]
    [3.39782]
    "redox_syscall",
  • replacement in Cargo.lock at line 1709
    [3.40007][3.40007:40024]()
    "memchr 2.4.0",
    [3.40007]
    [3.40024]
    "memchr",
  • replacement in Cargo.lock at line 1805
    [3.42171][3.42171:42194]()
    "parking_lot 0.11.1",
    [3.42171]
    [3.42194]
    "parking_lot",
  • replacement in Cargo.lock at line 2021
    [3.47581][3.47581:47605]()
    "redox_syscall 0.2.9",
    [3.47581]
    [3.47605]
    "redox_syscall",
  • replacement in Cargo.lock at line 2196
    [3.51531][3.51531:51548]()
    "memchr 2.4.0",
    [3.51531]
    [3.51548]
    "memchr",
  • edit in Cargo.lock at line 2277
    [3.53373][3.53373:53654]()
    name = "tree_magic"
    version = "0.2.3"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "b1d99367ce3e553a84738f73bd626ccca541ef90ae757fdcdc4cbe728e6cb629"
    dependencies = [
    "fnv",
    "lazy_static",
    "nom",
    "parking_lot 0.10.2",
    "petgraph",
    ]
    [[package]]