LHETILNNYVQ27JCR5WGRAPDRQBFORK77UDJDMAZ4E47PC7ZPS7RAC
use std::{io::Cursor, str::SplitWhitespace};
extern crate murmur3;
extern crate ndarray;
extern crate unicode_normalization;
use murmur3::murmur3_32;
use ndarray::Array;
use unicode_normalization::UnicodeNormalization;
#[derive(Debug, Clone)]
pub struct HashingVectorizer {
n_min: usize,
n_max: usize,
n_samples: usize,
}
impl HashingVectorizer {
pub fn new(n_samples: usize, n_min: usize, n_max: usize) -> Self {
Self {
n_samples,
n_min,
n_max,
}
}
pub fn fit_transform(&self, input: &[&str]) {
assert!(input.len() != 0);
let mut target_arr: Array<f64, _> = ndarray::Array::zeros((input.len(), self.n_samples));
for (idx, &s) in input.iter().enumerate() {
for mut ng in NGramIter::new(&normalize(s), 5) {
let h = murmur3_32(&mut Cursor::new(ng), 0).unwrap();
}
}
}
}
/// Decomposes `input` with Unicode NFKD and keeps only the ASCII characters
/// of the result, dropping everything else.
fn normalize(input: &str) -> String {
    let mut ascii_only = String::new();
    for ch in input.nfkd() {
        if ch.is_ascii() {
            ascii_only.push(ch);
        }
    }
    ascii_only
}
/// Iterator over ngrams analogous to those used by scikit-learn's HashingVectorizer
///
/// The source text is split on whitespace and every word is padded with one
/// space on each side. Each padded word is then walked with a sliding window
/// of `size` characters that advances one character at a time. A padded word
/// with at most `size` characters is yielded exactly once, in full.
///
/// Window sizes are measured in Unicode scalar values (`char`s), not bytes,
/// so words containing multibyte characters produce n-grams of the same
/// character length as ASCII words.
#[derive(Debug, Clone)]
struct NGramIter<'a> {
    /// Remaining whitespace-separated words of the input.
    source: SplitWhitespace<'a>,
    /// The space-padded word currently being windowed, if any.
    cur_word: Option<String>,
    /// Window width in characters; never zero.
    size: usize,
    /// Byte offset into `cur_word` where the next n-gram starts; always on a
    /// char boundary.
    pos: usize,
}

impl<'a> NGramIter<'a> {
    /// Creates an iterator over the `size`-character n-grams of `source`.
    ///
    /// # Panics
    ///
    /// Panics if `size` is zero.
    pub fn new(source: &'a str, size: usize) -> Self {
        assert!(size != 0);
        Self {
            source: source.split_whitespace(),
            cur_word: None,
            size,
            pos: 0,
        }
    }
}

impl<'a> Iterator for NGramIter<'a> {
    type Item = String;

    fn next(&mut self) -> Option<Self::Item> {
        // Load and pad the next word once the previous one is used up.
        if self.cur_word.is_none() {
            self.cur_word = self.source.next().map(|w| format!(" {} ", w));
            self.pos = 0;
        }
        let word = self.cur_word.as_ref()?;
        let rest = &word[self.pos..];

        // Byte offset of the char *after* the window. If there is no such
        // char, the window reaches the end of the word and this is its last
        // n-gram.
        let (gram_len, is_last) = match rest.char_indices().nth(self.size) {
            Some((offset, _)) => (offset, false),
            None => (rest.len(), true),
        };

        // Advance by one whole character so `pos` stays on a char boundary.
        let step = rest.chars().next().map_or(0, char::len_utf8);
        let gram = rest[..gram_len].to_owned();

        if is_last {
            self.cur_word = None;
        } else {
            self.pos += step;
        }
        Some(gram)
    }
}
#[cfg(test)]
mod tests {
    use crate::NGramIter;
    use std::collections::HashSet;

    /// 5-grams of a short sentence: every word is padded with spaces and
    /// words shorter than the window appear once, in full.
    #[test]
    fn five_gram_tests() {
        let sentence = "this is a test!";
        let produced: HashSet<String> = NGramIter::new(sentence, 5).collect();

        let mut wanted: HashSet<String> = HashSet::new();
        for gram in [" this", "this ", " is ", " a ", " test", "test!", "est! "].iter() {
            wanted.insert((*gram).to_owned());
        }

        assert_eq!(wanted, produced);
    }
}
[package]
name = "ml-preproc"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
murmur3 = "0.5"
ndarray = "0.15"
unicode-normalization ="0.1"
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "matrixmultiply"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a8a15b776d9dfaecd44b03c5828c2199cddff5247215858aac14624f8d6b741"
dependencies = [
"rawpointer",
]
[[package]]
name = "ml-preproc"
version = "0.1.0"
dependencies = [
"murmur3",
"ndarray",
"unicode-normalization",
]
[[package]]
name = "murmur3"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ead5388e485d38e622630c6b05afd3761a6701ff15c55b279ea5b31dcb62cff"
[[package]]
name = "ndarray"
version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08e854964160a323e65baa19a0b1a027f76d590faba01f05c0cbc3187221a8c9"
dependencies = [
"matrixmultiply",
"num-complex",
"num-integer",
"num-traits",
"rawpointer",
]
[[package]]
name = "num-complex"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085"
dependencies = [
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
dependencies = [
"autocfg",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
[[package]]
name = "tinyvec"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "unicode-normalization"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"
dependencies = [
"tinyvec",
]
target/