Get the scraper part of this program working

[?]
Jan 12, 2022, 3:34 AM
LPVC545KD6R5KNOTDPFDOD5FVA3KFBZARKVOLLXJXW3YK3RMNZKQC

Dependencies

  • [2] YCWYAX6K Functions for scraping names integrated enough to test (they don't work yet)
  • [3] C376NCOV Prevent flooding the website (still need to better mimic browser requests)
  • [4] HMOBTVJ4 Initialize crate and add expected dependencies
  • [5] KUANIPWF Add function for adding name to database
  • [6] PQ4BG3ZJ The web scrape functions
  • [7] AV73DYWQ Initial functions for using sqlite in async environment
  • [8] RNW6D777 Minor tidy

Change contents

  • replacement in src/main.rs at line 16
    [2.216][2.216:244]()
    #[derive(Debug,Subcommand)]
    [2.216]
    [2.244]
    #[derive(Debug, Subcommand)]
  • replacement in src/main.rs at line 21
    [2.331][2.331:337]()
    }
    [2.331]
    [2.337]
    },
  • replacement in src/main.rs at line 27
    [2.370][4.0:52](),[4.301][4.0:52](),[4.52][2.371:394](),[2.394][4.1379:1398](),[4.1379][4.1379:1398]()
    let db = names_database::AsyncConnection::open(
    args.database,
    )
    .await?;
    [2.370]
    [2.395]
    let db = names_database::AsyncConnection::open(args.database).await?;
  • replacement in src/main.rs at line 32
    [2.444][2.444:534]()
    async fn run(&self, db: &AsyncConnection) -> Result<(), Box<dyn std::error::Error>> {
    [2.444]
    [4.1548]
    async fn run(
    &self,
    db: &AsyncConnection,
    ) -> Result<(), Box<dyn std::error::Error>> {
  • replacement in src/main.rs at line 37
    [4.1569][2.535:575]()
    Self::Gather{ gender } => {
    [4.1569]
    [2.575]
    Self::Gather { gender } => {
  • edit in src/gather.rs at line 1
    [4.87][4.88:113]()
    use std::borrow::Borrow;
  • edit in src/gather.rs at line 2
    [2.815]
    [4.113]
    use std::borrow::Borrow;
  • edit in src/gather.rs at line 23
    [4.611][4.611:652]()
    let client = reqwest::Client::new();
  • replacement in src/gather.rs at line 24
    [3.30][3.30:65](),[3.65][4.783:1123](),[4.783][4.783:1123]()
    for initial in 'A'..='Z' {
    let mut page = 0;
    loop {
    let results =
    fetch_name_page(&client, initial, page).await.unwrap();
    for (name, gender) in &results.names {
    if filter.allow(gender) {
    tx.send(name.clone()).await.unwrap();
    }
    [3.30]
    [4.1123]
    let client = reqwest::Client::builder().cookie_store(true).user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0").build().unwrap();
    let mut fetch = Fetch::start(&client);
    loop {
    dbg!(fetch.current_letter, &fetch.form);
    let (delay, page) = match fetch.next().await {
    Err(err) => {
    dbg!(err);
    (120, Vec::with_capacity(0))
  • replacement in src/gather.rs at line 33
    [4.1141][3.66:140](),[3.140][4.1141:1235](),[4.1141][4.1141:1235]()
    tokio::time::sleep(std::time::Duration::new(10,0)).await;
    if results.has_next {
    page += 1;
    } else {
    [4.1141]
    [4.1235]
    Ok(None) => {
  • edit in src/gather.rs at line 36
    [4.1280]
    [4.1280]
    Ok(Some(page)) => (5, page),
    };
    for (name, gender) in page {
    if filter.allow(&gender) {
    tx.send(name).await.unwrap();
    }
  • edit in src/gather.rs at line 43
    [4.1294]
    [3.141]
    tokio::time::sleep(std::time::Duration::new(delay, 0)).await;
  • replacement in src/gather.rs at line 49
    [4.1322][4.1322:1396]()
    struct ResultPage {
    names: Vec<(String, Gender)>,
    has_next: bool,
    [4.1322]
    [4.1396]
    struct Fetch<'a> {
    client: &'a reqwest::Client,
    current_letter: char,
    url: String,
    form: Option<Vec<(String, String)>>,
  • replacement in src/gather.rs at line 56
    [4.1399][4.1399:2008](),[4.2008][3.160:176](),[3.176][4.2044:2220](),[4.2044][4.2044:2220]()
    async fn fetch_name_page(
    client: &reqwest::Client,
    letter: char,
    page: usize,
    ) -> Result<ResultPage, Box<dyn std::error::Error>> {
    let page_header = format!("+Baby+Names+starting+with+{}", letter);
    let starts = String::from(letter);
    let mut form = vec![
    ("advanced", "1"),
    ("starts", starts.borrow()),
    ("end", ""),
    ("meaning", ""),
    ("origin", ""),
    ("nat", ""),
    ("startswith", ""),
    ("endswith", ""),
    ("gender", ""),
    ("cat", ""),
    ("syl", ""),
    ("page_header", page_header.borrow()),
    ];
    let offset;
    match page {
    0 => {
    form.extend(
    [("offset", "66"), ("offset", "66"), ("Next", "Previous")]
    .iter(),
    );
    [4.1399]
    [4.2220]
    impl<'a> Fetch<'a> {
    fn start(client: &'a reqwest::Client) -> Self {
    Self {
    client,
    current_letter: 'A',
    url: String::from("https://babynames.com/names/A"),
    form: None,
  • replacement in src/gather.rs at line 64
    [4.2230][4.2230:2314]()
    1 => {
    form.extend([("offset", "0"), ("Next", "Next")].iter());
    [4.2230]
    [4.2314]
    }
    async fn next(
    &mut self,
    ) -> Result<Option<Vec<(String, Gender)>>, Box<dyn std::error::Error>> {
    if self.current_letter > 'Z' {
    return Ok(None);
  • replacement in src/gather.rs at line 72
    [4.2324][4.2324:2629]()
    _ => {
    offset = format!("{}", (page - 1) * 66);
    form.extend(
    [
    ("offset", offset.borrow()),
    ("offset", offset.borrow()),
    ("Next", "Next"),
    ]
    .iter(),
    );
    [4.2324]
    [4.2629]
    let response = match &self.form {
    None => self.client.get(&self.url),
    Some(fields) => self.client.post(&self.url).form(fields),
  • edit in src/gather.rs at line 76
    [4.2639][4.2639:2752]()
    }
    let response = client
    .post("https://babynames.com/names/search.php")
    .form(&form[..])
  • replacement in src/gather.rs at line 78
    [4.2785][4.2785:3763]()
    StatusError::ensure_success(response.status())?;
    let text = response.text().await?;
    Ok(tokio::task::spawn_blocking(move || {
    let doc = scraper::Html::parse_document(text.borrow());
    let next_button = scraper::Selector::parse("input.next-btn").unwrap();
    let mut next_button = doc.select(&next_button);
    let has_next = match next_button.next() {
    None => false,
    Some(button) => match button.value().attr("type").map(str::trim) {
    Some("hidden") => true,
    _ => false,
    },
    };
    let name_selector =
    scraper::Selector::parse("ul.searchresults a").unwrap();
    let names = doc
    .select(&name_selector)
    .map(|item| {
    let gender = match item.value().attr("class") {
    Some("M") => Gender::Masculine,
    Some("F") => Gender::Feminine,
    _ => Gender::Neutral,
    [4.2785]
    [4.3763]
    StatusError::ensure_success(response.status())?;
    let text = response.text().await?;
    let (names, next_url, form) = tokio::task::spawn_blocking(move || {
    let doc = scraper::Html::parse_document(text.borrow());
    let next_button =
    scraper::Selector::parse("input.next-btn").unwrap();
    let mut next_button = doc.select(&next_button);
    let has_next = match next_button.next() {
    None => false,
    Some(_) => true,
    };
    let name_selector =
    scraper::Selector::parse("ul.searchresults a").unwrap();
    let names = doc
    .select(&name_selector)
    .map(|item| {
    let gender = match item.value().attr("class") {
    Some("M") => Gender::Masculine,
    Some("F") => Gender::Feminine,
    _ => Gender::Neutral,
    };
    let name: String = item.text().map(|s| s.trim()).collect();
    (name, gender)
    })
    .collect();
    let stepper =
    scraper::Selector::parse("div.next-previous form").unwrap();
    match doc.select(&stepper).next() {
    Some(stepper) => {
    let action = stepper.value().attr("action");
    let form = scraper::Selector::parse("input").unwrap();
    let form = stepper
    .select(&form)
    .filter_map(|input| {
    let input = input.value();
    if input.attr("type") == Some("submit")
    && input.attr("class") != Some("next-btn")
    {
    None
    } else {
    Some((
    input.attr("name").map(String::from)?,
    input.attr("value").map(String::from)?,
    ))
    }
    })
    .collect();
    if has_next {
    (names, action.map(|url| {
    if url.starts_with("http") {
    String::from(url)
    } else {
    format!("https://babynames.com{}", url)
    }
    }), Some(form))
    } else {
    (names, None, None)
    }
    }
    None => (names, None, None),
    }
    })
    .await?;
    match form {
    None => {
    self.current_letter = unsafe {
    char::from_u32_unchecked(self.current_letter as u32 + 1)
  • replacement in src/gather.rs at line 146
    [4.3782][4.3782:3987]()
    let name: String = item.text().map(|s| s.trim()).collect();
    (name, gender)
    })
    .collect();
    ResultPage { names, has_next }
    })
    .await?)
    [4.3782]
    [4.3987]
    self.form = None;
    }
    Some(form) => {
    self.form = Some(form);
    }
    }
    self.url = match next_url {
    None => {
    format!("https://babynames.com/names/{}", self.current_letter)
    }
    Some(url) => url,
    };
    Ok(Some(names))
    }
  • replacement in Cargo.toml at line 10
    [2.923][4.309:329](),[4.309][4.309:329]()
    reqwest = "^0.11.8"
    [2.923]
    [4.329]
    reqwest = { version = "^0.11.8", features = ["default","cookies"] }
  • edit in Cargo.lock at line 23
    [4.777]
    [4.777]
    name = "base-x"
    version = "0.2.8"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "a4521f3e3d031370679b3b140beb36dfe4801b09ac77e30c61941f97df3ef28b"
    [[package]]
  • edit in Cargo.lock at line 99
    [2.1763]
    [2.1763]
    [[package]]
    name = "const_fn"
    version = "0.4.9"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935"
  • edit in Cargo.lock at line 113
    [4.2306]
    [4.2306]
    name = "cookie"
    version = "0.15.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "d5f1c7727e460397e56abc4bddc1d49e07a1ad78fc98eb2e1c8f032a58a2f80d"
    dependencies = [
    "percent-encoding",
    "time",
    "version_check",
    ]
    [[package]]
    name = "cookie_store"
    version = "0.15.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "b3f7034c0932dc36f5bd8ec37368d971346809435824f277cb3b8299fc56167c"
    dependencies = [
    "cookie",
    "idna",
    "log",
    "publicsuffix",
    "serde",
    "serde_json",
    "time",
    "url",
    ]
    [[package]]
  • replacement in Cargo.lock at line 191
    [4.3572][4.3572:3590]()
    "rustc_version",
    [4.3572]
    [4.3590]
    "rustc_version 0.4.0",
  • edit in Cargo.lock at line 196
    [4.3613]
    [4.3613]
    name = "discard"
    version = "1.0.4"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0"
    [[package]]
  • edit in Cargo.lock at line 886
    [4.19430]
    [4.19430]
    name = "psl-types"
    version = "2.0.10"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "e8eda7c62d9ecaafdf8b62374c006de0adf61666ae96a96ba74a37134aa4e470"
    [[package]]
    name = "publicsuffix"
    version = "2.1.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "292972edad6bbecc137ab84c5e36421a4a6c979ea31d3cc73540dd04315b33e1"
    dependencies = [
    "byteorder",
    "hashbrown",
    "idna",
    "psl-types",
    ]
    [[package]]
  • edit in Cargo.lock at line 1029
    [4.22570]
    [4.22570]
    "cookie",
    "cookie_store",
  • edit in Cargo.lock at line 1046
    [4.22778]
    [4.22778]
    "proc-macro-hack",
  • edit in Cargo.lock at line 1061
    [4.22966]
    [4.22966]
    version = "0.2.3"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
    dependencies = [
    "semver 0.9.0",
    ]
    [[package]]
    name = "rustc_version"
  • replacement in Cargo.lock at line 1074
    [4.23144][4.23144:23155]()
    "semver",
    [4.23144]
    [4.23155]
    "semver 1.0.4",
  • edit in Cargo.lock at line 1160
    [4.25067]
    [4.25067]
    version = "0.9.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
    dependencies = [
    "semver-parser",
    ]
    [[package]]
    name = "semver"
  • edit in Cargo.lock at line 1172
    [4.25228]
    [4.25228]
    [[package]]
    name = "semver-parser"
    version = "0.7.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
  • edit in Cargo.lock at line 1184
    [4.25419]
    [4.25419]
    dependencies = [
    "serde_derive",
    ]
    [[package]]
    name = "serde_derive"
    version = "1.0.132"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "ecc0db5cb2556c0e558887d9bbdcf6ac4471e83ff66cf696e5419024d1606276"
    dependencies = [
    "proc-macro2",
    "quote",
    "syn",
    ]
  • edit in Cargo.lock at line 1231
    [4.26184]
    [4.26184]
    [[package]]
    name = "sha1"
    version = "0.6.0"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d"
  • edit in Cargo.lock at line 1303
    [4.27913]
    [4.27913]
    name = "standback"
    version = "0.2.17"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff"
    dependencies = [
    "version_check",
    ]
    [[package]]
    name = "stdweb"
    version = "0.4.20"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5"
    dependencies = [
    "discard",
    "rustc_version 0.2.3",
    "stdweb-derive",
    "stdweb-internal-macros",
    "stdweb-internal-runtime",
    "wasm-bindgen",
    ]
    [[package]]
    name = "stdweb-derive"
    version = "0.5.3"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef"
    dependencies = [
    "proc-macro2",
    "quote",
    "serde",
    "serde_derive",
    "syn",
    ]
    [[package]]
    name = "stdweb-internal-macros"
    version = "0.2.9"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11"
    dependencies = [
    "base-x",
    "proc-macro2",
    "quote",
    "serde",
    "serde_derive",
    "serde_json",
    "sha1",
    "syn",
    ]
    [[package]]
    name = "stdweb-internal-runtime"
    version = "0.1.5"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"
    [[package]]
  • edit in Cargo.lock at line 1448
    [4.29475]
    [4.29475]
    [[package]]
    name = "time"
    version = "0.2.27"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242"
    dependencies = [
    "const_fn",
    "libc",
    "standback",
    "stdweb",
    "time-macros",
    "version_check",
    "winapi",
    ]
    [[package]]
    name = "time-macros"
    version = "0.1.1"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1"
    dependencies = [
    "proc-macro-hack",
    "time-macros-impl",
    ]
  • edit in Cargo.lock at line 1475
    [4.29488]
    [4.29488]
    name = "time-macros-impl"
    version = "0.1.2"
    source = "registry+https://github.com/rust-lang/crates.io-index"
    checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f"
    dependencies = [
    "proc-macro-hack",
    "proc-macro2",
    "quote",
    "standback",
    "syn",
    ]
    [[package]]