Get the scraper part of this program working
[?]
Jan 12, 2022, 3:34 AM
LPVC545KD6R5KNOTDPFDOD5FVA3KFBZARKVOLLXJXW3YK3RMNZKQC
Dependencies
- [2] YCWYAX6K Functions for scraping names integrated enough to test (they don't work yet)
- [3] C376NCOV Prevent flooding the website (still need to better mimic browser requests)
- [4] HMOBTVJ4 Initialize crate and add expected dependencies
- [5] KUANIPWF Add function for adding name to database
- [6] PQ4BG3ZJ The web scrape functions
- [7] AV73DYWQ Initial functions for using sqlite in async environment
- [8] RNW6D777 Minor tidy
Change contents
- replacement in src/main.rs at line 16
#[derive(Debug,Subcommand)]#[derive(Debug, Subcommand)] - replacement in src/main.rs at line 21
}}, - replacement in src/main.rs at line 27[2.370]→[4.0:52](∅→∅),[4.301]→[4.0:52](∅→∅),[4.52]→[2.371:394](∅→∅),[2.394]→[4.1379:1398](∅→∅),[4.1379]→[4.1379:1398](∅→∅)
let db = names_database::AsyncConnection::open(args.database,).await?;let db = names_database::AsyncConnection::open(args.database).await?; - replacement in src/main.rs at line 32
async fn run(&self, db: &AsyncConnection) -> Result<(), Box<dyn std::error::Error>> {async fn run(&self,db: &AsyncConnection,) -> Result<(), Box<dyn std::error::Error>> { - replacement in src/main.rs at line 37
Self::Gather{ gender } => {Self::Gather { gender } => { - edit in src/gather.rs at line 1
use std::borrow::Borrow; - edit in src/gather.rs at line 2
use std::borrow::Borrow; - edit in src/gather.rs at line 23
let client = reqwest::Client::new(); - replacement in src/gather.rs at line 24
for initial in 'A'..='Z' {let mut page = 0;loop {let results =fetch_name_page(&client, initial, page).await.unwrap();for (name, gender) in &results.names {if filter.allow(gender) {tx.send(name.clone()).await.unwrap();}let client = reqwest::Client::builder().cookie_store(true).user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0").build().unwrap();let mut fetch = Fetch::start(&client);loop {dbg!(fetch.current_letter, &fetch.form);let (delay, page) = match fetch.next().await {Err(err) => {dbg!(err);(120, Vec::with_capacity(0)) - replacement in src/gather.rs at line 33
tokio::time::sleep(std::time::Duration::new(10,0)).await;if results.has_next {page += 1;} else {Ok(None) => { - edit in src/gather.rs at line 36
Ok(Some(page)) => (5, page),};for (name, gender) in page {if filter.allow(&gender) {tx.send(name).await.unwrap();} - edit in src/gather.rs at line 43
tokio::time::sleep(std::time::Duration::new(delay, 0)).await; - replacement in src/gather.rs at line 49
struct ResultPage {names: Vec<(String, Gender)>,has_next: bool,struct Fetch<'a> {client: &'a reqwest::Client,current_letter: char,url: String,form: Option<Vec<(String, String)>>, - replacement in src/gather.rs at line 56[4.1399]→[4.1399:2008](∅→∅),[4.2008]→[3.160:176](∅→∅),[3.176]→[4.2044:2220](∅→∅),[4.2044]→[4.2044:2220](∅→∅)
async fn fetch_name_page(client: &reqwest::Client,letter: char,page: usize,) -> Result<ResultPage, Box<dyn std::error::Error>> {let page_header = format!("+Baby+Names+starting+with+{}", letter);let starts = String::from(letter);let mut form = vec![("advanced", "1"),("starts", starts.borrow()),("end", ""),("meaning", ""),("origin", ""),("nat", ""),("startswith", ""),("endswith", ""),("gender", ""),("cat", ""),("syl", ""),("page_header", page_header.borrow()),];let offset;match page {0 => {form.extend([("offset", "66"), ("offset", "66"), ("Next", "Previous")].iter(),);impl<'a> Fetch<'a> {fn start(client: &'a reqwest::Client) -> Self {Self {client,current_letter: 'A',url: String::from("https://babynames.com/names/A"),form: None, - replacement in src/gather.rs at line 64
1 => {form.extend([("offset", "0"), ("Next", "Next")].iter());}async fn next(&mut self,) -> Result<Option<Vec<(String, Gender)>>, Box<dyn std::error::Error>> {if self.current_letter > 'Z' {return Ok(None); - replacement in src/gather.rs at line 72
_ => {offset = format!("{}", (page - 1) * 66);form.extend([("offset", offset.borrow()),("offset", offset.borrow()),("Next", "Next"),].iter(),);let response = match &self.form {None => self.client.get(&self.url),Some(fields) => self.client.post(&self.url).form(fields), - edit in src/gather.rs at line 76
}let response = client.post("https://babynames.com/names/search.php").form(&form[..]) - replacement in src/gather.rs at line 78
StatusError::ensure_success(response.status())?;let text = response.text().await?;Ok(tokio::task::spawn_blocking(move || {let doc = scraper::Html::parse_document(text.borrow());let next_button = scraper::Selector::parse("input.next-btn").unwrap();let mut next_button = doc.select(&next_button);let has_next = match next_button.next() {None => false,Some(button) => match button.value().attr("type").map(str::trim) {Some("hidden") => true,_ => false,},};let name_selector =scraper::Selector::parse("ul.searchresults a").unwrap();let names = doc.select(&name_selector).map(|item| {let gender = match item.value().attr("class") {Some("M") => Gender::Masculine,Some("F") => Gender::Feminine,_ => Gender::Neutral,StatusError::ensure_success(response.status())?;let text = response.text().await?;let (names, next_url, form) = tokio::task::spawn_blocking(move || {let doc = scraper::Html::parse_document(text.borrow());let next_button =scraper::Selector::parse("input.next-btn").unwrap();let mut next_button = doc.select(&next_button);let has_next = match next_button.next() {None => false,Some(_) => true,};let name_selector =scraper::Selector::parse("ul.searchresults a").unwrap();let names = doc.select(&name_selector).map(|item| {let gender = match item.value().attr("class") {Some("M") => Gender::Masculine,Some("F") => Gender::Feminine,_ => Gender::Neutral,};let name: String = item.text().map(|s| s.trim()).collect();(name, gender)}).collect();let stepper =scraper::Selector::parse("div.next-previous form").unwrap();match doc.select(&stepper).next() {Some(stepper) => {let action = stepper.value().attr("action");let form = scraper::Selector::parse("input").unwrap();let form = stepper.select(&form).filter_map(|input| {let input = input.value();if input.attr("type") == Some("submit")&& input.attr("class") != Some("next-btn"){None} else {Some((input.attr("name").map(String::from)?,input.attr("value").map(String::from)?,))}}).collect();if has_next {(names, action.map(|url| {if 
url.starts_with("http") {String::from(url)} else {format!("https://babynames.com{}", url)}}), Some(form))} else {(names, None, None)}}None => (names, None, None),}}).await?;match form {None => {self.current_letter = unsafe {char::from_u32_unchecked(self.current_letter as u32 + 1) - replacement in src/gather.rs at line 146
let name: String = item.text().map(|s| s.trim()).collect();(name, gender)}).collect();ResultPage { names, has_next }}).await?)self.form = None;}Some(form) => {self.form = Some(form);}}self.url = match next_url {None => {format!("https://babynames.com/names/{}", self.current_letter)}Some(url) => url,};Ok(Some(names))} - replacement in Cargo.toml at line 10
reqwest = "^0.11.8"reqwest = { version = "^0.11.8", features = ["default","cookies"] } - edit in Cargo.lock at line 23
name = "base-x"version = "0.2.8"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "a4521f3e3d031370679b3b140beb36dfe4801b09ac77e30c61941f97df3ef28b"[[package]] - edit in Cargo.lock at line 99
[[package]]name = "const_fn"version = "0.4.9"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" - edit in Cargo.lock at line 113
name = "cookie"version = "0.15.1"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "d5f1c7727e460397e56abc4bddc1d49e07a1ad78fc98eb2e1c8f032a58a2f80d"dependencies = ["percent-encoding","time","version_check",][[package]]name = "cookie_store"version = "0.15.1"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "b3f7034c0932dc36f5bd8ec37368d971346809435824f277cb3b8299fc56167c"dependencies = ["cookie","idna","log","publicsuffix","serde","serde_json","time","url",][[package]] - replacement in Cargo.lock at line 191
"rustc_version","rustc_version 0.4.0", - edit in Cargo.lock at line 196
name = "discard"version = "1.0.4"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0"[[package]] - edit in Cargo.lock at line 886
name = "psl-types"version = "2.0.10"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "e8eda7c62d9ecaafdf8b62374c006de0adf61666ae96a96ba74a37134aa4e470"[[package]]name = "publicsuffix"version = "2.1.1"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "292972edad6bbecc137ab84c5e36421a4a6c979ea31d3cc73540dd04315b33e1"dependencies = ["byteorder","hashbrown","idna","psl-types",][[package]] - edit in Cargo.lock at line 1029
"cookie","cookie_store", - edit in Cargo.lock at line 1046
"proc-macro-hack", - edit in Cargo.lock at line 1061
version = "0.2.3"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"dependencies = ["semver 0.9.0",][[package]]name = "rustc_version" - replacement in Cargo.lock at line 1074
"semver","semver 1.0.4", - edit in Cargo.lock at line 1160
version = "0.9.0"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"dependencies = ["semver-parser",][[package]]name = "semver" - edit in Cargo.lock at line 1172
[[package]]name = "semver-parser"version = "0.7.0"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - edit in Cargo.lock at line 1184
dependencies = ["serde_derive",][[package]]name = "serde_derive"version = "1.0.132"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "ecc0db5cb2556c0e558887d9bbdcf6ac4471e83ff66cf696e5419024d1606276"dependencies = ["proc-macro2","quote","syn",] - edit in Cargo.lock at line 1231
[[package]]name = "sha1"version = "0.6.0"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "2579985fda508104f7587689507983eadd6a6e84dd35d6d115361f530916fa0d" - edit in Cargo.lock at line 1303
name = "standback"version = "0.2.17"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff"dependencies = ["version_check",][[package]]name = "stdweb"version = "0.4.20"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5"dependencies = ["discard","rustc_version 0.2.3","stdweb-derive","stdweb-internal-macros","stdweb-internal-runtime","wasm-bindgen",][[package]]name = "stdweb-derive"version = "0.5.3"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef"dependencies = ["proc-macro2","quote","serde","serde_derive","syn",][[package]]name = "stdweb-internal-macros"version = "0.2.9"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11"dependencies = ["base-x","proc-macro2","quote","serde","serde_derive","serde_json","sha1","syn",][[package]]name = "stdweb-internal-runtime"version = "0.1.5"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"[[package]] - edit in Cargo.lock at line 1448
[[package]]name = "time"version = "0.2.27"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242"dependencies = ["const_fn","libc","standback","stdweb","time-macros","version_check","winapi",][[package]]name = "time-macros"version = "0.1.1"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1"dependencies = ["proc-macro-hack","time-macros-impl",] - edit in Cargo.lock at line 1475
name = "time-macros-impl"version = "0.1.2"source = "registry+https://github.com/rust-lang/crates.io-index"checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f"dependencies = ["proc-macro-hack","proc-macro2","quote","standback","syn",][[package]]