The web scrape functions
[?]
Jan 11, 2022, 12:29 AM
PQ4BG3ZJU5SY6XQDJ3SURLAOWGXGXLONIRZNUJXZVMHLYXWOORXQC
Dependencies
- [2] KUANIPWF Add function for adding name to database
- [3] AV73DYWQ Initial functions for using sqlite in async environment
- [4] RNW6D777 Minor tidy
- [*] HMOBTVJ4 Initialize crate and add expected dependencies
Change contents
- replacement in src/main.rs at line 7
  - let _db = names_database::AsyncConnection::open(
  + let db = names_database::AsyncConnection::open(
- file addition: gather.rs[6.15]
use std::borrow::Borrow;#[derive(PartialEq, Eq, Copy, Clone)]pub enum Gender {Masculine,Feminine,Neutral,}impl Gender {pub fn allow(&self, other: &Self) -> bool {match self {Self::Masculine => other != &Self::Feminine,Self::Feminine => other != &Self::Masculine,Self::Neutral => other == &Self::Neutral,}}}pub fn all_names(filter: Gender) -> tokio::sync::mpsc::Receiver<String> {let (tx, rx) = tokio::sync::mpsc::channel(256);let client = reqwest::Client::new();for initial in 'A'..='Z' {let tx = tx.clone();let client = client.clone();tokio::spawn(async move {let mut page = 0;loop {let results =fetch_name_page(&client, initial, page).await.unwrap();for (name, gender) in &results.names {if filter.allow(gender) {tx.send(name.clone()).await.unwrap();}}if results.has_next {page += 1;} else {break;}}});}rx}struct ResultPage {names: Vec<(String, Gender)>,has_next: bool,}async fn fetch_name_page(client: &reqwest::Client,letter: char,page: usize,) -> Result<ResultPage, Box<dyn std::error::Error>> {let page_header = format!("+Baby+Names+starting+with+{}", letter);let starts = String::from(letter);let mut form = vec![("advanced", "1"),("starts", starts.borrow()),("end", ""),("meaning", ""),("origin", ""),("nat", ""),("startswith", ""),("endswith", ""),("gender", ""),("cat", ""),("syl", ""),("page_header", page_header.borrow()),];let mut offset = String::new();match page {0 => {form.extend([("offset", "66"), ("offset", "66"), ("Next", "Previous")].iter(),);}1 => {form.extend([("offset", "0"), ("Next", "Next")].iter());}_ => {offset = format!("{}", (page - 1) * 66);form.extend([("offset", offset.borrow()),("offset", offset.borrow()),("Next", "Next"),].iter(),);}}let response = client.post("https://babynames.com/names/search.php").form(&form[..]).send().await?;StatusError::ensure_success(response.status())?;let text = response.text().await?;Ok(tokio::task::spawn_blocking(move || {let doc = scraper::Html::parse_document(text.borrow());let next_button = 
scraper::Selector::parse("input.next-btn").unwrap();let mut next_button = doc.select(&next_button);let has_next = match next_button.next() {None => false,Some(button) => match button.value().attr("type").map(str::trim) {Some("hidden") => true,_ => false,},};let name_selector =scraper::Selector::parse("ul.searchresults a").unwrap();let names = doc.select(&name_selector).map(|item| {let gender = match item.value().attr("class") {Some("M") => Gender::Masculine,Some("F") => Gender::Feminine,_ => Gender::Neutral,};let name: String = item.text().map(|s| s.trim()).collect();(name, gender)}).collect();ResultPage { names, has_next }}).await?)}pub struct StatusError(reqwest::StatusCode);impl StatusError {fn ensure_success(status: reqwest::StatusCode) -> Result<(), Self> {if status.is_success() {Ok(())} else {Err(Self(status))}}}impl std::error::Error for StatusError {}impl std::fmt::Debug for StatusError {fn fmt(&self,formatter: &mut std::fmt::Formatter<'_>,) -> Result<(), std::fmt::Error> {<reqwest::StatusCode as std::fmt::Debug>::fmt(&self.0, formatter)}}impl std::fmt::Display for StatusError {fn fmt(&self,formatter: &mut std::fmt::Formatter<'_>,) -> Result<(), std::fmt::Error> {<reqwest::StatusCode as std::fmt::Display>::fmt(&self.0, formatter)}}