The web scrape functions

[?]
Jan 11, 2022, 12:29 AM
PQ4BG3ZJU5SY6XQDJ3SURLAOWGXGXLONIRZNUJXZVMHLYXWOORXQC

Dependencies

  • [2] KUANIPWF Add function for adding name to database
  • [3] AV73DYWQ Initial functions for using sqlite in async environment
  • [4] RNW6D777 Minor tidy
  • [*] HMOBTVJ4 Initialize crate and add expected dependencies

Change contents

  • replacement in src/main.rs at line 7
    [3.301][2.1272:1325]()
    let _db = names_database::AsyncConnection::open(
    [3.301]
    [2.1325]
    let db = names_database::AsyncConnection::open(
  • file addition: gather.rs (----------)
    [6.15]
    use std::borrow::Borrow;
    /// Gender associated with a name, used both to tag scraped names and as a
    /// filter when selecting them.
    #[derive(Debug, PartialEq, Eq, Copy, Clone)]
    pub enum Gender {
        Masculine,
        Feminine,
        Neutral,
    }

    impl Gender {
        /// Returns `true` when a name tagged `other` passes the filter `self`.
        ///
        /// * `Masculine` accepts everything except `Feminine`.
        /// * `Feminine` accepts everything except `Masculine`.
        /// * `Neutral` accepts only `Neutral`.
        pub fn allow(&self, other: &Self) -> bool {
            match self {
                Self::Masculine => !matches!(other, Self::Feminine),
                Self::Feminine => !matches!(other, Self::Masculine),
                Self::Neutral => matches!(other, Self::Neutral),
            }
        }
    }
    pub fn all_names(filter: Gender) -> tokio::sync::mpsc::Receiver<String> {
    let (tx, rx) = tokio::sync::mpsc::channel(256);
    let client = reqwest::Client::new();
    for initial in 'A'..='Z' {
    let tx = tx.clone();
    let client = client.clone();
    tokio::spawn(async move {
    let mut page = 0;
    loop {
    let results =
    fetch_name_page(&client, initial, page).await.unwrap();
    for (name, gender) in &results.names {
    if filter.allow(gender) {
    tx.send(name.clone()).await.unwrap();
    }
    }
    if results.has_next {
    page += 1;
    } else {
    break;
    }
    }
    });
    }
    rx
    }
    /// One parsed page of search results, as produced by `fetch_name_page`.
    struct ResultPage {
    /// Each name found on the page, paired with the gender derived from the
    /// `class` attribute of its link.
    names: Vec<(String, Gender)>,
    /// Whether a further page should be requested; `all_names` keeps paging
    /// while this is `true`.
    has_next: bool,
    }
    /// Fetches and parses one page of names beginning with `letter`.
    ///
    /// `page` is zero-based. The search form is posted to
    /// `https://babynames.com/names/search.php`; a non-success HTTP status is
    /// returned as a `StatusError`, and transport or task-join failures are
    /// propagated as boxed errors.
    async fn fetch_name_page(
        client: &reqwest::Client,
        letter: char,
        page: usize,
    ) -> Result<ResultPage, Box<dyn std::error::Error>> {
        const PAGE_SIZE: usize = 66;

        // Pager fields: the offset value, how many times the `offset` field is
        // submitted, and which pager button is named in `Next`.
        // NOTE(review): presumably this mirrors what the site's own pager form
        // submits (page 0 is reached by going "Previous" from offset 66) —
        // confirm against the site.
        let (offset, offset_repeats, direction) = match page {
            0 => (PAGE_SIZE, 2, "Previous"),
            1 => (0, 1, "Next"),
            later => ((later - 1) * PAGE_SIZE, 2, "Next"),
        };
        let offset = offset.to_string();
        let offset_value: &str = offset.borrow();

        let starts = letter.to_string();
        let starts_value: &str = starts.borrow();
        let page_header = format!("+Baby+Names+starting+with+{}", letter);
        let page_header_value: &str = page_header.borrow();

        let mut form = vec![
            ("advanced", "1"),
            ("starts", starts_value),
            ("end", ""),
            ("meaning", ""),
            ("origin", ""),
            ("nat", ""),
            ("startswith", ""),
            ("endswith", ""),
            ("gender", ""),
            ("cat", ""),
            ("syl", ""),
            ("page_header", page_header_value),
        ];
        form.extend(std::iter::repeat(("offset", offset_value)).take(offset_repeats));
        form.push(("Next", direction));

        let response = client
            .post("https://babynames.com/names/search.php")
            .form(form.as_slice())
            .send()
            .await?;
        StatusError::ensure_success(response.status())?;
        let text = response.text().await?;

        // HTML parsing is CPU-bound, so it runs on the blocking thread pool.
        let parsed =
            tokio::task::spawn_blocking(move || parse_result_page(text.borrow())).await?;
        Ok(parsed)
    }

    /// Extracts the names and the "has next page" flag from a results page.
    ///
    /// `has_next` is `true` only when the first `input.next-btn` element has a
    /// `type` attribute equal to `hidden` after trimming. Links inside
    /// `ul.searchresults` become names; a `class` of exactly `M` or `F` maps to
    /// masculine or feminine, and anything else is treated as neutral.
    fn parse_result_page(html: &str) -> ResultPage {
        let document = scraper::Html::parse_document(html);

        let next_selector = scraper::Selector::parse("input.next-btn").unwrap();
        let has_next = document
            .select(&next_selector)
            .next()
            .map_or(false, |button| {
                matches!(
                    button.value().attr("type").map(str::trim),
                    Some("hidden")
                )
            });

        let link_selector = scraper::Selector::parse("ul.searchresults a").unwrap();
        let mut names = Vec::new();
        for link in document.select(&link_selector) {
            let gender = match link.value().attr("class") {
                Some("M") => Gender::Masculine,
                Some("F") => Gender::Feminine,
                _ => Gender::Neutral,
            };
            let name = link.text().map(str::trim).collect::<String>();
            names.push((name, gender));
        }

        ResultPage { names, has_next }
    }
    /// Error carrying an HTTP status code that did not indicate success.
    pub struct StatusError(reqwest::StatusCode);

    impl StatusError {
        /// Returns `Ok(())` when `status.is_success()` holds, and otherwise an
        /// error wrapping `status`.
        fn ensure_success(status: reqwest::StatusCode) -> Result<(), Self> {
            if !status.is_success() {
                return Err(Self(status));
            }
            Ok(())
        }
    }
    // Marker impl: lets `?` convert a `StatusError` into the
    // `Box<dyn std::error::Error>` returned by `fetch_name_page`.
    impl std::error::Error for StatusError {}
    impl std::fmt::Debug for StatusError {
    fn fmt(
    &self,
    formatter: &mut std::fmt::Formatter<'_>,
    ) -> Result<(), std::fmt::Error> {
    <reqwest::StatusCode as std::fmt::Debug>::fmt(&self.0, formatter)
    }
    }
    impl std::fmt::Display for StatusError {
    fn fmt(
    &self,
    formatter: &mut std::fmt::Formatter<'_>,
    ) -> Result<(), std::fmt::Error> {
    <reqwest::StatusCode as std::fmt::Display>::fmt(&self.0, formatter)
    }
    }