let mut pending: HashSet<Url> = if confirmed.is_empty() {
    info!("starting from root, with no cache");
    iter::once("https://mediabiasfactcheck.com/".try_into()?).collect()
} else {
    let mut new = HashSet::new();
    confirmed
        .values()
        .map(|v| {
            find_new_links(
                std::str::from_utf8(&v.data).expect("nonutf8"),
                &confirmed,
                &mut new,
            )
        })
        .for_each(|_| ());
    info!("scraping through {:?}", new);
    new
};
let mut pending: HashSet<Url> = {
    let confirmed = confirmed.read();
    if confirmed.is_empty() {
        info!("starting from root, with no cache");
        iter::once("https://mediabiasfactcheck.com/".try_into()?).collect()
    } else {
        let mut new = HashSet::new();
        confirmed
            .values()
            .map(|v| {
                if let Ok(utf8) = std::str::from_utf8(&v.data) {
                    if let Err(e) = find_new_links(utf8, &confirmed, &mut new) {
                        warn!("error finding links on {}: {}", v.final_url, e);
                    }
                }
            })
            .for_each(|_| ());
        info!("scraping through {:?}", new);
        new
    }
};
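None of these snippets show what's actually behind `confirmed`. The bare `.read()`/`.write()` calls with no `.unwrap()` suggest a `parking_lot`-style `RwLock` shared through an `Arc`, and the field accesses imply a small cached-page struct. A sketch of the shapes the rest of the code seems to assume (the exact definitions aren't in the post):

use std::{collections::HashMap, sync::Arc};

use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use url::Url; // with the "serde" feature, so the cache can round-trip through bincode

// One fetched page, as cached between runs; fields inferred from how they're used.
#[derive(Serialize, Deserialize)]
struct PageCandidate {
    final_url: Url, // the URL the request actually resolved to, after redirects
    data: Vec<u8>,  // raw response body, parsed as UTF-8 when extracting links
}

// Shared crawl state: original URL -> fetched page.
type Confirmed = Arc<RwLock<HashMap<Url, PageCandidate>>>;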
while let Some(next_url) = pending.iter().cloned().next() {
    pending.remove(&next_url);
while let Some(next_url) = pending_rx.recv().await {
    let cl = cl.clone();
    let confirmed = confirmed.clone();
    let pending_tx = pending_tx.clone();
    let _: JoinHandle<anyhow::Result<()>> = tokio::task::spawn(async move {
        required_sleep().await;
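`required_sleep` isn't defined in this excerpt; presumably it just spaces requests out so the crawl stays polite. A minimal sketch under that assumption (the two-second interval is made up here):

// Hypothetical politeness delay between requests; the real interval (and any
// jitter or rate-limiting logic) isn't shown in the post.
async fn required_sleep() {
    tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}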
confirmed.insert(next_url.clone(), PageCandidate { final_url, data });
println!(
    "finished {}/{}: {}",
    confirmed.len(),
    confirmed.len() + pending.len(),
    next_url
);
    let mut pending = HashSet::new();
    if let Ok(utf8) = std::str::from_utf8(&data) {
        if let Err(e) = find_new_links(utf8, &confirmed.read(), &mut pending) {
            warn!("error finding links on {}: {}", final_url, e);
        }
    }
    pending
        .drain()
        .for_each(|pnd| pending_tx.send(pnd.clone()).expect("couldn't send ):"));
    confirmed.write().insert(next_url.clone(), PageCandidate { final_url, data });
    println!(
        "finished {}/{}: {}",
        confirmed.read().len(),
        confirmed.read().len() + pending.len(),
        next_url
    );
    Ok(())
});
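The `pending_tx`/`pending_rx` pair isn't set up in this excerpt either. A plain (non-awaited) `send` feeding an awaited `recv` matches tokio's unbounded mpsc channel, so the setup is presumably something like:

use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
use url::Url;

// Assumed channel setup: unbounded, so `send` never blocks and never needs `.await`.
fn make_pending_channel() -> (UnboundedSender<Url>, UnboundedReceiver<Url>) {
    mpsc::unbounded_channel()
}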
fn phase<T, U>(fname: &str, func: impl FnOnce(&mut T) -> anyhow::Result<U>) -> anyhow::Result<U>
where
    T: for<'de> Deserialize<'de> + Serialize + Default,
{
    let f = std::fs::File::open(fname).map_err(|e| anyhow!(e));
    let mut rehydrated = f
        .and_then(|f| {
            bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
        })
        .unwrap_or(T::default());
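The rest of `phase` is cut off here; presumably it runs `func` over the rehydrated value and then serializes the result back to `fname` with bincode before returning. Under that assumption, a call site could look like the sketch below (`run_phase1` and the closure body are illustrative, not from the post):

use std::collections::HashMap;
use url::Url;

fn run_phase1() -> anyhow::Result<()> {
    // The closure gets the deserialized (or default) state and can mutate it;
    // whatever it returns is passed back out of `phase`.
    let page_count = phase("phase1.blob", |pages: &mut HashMap<Url, PageCandidate>| {
        // ... crawl and insert pages into the cache here ...
        Ok(pages.len())
    })?;
    println!("phase1 cache holds {} pages", page_count);
    Ok(())
}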
if let Some(phase2) = args.subcommand_matches("find_media_outlets") {
    let all_pages: HashMap<Url, PageCandidate> = {
        let f = std::fs::File::open("phase1.blob").map_err(|e| anyhow!(e));
        f.and_then(|f| {
            bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
        })?
    };
    let sel = Selector::parse("header.entry-header > h2 > img + img")
        .expect("fix the phase2 selector");
    let mut outlets: HashMap<Url, MediaOutlet> = {
        let f = std::fs::File::open("phase2.blob").map_err(|e| anyhow!(e));
        f.and_then(|f| {
            bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
        })?
    };
    for (orig_url, candidate) in all_pages.into_iter() {
        if outlets.contains_key(&orig_url) {
            continue; // a filter would borrow outlets during the iteration
        }
        if let Some(outlet) = consider_page(&sel, &candidate)? {
            println!("found outlet!");
            outlets.insert(orig_url, outlet);
        }
    }
}
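`consider_page` and the fields of `MediaOutlet` aren't shown. Since the selector looks for a second image inside the entry header, a match is presumably what marks a page as describing a media outlet. A hypothetical shape, using the `scraper` crate and an invented `page` field just so the example compiles:

use scraper::{Html, Selector};

// Invented stand-in; the real MediaOutlet fields are not in the post.
struct MediaOutlet {
    page: url::Url,
}

fn consider_page(sel: &Selector, candidate: &PageCandidate) -> anyhow::Result<Option<MediaOutlet>> {
    let html = std::str::from_utf8(&candidate.data)?;
    let doc = Html::parse_document(html);
    if doc.select(sel).next().is_some() {
        // A matching header is treated as evidence that this page describes an outlet.
        Ok(Some(MediaOutlet {
            page: candidate.final_url.clone(),
        }))
    } else {
        Ok(None)
    }
}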