IJS5OIDK4YRQVHTN2MZWP3SFLS7CJMIMCDMWZBLDV5PDNMEU7MYAC
B7GL4BVL42PAIVIBIQF5TFEMWFMO3LDVS2LIPFTLFSYG6JVJEDNAC
6RFPVLVLGAC62JWVZQF3CNTP74YRNDGDWXFCHKYFIYUNCCF7CGFQC
HHALDA72CDQWZLBHY57FDDKJFN76KJJJWF7C7IU2DNR4AHJKNVEQC
FAXOU7MRT62Y2SBC5PWCPLXF6KIRZOEHAIGBQU6NNL36D6MOOKEAC
B3H475WM3JE532SL7IGJIQBRXWHNDTHP2LH5IL67N46Z6QM75SFAC
YALSRCUNFJB6222FZKFTHLOMQX3W4E5YS26Y652T4DPVOWKVKT4QC
// NOTE(review): this block appears to be a STALE, pre-refactor duplicate of the
// `pending` initialisation that immediately follows it: it reads `confirmed`
// without the read-guard taken below, panics on non-UTF-8 cache data via
// `expect`, and ignores the Result of `find_new_links`. It is also syntactically
// unterminated — the `else` arm's value `new` is never followed by a closing
// `};` before the next `let mut pending` begins. Looks like a merge-conflict
// leftover; confirm and delete.
let mut pending: HashSet<Url> = if confirmed.is_empty() {
// No cache: seed the crawl frontier with the site root.
info!("starting from root, with no cache");
iter::once("https://mediabiasfactcheck.com/".try_into()?).collect()
} else {
// Cache present: rebuild the frontier by re-scanning every cached page
// for links not yet in `confirmed`.
let mut new = HashSet::new();
confirmed
.values()
.map(|v| {
find_new_links(
// assumes cached page bodies are valid UTF-8 — panics otherwise
std::str::from_utf8(&v.data).expect("nonutf8"),
&confirmed,
&mut new,
)
})
// `map` is lazy; the empty `for_each` forces the side-effecting closure to run.
.for_each(|_| ());
info!("scraping through {:?}", new);
new
// Build the initial crawl frontier. With an empty cache we seed with the site
// root; otherwise we re-scan every cached page for links we have not yet
// confirmed.
// NOTE(review): the outer `{` opened on this line is never closed in the
// visible code — the final line below only closes the `else` arm, so the
// terminating `};` appears to be missing (merge artefact? confirm).
let mut pending: HashSet<Url> = {
// Take the read guard once for the whole scan (presumably an RwLock-style
// shared map — confirm against the declaration of `confirmed`).
let confirmed = confirmed.read();
if confirmed.is_empty() {
info!("starting from root, with no cache");
iter::once("https://mediabiasfactcheck.com/".try_into()?).collect()
} else {
let mut new = HashSet::new();
confirmed
.values()
.map(|v| {
// Pages whose cached bytes are not valid UTF-8 are silently skipped.
if let Ok(utf8) = std::str::from_utf8(&v.data) {
// Link-extraction failures are logged, not fatal.
if let Err(e) = find_new_links(utf8, &confirmed, &mut new) {
warn!("error finding links on {}: {}", v.final_url, e);
}
}
})
// `map` is lazy; the empty `for_each` drives the side effects.
.for_each(|_| ());
info!("scraping through {:?}", new);
new
}
// NOTE(review): stale sequential-crawl loop head, superseded by the
// channel-based `pending_rx.recv()` loop that follows — confirm and delete.
// (`iter().cloned().next()` pops an arbitrary element from the set.)
while let Some(next_url) = pending.iter().cloned().next() {
pending.remove(&next_url);
// Concurrent crawl driver: receive URLs from the work channel and spawn one
// rate-limited task per URL.
// NOTE(review): this loop is incomplete in the visible code — the fetch that
// binds `final_url` and `data` is not shown, and the loop's closing brace is
// missing. The insert/println pair also appears TWICE; the first pair (just
// after `required_sleep`) looks like a stale duplicate of the locked versions
// further down — confirm and delete one.
while let Some(next_url) = pending_rx.recv().await {
// Clone the shared handles so the spawned task owns its own copies.
let cl = cl.clone();
let confirmed = confirmed.clone();
let pending_tx = pending_tx.clone();
// Task errors are intentionally detached (handle discarded).
let _: JoinHandle<anyhow::Result<()>> = tokio::task::spawn(async move {
// Politeness delay before hitting the site.
required_sleep().await;
// NOTE(review): stale duplicate? Inserts without the `.write()` guard used
// below, and `final_url` / `data` are not bound at this point in the
// visible code.
confirmed.insert(next_url.clone(), PageCandidate { final_url, data });
println!("finished {}/{}: {}", confirmed.len(), confirmed.len()+pending.len(), next_url);
// Collect links newly discovered on the fetched page; non-UTF-8 bodies
// are skipped, extraction errors are logged and ignored.
let mut pending = HashSet::new();
if let Ok(utf8) = std::str::from_utf8(&data) {
if let Err(e) = find_new_links(utf8, &confirmed.read(), &mut pending) {
warn!("error finding links on {}: {}", final_url, e);
}
}
// Feed every new link back into the work queue.
// NOTE(review): `pnd` is already owned after `drain()`, so the `.clone()`
// looks redundant — confirm against `send`'s signature.
pending
.drain()
.for_each(|pnd| pending_tx.send(pnd.clone()).expect("couldn't send ):"));
// Record the fetched page under its original (pre-redirect?) URL.
confirmed
.write()
.insert(next_url.clone(), PageCandidate { final_url, data });
println!(
"finished {}/{}: {}",
confirmed.read().len(),
confirmed.read().len() + pending.len(),
next_url
);
Ok(())
});
// Generic phase runner: rehydrate a `T` from the cache file `fname`, falling
// back to `T::default()` when the file is missing or fails to decode, then
// (presumably) hand it to `func` and persist the result.
// NOTE(review): incomplete in the visible code — `func` is never invoked and
// the function body is not closed before the next top-level statement, so the
// `anyhow::Result<U>` return is never produced here.
// NOTE(review): `unwrap_or(T::default())` evaluates the default eagerly even
// on the Ok path; prefer `unwrap_or_default()` / `unwrap_or_else`.
fn phase<T, U>(fname: &str, func: impl FnOnce(&mut T) -> anyhow::Result<U>) -> anyhow::Result<U>
where
T: for<'de> Deserialize<'de> + Serialize + Default,
{
// Open may fail (e.g. first run, no cache file yet) — folded into the chain.
let f = std::fs::File::open(fname).map_err(|e| anyhow!(e));
let mut rehydrated = f
.and_then(|f| {
// Buffered read; a bincode decode error is treated the same as a
// missing file (fall back to default).
bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
})
.unwrap_or(T::default());
// Phase 2: scan every page fetched during phase 1 and record the ones that
// are media-outlet pages, resuming from previously saved phase-2 results.
// (`_phase2` matches were unused in this block, hence the underscore.)
if let Some(_phase2) = args.subcommand_matches("find_media_outlets") {
    // All pages fetched during phase 1, keyed by their original URL.
    let all_pages: HashMap<Url, PageCandidate> = {
        let f = std::fs::File::open("phase1.blob").map_err(|e| anyhow!(e));
        f.and_then(|f| {
            bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
        })?
    };
    // CSS selector locating the pair of rating images on an outlet page.
    let sel = Selector::parse("header.entry-header > h2 > img + img")
        .expect("fix the phase2 selector");
    // Previously extracted outlets (resume cache).
    // NOTE(review): a missing phase2.blob aborts with `?` here; on a true first
    // run this probably should fall back to an empty map instead — confirm.
    let mut outlets: HashMap<Url, MediaOutlet> = {
        let f = std::fs::File::open("phase2.blob").map_err(|e| anyhow!(e));
        f.and_then(|f| {
            bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
        })?
    };
    for (orig_url, candidate) in all_pages.into_iter() {
        // BUG FIX: skip pages that are ALREADY classified. The original test was
        // inverted (`!contains_key`), which skipped every *new* page and only
        // reprocessed cached entries — no new outlet could ever be added.
        // An iterator `filter` would borrow `outlets` during the iteration,
        // hence the explicit `continue`.
        if outlets.contains_key(&orig_url) {
            continue;
        }
        // `consider_page` decides whether this candidate is an outlet page;
        // its errors propagate and abort the phase.
        if let Some(outlet) = consider_page(&sel, &candidate)? {
            println!("found outlet!");
            outlets.insert(orig_url, outlet);
        }
    }
    // NOTE(review): `outlets` is not persisted back to phase2.blob within the
    // visible code — confirm it is saved after this block.
}