FAXOU7MRT62Y2SBC5PWCPLXF6KIRZOEHAIGBQU6NNL36D6MOOKEAC
let doc = Html::parse_document(data);
for a_elem in doc.select(&afinder) {
if let Some(link) = a_elem.value().attr("href") {
match Url::parse(link) {
Ok(potentially_new_url) => {
if !old_links.contains_key(&potentially_new_url)
&& !pending.contains(&potentially_new_url)
&& potentially_new_url.domain() == Some("mediabiasfactcheck.com")
{
println!("chosing to visit {}", potentially_new_url);
pending.insert(potentially_new_url);
} else {
tracing::trace!("skipping {}", potentially_new_url);
}
}
_ => {}
}
}
}
Ok(())
}
// Seed the set of URLs still to be visited with the site root only.
// A parse failure of the root URL is propagated to the caller via `?`.
let root: Url = "https://mediabiasfactcheck.com/".try_into()?;
let mut pending: HashSet<Url> = iter::once(root).collect();
// Build the initial set of URLs to visit.
//
// * With no cached pages in `confirmed`, start from the site root.
// * Otherwise, re-scan every cached page body with `find_new_links` and
//   collect the links it reports into a fresh set.
let mut pending: HashSet<Url> = if confirmed.is_empty() {
    info!("starting from root, with no cache");
    iter::once("https://mediabiasfactcheck.com/".try_into()?).collect()
} else {
    let mut new = HashSet::new();
    for v in confirmed.values() {
        // Cached bodies are expected to be UTF-8; anything else is treated
        // as a broken invariant, exactly as before.
        let text = std::str::from_utf8(&v.data).expect("nonutf8");
        // Stay best-effort (one bad page must not abort the scan), but
        // report the failure instead of silently discarding the `Result`.
        if let Err(e) = find_new_links(text, &confirmed, &mut new) {
            tracing::warn!("could not extract links from a cached page: {:?}", e);
        }
    }
    info!("scraping through {:?}", new);
    new
};
// Parse the page body (must be valid UTF-8, otherwise the error is
// propagated) and walk every element matched by `afinder`.
let doc = Html::parse_document(std::str::from_utf8(&data)?);
for anchor in doc.select(&afinder) {
    // Elements without an `href`, and hrefs that do not parse as a URL,
    // are ignored without any logging.
    let parsed = anchor.value().attr("href").map(Url::parse);
    if let Some(Ok(candidate)) = parsed {
        // Queue only URLs that are neither confirmed nor already pending
        // and whose domain is exactly the target site.
        let unseen = !confirmed.contains_key(&candidate) && !pending.contains(&candidate);
        if unseen && candidate.domain() == Some("mediabiasfactcheck.com") {
            tracing::trace!("chosing to visit {}", candidate);
            pending.insert(candidate);
        } else {
            tracing::trace!("skipping {}", candidate);
        }
    }
}
// Deserialize `confirmed` with bincode from the file "phase1.blob", read
// through a buffered reader. Opening the file uses `unwrap()`, so a missing
// or unreadable file panics here.
// NOTE(review): the expression continues past this fragment; how the
// deserialization result itself is handled is not visible here.
let mut confirmed = bincode::deserialize_from(std::io::BufReader::new(
std::fs::File::open("phase1.blob").unwrap(),
))
// Load `confirmed` from "phase1.blob" without panicking: both the file-open
// error and the bincode decode error are converted into `anyhow` errors, so
// this block evaluates to a `Result`.
// NOTE(review): the statement continues past this fragment; how the `Result`
// is consumed (fallback value, `?`, ...) is not visible here.
let mut confirmed = {
// Open the cache file; the I/O error is wrapped with `anyhow!`.
let f = std::fs::File::open("phase1.blob").map_err(|e| anyhow!(e));
// Decode only if the open succeeded, reading through a buffered reader.
f.and_then(|f| {
bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
})
}