B3H475WM3JE532SL7IGJIQBRXWHNDTHP2LH5IL67N46Z6QM75SFAC
// map from original scraped url to retrieved page.
async fn all_urls_on_site_everywhere(cl: &Client) -> anyhow::Result<HashMap<Url, PageCandidate>> {
/// Crawls the site and records each retrieved page in `confirmed`.
///
/// * `stop` - broadcast receiver for the stop notification.
///   NOTE(review): how the crawl reacts to it is not visible in this excerpt.
/// * `cl` - HTTP client used to issue the page requests.
/// * `confirmed` - map from `Url` to `PageCandidate` that this function adds to;
///   presumably keyed by the originally scraped URL — TODO confirm.
///
/// Returns `Ok(())` on success. In the visible part of the body, request and
/// decoding failures are propagated to the caller with `?`.
async fn all_urls_on_site_everywhere(
mut stop: broadcast::Receiver<()>,
cl: &Client,
confirmed: &mut HashMap<Url, PageCandidate>,
) -> anyhow::Result<()> {
// NOTE(review): two overlapping versions of the fetch step follow; the second
// looks like a revision of the first — confirm which one is live.
//
// First version: `required_sleep()` is called without `.await`, the response is
// pushed onto `confirmed`, and `data` is parsed without a visible local binding.
required_sleep();
let pg = cl.get(next_url).send().await?;
confirmed.push(PageCandidate {
final_url: pg.url.clone(),
data: pg.bytes().await?,
});
let doc = Html::parse_document(data);
// Second version: await the sleep, request a clone of `next_url`, and keep the
// response URL and body in locals before parsing.
required_sleep().await;
let pg = cl.get(next_url.clone()).send().await?;
// The URL is cloned before `bytes()` is called on the response.
let final_url = pg.url().clone();
let data = pg.bytes().await?;
// The body must be valid UTF-8; a decoding error is propagated with `?`.
let doc = Html::parse_document(std::str::from_utf8(&data)?);
// Entry-point body: parse the command line, wire Ctrl-C to a stop broadcast,
// and run the first scrape phase with its state persisted in `phase1.blob`.
println!("Hello, world!");
// Declare the CLI: one subcommand, `first_scrape`, taking a `-v/--verbose` flag.
let app = clap_app!(myapp =>
(version: "0.1")
(author: "ember")
(about: "create a plot of media bias according to mediabiasfactcheck.com")
(@subcommand first_scrape =>
(about: "crawl all the website's urls ")
(@arg verbose: -v --verbose "Verbose logging")
)
);
let args = app.get_matches();
// Capacity-1 broadcast channel that carries the stop notification.
let (stop_tx, stop) = broadcast::channel(1);
// Background task: wait for Ctrl-C, then broadcast the stop signal.
// Both failure paths panic inside the spawned task via `expect`.
let interrupter = tokio::task::spawn(async move {
tokio::signal::ctrl_c()
.await
.expect("couldn't wait for signal?");
stop_tx.send(()).expect("failed to broadcast stop signal");
});
let mut cl = reqwest::Client::new();
tracing_subscriber::fmt().init();
// NOTE(review): the subcommand is declared as `first_scrape` above but looked
// up as "first-scrape" here — confirm the two names actually match, otherwise
// this branch never runs.
if let Some(phase1) = args.subcommand_matches("first-scrape") {
if phase1.is_present("verbose") {
// TODO: figure out how to decrease log level unless this is passed?
}
// Load the previous crawl state from the blob.
// NOTE(review): `File::open(..).unwrap()` panics when `phase1.blob` does not
// exist; the empty-map fallback below only covers a failed deserialization.
let mut confirmed = bincode::deserialize_from(std::io::BufReader::new(
std::fs::File::open("phase1.blob").unwrap(),
))
.unwrap_or(HashMap::new());
// `?` returns early on a crawl error, so nothing is written back in that case.
all_urls_on_site_everywhere(stop, &mut cl, &mut confirmed).await?;
// Write the map back to the same blob; a failed `File::create` panics.
bincode::serialize_into(
std::io::BufWriter::new(std::fs::File::create("phase1.blob").expect("sad")),
&confirmed,
)?;
}
// The Ctrl-C task handle is forgotten rather than awaited.
std::mem::forget(interrupter);
Ok(())
name = "clap"
version = "3.0.0-beta.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bd1061998a501ee7d4b6d449020df3266ca3124b941ec56cf2005c3779ca142"
dependencies = [
"atty",
"bitflags",
"clap_derive",
"indexmap",
"lazy_static",
"os_str_bytes",
"strsim",
"termcolor",
"textwrap",
"unicode-width",
"vec_map",
]
[[package]]
name = "clap_derive"
version = "3.0.0-beta.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "370f715b81112975b1b69db93e0b56ea4cd4e5002ac43b2da8474106a54096a1"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]]
name = "regex"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
dependencies = [
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
[[package]]
name = "termcolor"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4"
dependencies = [
"winapi-util",
]
[[package]]
name = "textwrap"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "203008d98caf094106cfaba70acfed15e18ed3ddb7d94e49baec153a2b462789"
dependencies = [
"unicode-width",
]
[[package]]
name = "tracing-log"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e0f8c7178e13481ff6765bd169b33e8d554c5d2bbede5e32c356194be02b9b9"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1fa8f0c8f4c594e4fc9debc1990deab13238077271ba84dd853d54902ee3401"
dependencies = [
"ansi_term",
"chrono",
"lazy_static",
"matchers",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "version_check"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
[[package]]