ember/mediabias-plotter - Change FAXOU7MRT62Y2SBC5PWCPLXF6KIRZOEHAIGBQU6NNL36D6MOOKEAC

actual downloading now

Created by ember on January 10, 2021

FAXOU7MRT62Y2SBC5PWCPLXF6KIRZOEHAIGBQU6NNL36D6MOOKEAC

Dependencies

In channels

main

Change contents

Replacement in src/main.rs at line 8 [3.10]
B:BD[2.18] → [2.18:40]
```
use clap::{clap_app};
```
[2.18]
[4.94]
```
use clap::clap_app;
```

Replacement in src/main.rs at line 13 [3.10]

B:BD[4.188] → [2.41:71]

use tokio::{sync::broadcast};

[4.188]

[3.48]

use tokio::sync::broadcast;
use tracing::{error, info, metadata::LevelFilter, Level};
use tracing_subscriber::EnvFilter;

Insertion in src/main.rs at line 53 [3.10]

[4.309]


/// Collects not-yet-known links from one HTML page into `pending`.
///
/// `data` is parsed as an HTML document and every `a` element carrying an
/// `href` attribute is inspected. A link is inserted into `pending` only when
/// it parses as a `Url`, is neither a key of `old_links` nor already in
/// `pending`, and its domain is exactly `mediabiasfactcheck.com`.
///
/// # Errors
///
/// Returns an error only if the `a` selector itself fails to parse; hrefs that
/// are not valid URLs are ignored rather than reported.
fn find_new_links(
    data: &str,
    old_links: &HashMap<Url, PageCandidate>,
    pending: &mut HashSet<Url>,
) -> anyhow::Result<()> {
    // The selector error is formatted with `{:?}` into an `anyhow` error.
    let afinder = Selector::parse("a").map_err(|e| anyhow!("this is no good :( {:?}", e))?;

Insertion in src/main.rs at line 61 [3.10]

[4.310]

[2.217]

    let doc = Html::parse_document(data);
    for a_elem in doc.select(&afinder) {
        if let Some(link) = a_elem.value().attr("href") {
            match Url::parse(link) {
                Ok(potentially_new_url) => {
                    // Three conditions must all hold: unseen in the cache, not
                    // already queued, and on the target domain.
                    if !old_links.contains_key(&potentially_new_url)
                        && !pending.contains(&potentially_new_url)
                        && potentially_new_url.domain() == Some("mediabiasfactcheck.com")
                    {
                        // Accepted links are announced on stdout, while skipped
                        // ones only go to the trace log.
                        println!("chosing to visit {}", potentially_new_url);
                        pending.insert(potentially_new_url);
                    } else {
                        tracing::trace!("skipping {}", potentially_new_url);
                    }
                }
                // NOTE(review): `Url::parse` is given no base URL, so relative
                // hrefs presumably end up here and are dropped -- confirm.
                _ => {}
            }
        }
    }
    Ok(())
}

Replacement in src/main.rs at line 89 [3.10]

∅:D[2.416] → [4.461:574]

B:BD[4.461] → [4.461:574]

    let mut pending: HashSet<Url> =
        iter::once("https://mediabiasfactcheck.com/".try_into()?).collect();

[2.416]

[4.643]

    // Seed the crawl queue. With an empty cache the crawl starts from the site
    // root; otherwise every cached page is re-scanned for links that are not in
    // the cache yet.
    let mut pending: HashSet<Url> = if confirmed.is_empty() {
        info!("starting from root, with no cache");
        iter::once("https://mediabiasfactcheck.com/".try_into()?).collect()
    } else {
        let mut new = HashSet::new();
        for page in confirmed.values() {
            // A page that is not valid UTF-8, or whose link extraction fails, is
            // logged and skipped so one bad cache entry neither panics nor
            // disappears silently.
            let outcome = std::str::from_utf8(&page.data)
                .map_err(|e| anyhow!(e))
                .and_then(|text| find_new_links(text, &confirmed, &mut new));
            if let Err(e) = outcome {
                tracing::warn!("could not extract links from a cached page: {}", e);
            }
        }
        info!("scraping through {:?}", new);
        new
    };

Replacement in src/main.rs at line 108 [3.10]

B:BD[4.644] → [4.644:826]

    let required_sleep = || tokio::time::sleep(std::time::Duration::from_secs_f32(0.25));
    let afinder = Selector::parse("a").map_err(|e| anyhow!("this is no good :( {:?}", e))?;

[4.644]

[4.826]

    let required_sleep = || tokio::time::sleep(std::time::Duration::from_secs_f32(0.05));

Insertion in src/main.rs at line 114 [3.10]
[2.549]
[2.549]

Replacement in src/main.rs at line 119 [3.10]

B:BD[2.687] → [2.687:756]

∅:D[2.756] → [4.1165:1362]

B:BD[4.1165] → [4.1165:1362]

B:BD[4.1362] → [2.757:995]

∅:D[2.995] → [4.1504:1530]

B:BD[4.1504] → [4.1504:1530]

B:BD[4.1530] → [2.996:1264]

∅:D[2.1264] → [4.1593:1711]

B:BD[4.1593] → [4.1593:1711]

        let doc = Html::parse_document(std::str::from_utf8(&data)?);
        for a_elem in doc.select(&afinder) {
            if let Some(link) = a_elem.value().attr("href") {
                match Url::parse(link) {
                    Ok(potentially_new_url) => {
                        if !confirmed.contains_key(&potentially_new_url)
                            && !pending.contains(&potentially_new_url)
                            && potentially_new_url.domain() == Some("mediabiasfactcheck.com")
                        {
                            tracing::trace!("chosing to visit {}", potentially_new_url);
                            pending.insert(potentially_new_url);
                        } else {
                            tracing::trace!("skipping {}", potentially_new_url);
                        }
                    }
                    _ => {}
                }
            }
        }

[2.687]

[2.1265]

Replacement in src/main.rs at line 121 [3.10]

B:BD[2.1344] → [2.1344:1393]

        tracing::info!("finished {}", next_url);

[2.1344]

[2.1393]

        println!("finished {}", next_url);

Insertion in src/main.rs at line 132 [3.10]

[4.1794]

[2.1472]

    // Install the global `fmt` subscriber, filtered by an `EnvFilter` built from
    // the default environment variable plus two extra directives: a WARN-level
    // default and trace-level output for this program's own target.
    // No separate `with_max_level` call is made: `with_env_filter` replaces the
    // builder's filter, so a max level set before it has no effect.
    // NOTE(review): targets default to the module path, where Cargo spells `-` as
    // `_`; confirm that "mediabias-parsing" really matches this crate's target.
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::from_default_env()
                .add_directive(LevelFilter::from_level(Level::WARN).into())
                .add_directive("mediabias-parsing=trace".parse()?),
        )
        .init();

Deletion in src/main.rs at line 163 [3.10]
B:BD[2.2164] → [2.2164:2203]
```
    tracing_subscriber::fmt().init();
```

Replacement in src/main.rs at line 164 [3.10]

B:BD[2.2204] → [2.2204:2272]

    if let Some(phase1) = args.subcommand_matches("first-scrape") {

[2.2204]

[2.2272]

    if let Some(phase1) = args.subcommand_matches("first_scrape") {

Replacement in src/main.rs at line 169 [3.10]

B:BD[2.2443] → [2.2443:2590]

        let mut confirmed = bincode::deserialize_from(std::io::BufReader::new(
            std::fs::File::open("phase1.blob").unwrap(),
        ))

[2.2443]

[2.2590]

        let mut confirmed = {
            let f = std::fs::File::open("phase1.blob").map_err(|e| anyhow!(e));
            f.and_then(|f| {
                bincode::deserialize_from(std::io::BufReader::new(f)).map_err(|e| anyhow!(e))
            })
        }

Replacement in src/main.rs at line 176 [3.10]

B:BD[2.2626] → [2.2626:2701]

        all_urls_on_site_everywhere(stop, &mut cl, &mut confirmed).await?;

[2.2626]

[2.2701]

        // Run the crawl; a failure is logged instead of propagated so the code
        // that follows still runs.
        let outcome = all_urls_on_site_everywhere(stop, &mut cl, &mut confirmed).await;
        if let Err(failure) = outcome {
            error!("big sad: {}", failure);
        }