diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs
index 57d72111..f385cbd6 100644
--- a/components/site/src/link_checking.rs
+++ b/components/site/src/link_checking.rs
@@ -96,6 +96,10 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
     }
 }
 
+fn should_skip_by_prefix(link: &String, skip_prefixes: &Vec<String>) -> bool {
+    skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
+}
+
 fn get_link_domain(link: &str) -> Result<String> {
     return match Url::parse(link) {
         Ok(url) => match url.host_str().map(String::from) {
@@ -109,36 +113,58 @@ fn get_link_domain(link: &str) -> Result<String> {
 
 pub fn check_external_links(site: &Site) -> Result<()> {
     let library = site.library.write().expect("Get lock for check_external_links");
-    let mut all_links: Vec<(PathBuf, String, String)> = vec![];
+    struct LinkDef {
+        file_path: PathBuf,
+        external_link: String,
+        domain: String,
+    }
+
+    impl LinkDef {
+        pub fn new(file_path: PathBuf, external_link: String, domain: String) -> Self {
+            Self { file_path, external_link, domain }
+        }
+    }
+
+    let mut checked_links: Vec<LinkDef> = vec![];
+    let mut skipped_link_count: u32 = 0;
 
     for p in library.pages_values().into_iter() {
         for external_link in p.clone().external_links.into_iter() {
-            let domain = get_link_domain(&external_link)?;
-            all_links.push((p.file.path.clone(), external_link, domain));
+            if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
+                skipped_link_count += 1;
+            } else {
+                let domain = get_link_domain(&external_link)?;
+                checked_links.push(LinkDef::new(p.file.path.clone(), external_link, domain));
+            }
         }
     }
 
     for s in library.sections_values().into_iter() {
         for external_link in s.clone().external_links.into_iter() {
-            let domain = get_link_domain(&external_link)?;
-            all_links.push((s.file.path.clone(), external_link, domain));
+            if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
+                skipped_link_count += 1;
+            } else {
+                let domain = get_link_domain(&external_link)?;
+                checked_links.push(LinkDef::new(s.file.path.clone(), external_link, domain));
+            }
         }
     }
 
-    println!("Checking {} external link(s).", all_links.len());
+    println!(
+        "Checking {} external link(s). Skipping {} external link(s).",
+        checked_links.len(),
+        skipped_link_count
+    );
 
-    let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
+    let mut links_by_domain: HashMap<String, Vec<&LinkDef>> = HashMap::new();
 
-    for link in all_links.iter() {
-        links_by_domain.entry(link.2.to_string()).or_default();
+    for link in checked_links.iter() {
+        links_by_domain.entry(link.domain.to_string()).or_default();
         // Insert content path and link under the domain key
-        links_by_domain
-            .get_mut(&link.2.to_string())
-            .unwrap()
-            .push((link.0.clone(), link.1.clone()));
+        links_by_domain.get_mut(&link.domain).unwrap().push(&link);
     }
 
-    if all_links.is_empty() {
+    if checked_links.is_empty() {
         return Ok(());
     }
 
@@ -155,20 +181,13 @@ pub fn check_external_links(site: &Site) -> Result<()> {
             let mut links_to_process = links.len();
             links
                 .iter()
-                .filter_map(move |(page_path, link)| {
+                .filter_map(move |link_def| {
                     links_to_process -= 1;
 
-                    if site
-                        .config
-                        .link_checker
-                        .skip_prefixes
-                        .iter()
-                        .any(|prefix| link.starts_with(prefix))
-                    {
-                        return None;
-                    }
-
-                    let res = link_checker::check_url(link, &site.config.link_checker);
+                    let res = link_checker::check_url(
+                        &link_def.external_link,
+                        &site.config.link_checker,
+                    );
 
                     if links_to_process > 0 {
                         // Prevent rate-limiting, wait before next crawl unless we're done with this domain
@@ -178,7 +197,7 @@
                     if link_checker::is_valid(&res) {
                         None
                     } else {
-                        Some((page_path, link, res))
+                        Some((&link_def.file_path, &link_def.external_link, res))
                     }
                 })
                 .collect::<Vec<_>>()
@@ -187,7 +206,11 @@
         .collect::<Vec<_>>()
     });
 
-    println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
+    println!(
+        "> Checked {} external link(s): {} error(s) found.",
+        checked_links.len(),
+        errors.len()
+    );
 
     if errors.is_empty() {
         return Ok(());
diff --git a/components/site/tests/site.rs b/components/site/tests/site.rs
index 2511af0c..69b838f4 100644
--- a/components/site/tests/site.rs
+++ b/components/site/tests/site.rs
@@ -19,7 +19,7 @@ fn can_parse_site() {
     let library = site.library.read().unwrap();
 
     // Correct number of pages (sections do not count as pages, draft are ignored)
-    assert_eq!(library.pages().len(), 32);
+    assert_eq!(library.pages().len(), 33);
     let posts_path = path.join("content").join("posts");
 
     // Make sure the page with a url doesn't have any sections
@@ -596,7 +596,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
         "tags/a/page/1/index.html",
         "http-equiv=\"refresh\" content=\"0; url=https://replace-this-with-your-url.com/tags/a/\""
     ));
-    assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 8"));
+    assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 9"));
     assert!(file_contains!(public, "tags/a/index.html", "Page size: 2"));
     assert!(file_contains!(public, "tags/a/index.html", "Current index: 1"));
     assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
@@ -609,7 +609,7 @@
     assert!(file_contains!(
         public,
         "tags/a/index.html",
-        "Last: https://replace-this-with-your-url.com/tags/a/page/8/"
+        "Last: https://replace-this-with-your-url.com/tags/a/page/9/"
    ));
     assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
 
@@ -774,8 +774,35 @@ fn check_site() {
         site.config.link_checker.skip_anchor_prefixes,
         vec!["https://github.com/rust-lang/rust/blob/"]
     );
+    assert_eq!(
+        site.config.link_checker.skip_prefixes,
+        vec!["http://[2001:db8::]/", "http://invaliddomain"]
+    );
+
+    site.config.enable_check_mode();
+    site.load().expect("link check test_site");
+}
+
+#[test]
+#[should_panic]
+fn panics_on_invalid_external_domain() {
+    let (mut site, _tmp_dir, _public) = build_site("test_site");
+
+    // remove the invalid domain skip prefix
+    let i = site
+        .config
+        .link_checker
+        .skip_prefixes
+        .iter()
+        .position(|prefix| prefix == "http://invaliddomain")
+        .unwrap();
+    site.config.link_checker.skip_prefixes.remove(i);
+
+    // confirm the invalid domain skip prefix was removed
     assert_eq!(site.config.link_checker.skip_prefixes, vec!["http://[2001:db8::]/"]);
 
+    // check the test site, this time without the invalid domain skip prefix, which should cause a
+    // panic
     site.config.enable_check_mode();
     site.load().expect("link check test_site");
 }
diff --git a/test_site/config.toml b/test_site/config.toml
index 58f650ed..21b9e749 100644
--- a/test_site/config.toml
+++ b/test_site/config.toml
@@ -24,6 +24,7 @@ anchors = "on"
 [link_checker]
 skip_prefixes = [
     "http://[2001:db8::]/",
+    "http://invaliddomain",
 ]
 
 skip_anchor_prefixes = [
diff --git a/test_site/content/posts/skip_prefixes.md b/test_site/content/posts/skip_prefixes.md
new file mode 100644
index 00000000..49f1ac30
--- /dev/null
+++ b/test_site/content/posts/skip_prefixes.md
@@ -0,0 +1,4 @@
++++
++++
+
+[test skip 1](http://invaliddomain/)