apply skip_prefixes before parsing external link domain (#1833)
* apply skip_prefixes before parsing external link domain * log number of links skipped by skip_prefixes
This commit is contained in:
parent
896ea596fd
commit
92e80b5451
@ -96,6 +96,10 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `link` begins with at least one entry of `skip_prefixes`.
///
/// The comparison is a plain, case-sensitive string prefix test; the link is
/// not parsed as a URL, so malformed links can still be matched and skipped.
///
/// * `link` - the raw external link text to test.
/// * `skip_prefixes` - the configured prefixes; an empty list never matches,
///   while an empty-string entry matches every link.
///
/// Borrowed slices (`&str`, `&[String]`) are accepted instead of `&String` /
/// `&Vec<String>`; existing callers passing the owned containers by reference
/// keep compiling through deref coercion.
fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool {
    skip_prefixes.iter().any(|prefix| link.starts_with(prefix.as_str()))
}
|
||||||
|
|
||||||
fn get_link_domain(link: &str) -> Result<String> {
|
fn get_link_domain(link: &str) -> Result<String> {
|
||||||
return match Url::parse(link) {
|
return match Url::parse(link) {
|
||||||
Ok(url) => match url.host_str().map(String::from) {
|
Ok(url) => match url.host_str().map(String::from) {
|
||||||
@ -109,36 +113,58 @@ fn get_link_domain(link: &str) -> Result<String> {
|
|||||||
pub fn check_external_links(site: &Site) -> Result<()> {
|
pub fn check_external_links(site: &Site) -> Result<()> {
|
||||||
let library = site.library.write().expect("Get lock for check_external_links");
|
let library = site.library.write().expect("Get lock for check_external_links");
|
||||||
|
|
||||||
let mut all_links: Vec<(PathBuf, String, String)> = vec![];
|
struct LinkDef {
|
||||||
|
file_path: PathBuf,
|
||||||
|
external_link: String,
|
||||||
|
domain: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LinkDef {
|
||||||
|
pub fn new(file_path: PathBuf, external_link: String, domain: String) -> Self {
|
||||||
|
Self { file_path, external_link, domain }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut checked_links: Vec<LinkDef> = vec![];
|
||||||
|
let mut skipped_link_count: u32 = 0;
|
||||||
|
|
||||||
for p in library.pages_values().into_iter() {
|
for p in library.pages_values().into_iter() {
|
||||||
for external_link in p.clone().external_links.into_iter() {
|
for external_link in p.clone().external_links.into_iter() {
|
||||||
let domain = get_link_domain(&external_link)?;
|
if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
|
||||||
all_links.push((p.file.path.clone(), external_link, domain));
|
skipped_link_count += 1;
|
||||||
|
} else {
|
||||||
|
let domain = get_link_domain(&external_link)?;
|
||||||
|
checked_links.push(LinkDef::new(p.file.path.clone(), external_link, domain));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for s in library.sections_values().into_iter() {
|
for s in library.sections_values().into_iter() {
|
||||||
for external_link in s.clone().external_links.into_iter() {
|
for external_link in s.clone().external_links.into_iter() {
|
||||||
let domain = get_link_domain(&external_link)?;
|
if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
|
||||||
all_links.push((s.file.path.clone(), external_link, domain));
|
skipped_link_count += 1;
|
||||||
|
} else {
|
||||||
|
let domain = get_link_domain(&external_link)?;
|
||||||
|
checked_links.push(LinkDef::new(s.file.path.clone(), external_link, domain));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("Checking {} external link(s).", all_links.len());
|
println!(
|
||||||
|
"Checking {} external link(s). Skipping {} external link(s).",
|
||||||
|
checked_links.len(),
|
||||||
|
skipped_link_count
|
||||||
|
);
|
||||||
|
|
||||||
let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
|
let mut links_by_domain: HashMap<String, Vec<&LinkDef>> = HashMap::new();
|
||||||
|
|
||||||
for link in all_links.iter() {
|
for link in checked_links.iter() {
|
||||||
links_by_domain.entry(link.2.to_string()).or_default();
|
links_by_domain.entry(link.domain.to_string()).or_default();
|
||||||
// Insert content path and link under the domain key
|
// Insert content path and link under the domain key
|
||||||
links_by_domain
|
links_by_domain.get_mut(&link.domain).unwrap().push(&link);
|
||||||
.get_mut(&link.2.to_string())
|
|
||||||
.unwrap()
|
|
||||||
.push((link.0.clone(), link.1.clone()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if all_links.is_empty() {
|
if checked_links.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,20 +181,13 @@ pub fn check_external_links(site: &Site) -> Result<()> {
|
|||||||
let mut links_to_process = links.len();
|
let mut links_to_process = links.len();
|
||||||
links
|
links
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(move |(page_path, link)| {
|
.filter_map(move |link_def| {
|
||||||
links_to_process -= 1;
|
links_to_process -= 1;
|
||||||
|
|
||||||
if site
|
let res = link_checker::check_url(
|
||||||
.config
|
&link_def.external_link,
|
||||||
.link_checker
|
&site.config.link_checker,
|
||||||
.skip_prefixes
|
);
|
||||||
.iter()
|
|
||||||
.any(|prefix| link.starts_with(prefix))
|
|
||||||
{
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let res = link_checker::check_url(link, &site.config.link_checker);
|
|
||||||
|
|
||||||
if links_to_process > 0 {
|
if links_to_process > 0 {
|
||||||
// Prevent rate-limiting, wait before next crawl unless we're done with this domain
|
// Prevent rate-limiting, wait before next crawl unless we're done with this domain
|
||||||
@ -178,7 +197,7 @@ pub fn check_external_links(site: &Site) -> Result<()> {
|
|||||||
if link_checker::is_valid(&res) {
|
if link_checker::is_valid(&res) {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some((page_path, link, res))
|
Some((&link_def.file_path, &link_def.external_link, res))
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
@ -187,7 +206,11 @@ pub fn check_external_links(site: &Site) -> Result<()> {
|
|||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
});
|
});
|
||||||
|
|
||||||
println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
|
println!(
|
||||||
|
"> Checked {} external link(s): {} error(s) found.",
|
||||||
|
checked_links.len(),
|
||||||
|
errors.len()
|
||||||
|
);
|
||||||
|
|
||||||
if errors.is_empty() {
|
if errors.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
|
@ -19,7 +19,7 @@ fn can_parse_site() {
|
|||||||
let library = site.library.read().unwrap();
|
let library = site.library.read().unwrap();
|
||||||
|
|
||||||
// Correct number of pages (sections do not count as pages, draft are ignored)
|
// Correct number of pages (sections do not count as pages, draft are ignored)
|
||||||
assert_eq!(library.pages().len(), 32);
|
assert_eq!(library.pages().len(), 33);
|
||||||
let posts_path = path.join("content").join("posts");
|
let posts_path = path.join("content").join("posts");
|
||||||
|
|
||||||
// Make sure the page with a url doesn't have any sections
|
// Make sure the page with a url doesn't have any sections
|
||||||
@ -596,7 +596,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
|
|||||||
"tags/a/page/1/index.html",
|
"tags/a/page/1/index.html",
|
||||||
"http-equiv=\"refresh\" content=\"0; url=https://replace-this-with-your-url.com/tags/a/\""
|
"http-equiv=\"refresh\" content=\"0; url=https://replace-this-with-your-url.com/tags/a/\""
|
||||||
));
|
));
|
||||||
assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 8"));
|
assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 9"));
|
||||||
assert!(file_contains!(public, "tags/a/index.html", "Page size: 2"));
|
assert!(file_contains!(public, "tags/a/index.html", "Page size: 2"));
|
||||||
assert!(file_contains!(public, "tags/a/index.html", "Current index: 1"));
|
assert!(file_contains!(public, "tags/a/index.html", "Current index: 1"));
|
||||||
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
|
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
|
||||||
@ -609,7 +609,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
|
|||||||
assert!(file_contains!(
|
assert!(file_contains!(
|
||||||
public,
|
public,
|
||||||
"tags/a/index.html",
|
"tags/a/index.html",
|
||||||
"Last: https://replace-this-with-your-url.com/tags/a/page/8/"
|
"Last: https://replace-this-with-your-url.com/tags/a/page/9/"
|
||||||
));
|
));
|
||||||
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
|
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
|
||||||
|
|
||||||
@ -774,8 +774,35 @@ fn check_site() {
|
|||||||
site.config.link_checker.skip_anchor_prefixes,
|
site.config.link_checker.skip_anchor_prefixes,
|
||||||
vec!["https://github.com/rust-lang/rust/blob/"]
|
vec!["https://github.com/rust-lang/rust/blob/"]
|
||||||
);
|
);
|
||||||
|
assert_eq!(
|
||||||
|
site.config.link_checker.skip_prefixes,
|
||||||
|
vec!["http://[2001:db8::]/", "http://invaliddomain"]
|
||||||
|
);
|
||||||
|
|
||||||
|
site.config.enable_check_mode();
|
||||||
|
site.load().expect("link check test_site");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn panics_on_invalid_external_domain() {
|
||||||
|
let (mut site, _tmp_dir, _public) = build_site("test_site");
|
||||||
|
|
||||||
|
// remove the invalid domain skip prefix
|
||||||
|
let i = site
|
||||||
|
.config
|
||||||
|
.link_checker
|
||||||
|
.skip_prefixes
|
||||||
|
.iter()
|
||||||
|
.position(|prefix| prefix == "http://invaliddomain")
|
||||||
|
.unwrap();
|
||||||
|
site.config.link_checker.skip_prefixes.remove(i);
|
||||||
|
|
||||||
|
// confirm the invalid domain skip prefix was removed
|
||||||
assert_eq!(site.config.link_checker.skip_prefixes, vec!["http://[2001:db8::]/"]);
|
assert_eq!(site.config.link_checker.skip_prefixes, vec!["http://[2001:db8::]/"]);
|
||||||
|
|
||||||
|
// check the test site, this time without the invalid domain skip prefix, which should cause a
|
||||||
|
// panic
|
||||||
site.config.enable_check_mode();
|
site.config.enable_check_mode();
|
||||||
site.load().expect("link check test_site");
|
site.load().expect("link check test_site");
|
||||||
}
|
}
|
||||||
|
@ -24,6 +24,7 @@ anchors = "on"
|
|||||||
[link_checker]
|
[link_checker]
|
||||||
skip_prefixes = [
|
skip_prefixes = [
|
||||||
"http://[2001:db8::]/",
|
"http://[2001:db8::]/",
|
||||||
|
"http://invaliddomain",
|
||||||
]
|
]
|
||||||
|
|
||||||
skip_anchor_prefixes = [
|
skip_anchor_prefixes = [
|
||||||
|
4
test_site/content/posts/skip_prefixes.md
Normal file
4
test_site/content/posts/skip_prefixes.md
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
+++
|
||||||
|
+++
|
||||||
|
|
||||||
|
[test skip 1](http://invaliddomain</)
|
Loading…
Reference in New Issue
Block a user