From 13682e1342e2c0135d7a99ffb4c065f677665afb Mon Sep 17 00:00:00 2001 From: Timon Van Overveldt Date: Sun, 18 Feb 2024 12:57:17 -0800 Subject: [PATCH] Update pulldown_cmark dep to v0.10, and add pulldown_cmark_escape dep. (#2432) The pulldown_cmark escaping functionality is now shipped in a separate pulldown_cmark_escape crate (https://crates.io/crates/pulldown-cmark-escape), starting with v0.10.0. The markdown.rs module has to be adapted to a few API changes in pulldown_cmark, and we have to introduce explicit handling of alt text to ensure it continues to be properly escaped. There are also a few other behavior changes that are caught by the tests, but these actually seem to be desired, so I've updated the insta snapshot files for those tests to incorporate those changes. Specifically, one footnote-parsing case seems to be handled better now, and pulldown-cmark's `push_html` now doesn't escape quotes in text nodes anymore (see https://github.com/pulldown-cmark/pulldown-cmark/pull/836). --- CHANGELOG.md | 1 + Cargo.lock | 23 ++++- components/libs/Cargo.toml | 3 +- components/libs/src/lib.rs | 1 + components/markdown/src/markdown.rs | 88 +++++++++++++------ ...wn__all_markdown_features_integration.snap | 6 +- .../markdown__can_handle_heading_ids-2.snap | 8 +- .../markdown__can_handle_heading_ids.snap | 6 +- ...des__doesnt_render_ignored_shortcodes.snap | 6 +- 9 files changed, 94 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44bd027a..7edc406a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 0.19.0 (unreleased) +- Updates the pulldown-cmark dependency to v0.10.0. This improves footnote handling, and may also introduce some minor behavior changes such as reducing the amount of unnecessary HTML-escaping of text content. ## 0.18.0 (2023-12-18) diff --git a/Cargo.lock b/Cargo.lock index 26050e06..0d90eea7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1736,7 +1736,8 @@ dependencies = [ "num-format", "once_cell", "percent-encoding", - "pulldown-cmark", + "pulldown-cmark 0.10.0", + "pulldown-cmark-escape", "quickxml_to_serde", "rayon", "regex", @@ -2775,6 +2776,24 @@ dependencies = [ "unicase", ] +[[package]] +name = "pulldown-cmark" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce76ce678ffc8e5675b22aa1405de0b7037e2fdf8913fea40d1926c6fe1e6e7" +dependencies = [ + "bitflags 2.4.1", + "memchr", + "pulldown-cmark-escape", + "unicase", +] + +[[package]] +name = "pulldown-cmark-escape" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d8f9aa0e3cbcfaf8bf00300004ee3b72f74770f9cbac93f6928771f613276b" + [[package]] name = "pure-rust-locales" version = "0.7.0" @@ -3341,7 +3360,7 @@ dependencies = [ "cargo_metadata", "error-chain", "glob", - "pulldown-cmark", + "pulldown-cmark 0.9.3", "tempfile", "walkdir", ] diff --git a/components/libs/Cargo.toml b/components/libs/Cargo.toml index 5f551ea6..9f6abf1d 100644 --- a/components/libs/Cargo.toml +++ b/components/libs/Cargo.toml @@ -21,7 +21,8 @@ nom-bibtex = "0.5" num-format = "0.4" once_cell = "1" percent-encoding = "2" -pulldown-cmark = { version = "0.9", default-features = false, features = ["simd"] } +pulldown-cmark = { version = "0.10", default-features = false, features = ["html", "simd"] } +pulldown-cmark-escape = { version = "0.10", default-features = false } quickxml_to_serde = "0.5" rayon = "1" regex = "1" diff --git a/components/libs/src/lib.rs b/components/libs/src/lib.rs index d0772dc0..3992c2ed 100644 --- a/components/libs/src/lib.rs +++ b/components/libs/src/lib.rs @@ -23,6 +23,7 @@ pub use num_format; pub use once_cell; pub use percent_encoding; pub use pulldown_cmark; +pub use pulldown_cmark_escape; pub use quickxml_to_serde; pub use rayon; pub use regex; diff --git a/components/markdown/src/markdown.rs b/components/markdown/src/markdown.rs index 4d30b2ac..40c94afa 100644 --- a/components/markdown/src/markdown.rs +++ b/components/markdown/src/markdown.rs @@ -4,19 +4,20 @@ use errors::bail; use libs::gh_emoji::Replacer as EmojiReplacer; use libs::once_cell::sync::Lazy; use libs::pulldown_cmark as cmark; +use libs::pulldown_cmark_escape as cmark_escape; use libs::tera; use utils::net::is_external_link; use crate::context::RenderContext; use errors::{Context, Error, Result}; -use libs::pulldown_cmark::escape::escape_html; +use libs::pulldown_cmark_escape::escape_html; use libs::regex::{Regex, RegexBuilder}; use utils::site::resolve_internal_link; use utils::slugs::slugify_anchors; use utils::table_of_contents::{make_table_of_contents, Heading}; use utils::types::InsertAnchor; -use self::cmark::{Event, LinkType, Options, Parser, Tag}; +use self::cmark::{Event, LinkType, Options, Parser, Tag, TagEnd}; use crate::codeblock::{CodeBlock, FenceSettings}; use crate::shortcode::{Shortcode, SHORTCODE_PLACEHOLDER}; @@ -220,15 +221,15 @@ fn get_heading_refs(events: &[Event]) -> Vec { for (i, event) in events.iter().enumerate() { match event { - Event::Start(Tag::Heading(level, anchor, classes)) => { + Event::Start(Tag::Heading { level, id, classes, .. }) => { heading_refs.push(HeadingRef::new( i, *level as u32, - anchor.map(|a| a.to_owned()), + id.clone().map(|a| a.to_string()), &classes.iter().map(|x| x.to_string()).collect::>(), )); } - Event::End(Tag::Heading(_, _, _)) => { + Event::End(TagEnd::Heading { .. }) => { heading_refs.last_mut().expect("Heading end before start?").end_idx = i; } _ => (), @@ -254,6 +255,10 @@ pub fn markdown_to_html( let mut error = None; let mut code_block: Option = None; + // Indicates whether we're in the middle of parsing a text node which will be placed in an HTML + // attribute, and which hence has to be escaped using escape_html rather than push_html's + // default HTML body escaping for text nodes. + let mut inside_attribute = false; let mut headings: Vec = vec![]; let mut internal_links = Vec::new(); @@ -294,12 +299,19 @@ pub fn markdown_to_html( // we have some text before the shortcode, push that first if $range.start != sc_span.start { - let content = $text[($range.start - orig_range_start) - ..(sc_span.start - orig_range_start)] - .to_string() - .into(); + let content: cmark::CowStr<'_> = + $text[($range.start - orig_range_start) + ..(sc_span.start - orig_range_start)] + .to_string() + .into(); events.push(if $is_text { - Event::Text(content) + if inside_attribute { + let mut buffer = "".to_string(); + escape_html(&mut buffer, content.as_ref()).unwrap(); + Event::Html(buffer.into()) + } else { + Event::Text(content) + } } else { Event::Html(content) }); @@ -370,7 +382,13 @@ pub fn markdown_to_html( }; if !contains_shortcode(text.as_ref()) { - events.push(Event::Text(text)); + if inside_attribute { + let mut buffer = "".to_string(); + escape_html(&mut buffer, text.as_ref()).unwrap(); + events.push(Event::Html(buffer.into())); + } else { + events.push(Event::Text(text)); + } continue; } @@ -386,7 +404,7 @@ pub fn markdown_to_html( code_block = Some(block); events.push(Event::Html(begin.into())); } - Event::End(Tag::CodeBlock(_)) => { + Event::End(TagEnd::CodeBlock { .. }) => { if let Some(ref mut code_block) = code_block { let html = code_block.highlight(&accumulated_block); events.push(Event::Html(html.into())); @@ -397,44 +415,53 @@ pub fn markdown_to_html( code_block = None; events.push(Event::Html("\n".into())); } - Event::Start(Tag::Image(link_type, src, title)) => { - let link = if is_colocated_asset_link(&src) { - let link = format!("{}{}", context.current_page_permalink, &*src); + Event::Start(Tag::Image { link_type, dest_url, title, id }) => { + let link = if is_colocated_asset_link(&dest_url) { + let link = format!("{}{}", context.current_page_permalink, &*dest_url); link.into() } else { - src + dest_url }; events.push(if lazy_async_image { let mut img_before_alt: String = "\"").expect("Could events.push(if lazy_async_image { + Event::End(TagEnd::Image) => events.push(if lazy_async_image { Event::Html("\" loading=\"lazy\" decoding=\"async\" />".into()) } else { event }), - Event::Start(Tag::Link(link_type, link, title)) if link.is_empty() => { + Event::Start(Tag::Link { link_type, dest_url, title, id }) + if dest_url.is_empty() => + { error = Some(Error::msg("There is a link that is missing a URL")); - events.push(Event::Start(Tag::Link(link_type, "#".into(), title))); + events.push(Event::Start(Tag::Link { + link_type, + dest_url: "#".into(), + title, + id, + })); } - Event::Start(Tag::Link(link_type, link, title)) => { + Event::Start(Tag::Link { link_type, dest_url, title, id }) => { let fixed_link = match fix_link( link_type, - &link, + &dest_url, context, &mut internal_links, &mut external_links, @@ -448,12 +475,12 @@ pub fn markdown_to_html( }; events.push( - if is_external_link(&link) + if is_external_link(&dest_url) && context.config.markdown.has_external_link_tweaks() { let mut escaped = String::new(); // write_str can fail but here there are no reasons it should (afaik?) - cmark::escape::escape_href(&mut escaped, &link) + cmark_escape::escape_href(&mut escaped, &dest_url) .expect("Could not write to buffer"); Event::Html( context @@ -463,7 +490,12 @@ pub fn markdown_to_html( .into(), ) } else { - Event::Start(Tag::Link(link_type, fixed_link.into(), title)) + Event::Start(Tag::Link { + link_type, + dest_url: fixed_link.into(), + title, + id, + }) }, ) } @@ -485,7 +517,7 @@ pub fn markdown_to_html( events.push(event); } - Event::End(Tag::Paragraph) => { + Event::End(TagEnd::Paragraph) => { events.push(if stop_next_end_p { stop_next_end_p = false; Event::Html("".into()) diff --git a/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap b/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap index 72aedc4c..9e95d418 100644 --- a/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap +++ b/components/markdown/tests/snapshots/markdown__all_markdown_features_integration.snap @@ -1,8 +1,6 @@ --- -source: components/rendering/tests/markdown.rs -assertion_line: 358 +source: components/markdown/tests/markdown.rs expression: body - ---

h1 Heading

@@ -83,7 +81,7 @@ line 1 of code line 2 of code line 3 of code -

Block code "fences"

+

Block code "fences"

Sample text here...
 

Syntax highlighting

diff --git a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap index d2847a8e..e9fabcc4 100644 --- a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap +++ b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids-2.snap @@ -1,12 +1,10 @@ --- -source: components/rendering/tests/markdown.rs -assertion_line: 84 +source: components/markdown/tests/markdown.rs expression: body - ---

Hello

Hello

-

L'écologie et vous

+

L'écologie et vous

Hello

Hello

Hello

@@ -22,6 +20,6 @@ expression: body

text 1 there

1

footnote

-

Classes

+

Classes

diff --git a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap index 87cce483..663be594 100644 --- a/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap +++ b/components/markdown/tests/snapshots/markdown__can_handle_heading_ids.snap @@ -1,8 +1,6 @@ --- -source: components/rendering/tests/markdown.rs -assertion_line: 79 +source: components/markdown/tests/markdown.rs expression: body - ---

Hello

Hello

@@ -22,6 +20,6 @@ expression: body

text 1 there

1

footnote

-

Classes

+

Classes

diff --git a/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap b/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap index 64d9bfba..15c94e46 100644 --- a/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap +++ b/components/markdown/tests/snapshots/shortcodes__doesnt_render_ignored_shortcodes.snap @@ -1,8 +1,6 @@ --- -source: components/rendering/tests/shortcodes.rs -assertion_line: 104 +source: components/markdown/tests/shortcodes.rs expression: body - --- -

{{ youtube(id="w7Ft2ymGmfc") }}

+

{{ youtube(id="w7Ft2ymGmfc") }}