From 053fafa90ead15ede22aee67f1f5ed4aa8e48819 Mon Sep 17 00:00:00 2001 From: Conrad Irwin Date: Fri, 25 Apr 2025 16:53:44 -0600 Subject: [PATCH] Fix markdown escaping Closes #29255 Release Notes: - Improved handling of markdown escape sequences --- crates/markdown/examples/markdown.rs | 1 - crates/markdown/src/markdown.rs | 14 ++ crates/markdown/src/parser.rs | 297 +++++++++++++++++---------- 3 files changed, 200 insertions(+), 112 deletions(-) diff --git a/crates/markdown/examples/markdown.rs b/crates/markdown/examples/markdown.rs index d3e0fdd326..fbe2d4326f 100644 --- a/crates/markdown/examples/markdown.rs +++ b/crates/markdown/examples/markdown.rs @@ -21,7 +21,6 @@ function a(b: T) { } ``` - Remember, markdown processors may have slight differences and extensions, so always refer to the specific documentation or guides relevant to your platform or editor for the best practices and additional features. "#; diff --git a/crates/markdown/src/markdown.rs b/crates/markdown/src/markdown.rs index a6b7914d65..52376fe232 100644 --- a/crates/markdown/src/markdown.rs +++ b/crates/markdown/src/markdown.rs @@ -221,7 +221,12 @@ impl Markdown { .count(); if count > 0 { let mut output = String::with_capacity(s.len() + count); + let mut is_newline = false; for c in s.chars() { + if is_newline && c == ' ' { + continue; + } + is_newline = c == '\n'; if c == '\n' { output.push('\n') } else if c.is_ascii_punctuation() { @@ -1722,6 +1727,15 @@ mod tests { rendered.text } + #[test] + fn test_escape() { + assert_eq!(Markdown::escape("hello `world`"), "hello \\`world\\`"); + assert_eq!( + Markdown::escape("hello\n cool world"), + "hello\n\ncool world" + ); + } + #[track_caller] fn assert_mappings(rendered: &RenderedText, expected: Vec>) { assert_eq!(rendered.lines.len(), expected.len(), "line count mismatch"); diff --git a/crates/markdown/src/parser.rs b/crates/markdown/src/parser.rs index 84553f36d5..d039254906 100644 --- a/crates/markdown/src/parser.rs +++ b/crates/markdown/src/parser.rs @@ -2,14 +2,9 @@ use gpui::SharedString; use linkify::LinkFinder; pub use pulldown_cmark::TagEnd as MarkdownTagEnd; use pulldown_cmark::{ - Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser, -}; -use std::{ - collections::HashSet, - ops::{Deref, Range}, - path::Path, - sync::Arc, + Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser, }; +use std::{collections::HashSet, ops::Range, path::Path, sync::Arc}; use crate::path_range::PathWithRange; @@ -35,7 +30,10 @@ pub fn parse_markdown( let mut language_paths = HashSet::new(); let mut within_link = false; let mut within_metadata = false; - for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() { + let mut parser = Parser::new_ext(text, PARSE_OPTIONS) + .into_offset_iter() + .peekable(); + while let Some((pulldown_event, mut range)) = parser.next() { if within_metadata { if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) = pulldown_event @@ -175,47 +173,125 @@ pub fn parse_markdown( events.push((range, MarkdownEvent::End(tag))); } pulldown_cmark::Event::Text(parsed) => { - // `parsed` will share bytes with the input unless a substitution like handling of - // HTML entities or smart punctuation has occurred. When these substitutions occur, - // `parsed` only consists of the result of a single substitution. - if !cow_str_points_inside(&parsed, text) { - events.push((range, MarkdownEvent::SubstitutedText(parsed.into()))); - } else { - // Automatically detect links in text if not already within a markdown link. - if !within_link { - let mut finder = LinkFinder::new(); - finder.kinds(&[linkify::LinkKind::Url]); - let text_range = range.clone(); - for link in finder.links(&text[text_range.clone()]) { - let link_range = - text_range.start + link.start()..text_range.start + link.end(); + fn event_for( + text: &str, + range: Range, + str: &str, + ) -> (Range, MarkdownEvent) { + if str == &text[range.clone()] { + (range, MarkdownEvent::Text) + } else { + (range, MarkdownEvent::SubstitutedText(str.to_owned())) + } + } + struct TextRange<'a> { + source_range: Range, + merged_range: Range, + parsed: CowStr<'a>, + } - if link_range.start > range.start { - events.push((range.start..link_range.start, MarkdownEvent::Text)); - } + let mut last_len = parsed.len(); + let mut ranges = vec![TextRange { + source_range: range.clone(), + merged_range: 0..last_len, + parsed, + }]; - events.push(( - link_range.clone(), - MarkdownEvent::Start(MarkdownTag::Link { - link_type: LinkType::Autolink, - dest_url: SharedString::from(link.as_str().to_string()), - title: SharedString::default(), - id: SharedString::default(), - }), - )); + while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) { + let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next() + else { + unreachable!() + }; + let next_len = last_len + next_event.len(); + ranges.push(TextRange { + source_range: next_range.clone(), + merged_range: last_len..next_len, + parsed: next_event, + }); + last_len = next_len; + } - events.push((link_range.clone(), MarkdownEvent::Text)); - events.push(( - link_range.clone(), - MarkdownEvent::End(MarkdownTagEnd::Link), - )); + let mut merged_text = + String::with_capacity(ranges.last().unwrap().merged_range.end); + for range in &ranges { + merged_text.push_str(&range.parsed); + } - range.start = link_range.end; + let mut ranges = ranges.into_iter().peekable(); + + if !within_link { + let mut finder = LinkFinder::new(); + finder.kinds(&[linkify::LinkKind::Url]); + + // Find links in the merged text + for link in finder.links(&merged_text) { + let link_start_in_merged = link.start(); + let link_end_in_merged = link.end(); + + while ranges + .peek() + .is_some_and(|range| range.merged_range.end <= link_start_in_merged) + { + let range = ranges.next().unwrap(); + events.push(event_for(text, range.source_range, &range.parsed)); } + + let range = ranges.peek_mut().unwrap(); + let prefix_len = link_start_in_merged - range.merged_range.start; + if prefix_len > 0 { + let (head, tail) = range.parsed.split_at(prefix_len); + events.push(event_for( + text, + range.source_range.start..range.source_range.start + prefix_len, + &head, + )); + range.parsed = CowStr::Boxed(tail.into()); + range.merged_range.start += prefix_len; + range.source_range.start += prefix_len; + } + + let link_start_in_source = range.source_range.start; + let mut link_events = Vec::new(); + + while ranges + .peek() + .is_some_and(|range| range.merged_range.end <= link_end_in_merged) + { + let range = ranges.next().unwrap(); + link_events.push(event_for(text, range.source_range, &range.parsed)); + } + + let range = ranges.peek_mut().unwrap(); + let prefix_len = link_end_in_merged - range.merged_range.start; + if prefix_len > 0 { + let (head, tail) = range.parsed.split_at(prefix_len); + link_events.push(event_for( + text, + range.source_range.start..range.source_range.start + prefix_len, + head, + )); + range.parsed = CowStr::Boxed(tail.into()); + range.merged_range.start += prefix_len; + range.source_range.start += prefix_len; + } + let link_range = link_start_in_source..range.source_range.start; + + events.push(( + link_range.clone(), + MarkdownEvent::Start(MarkdownTag::Link { + link_type: LinkType::Autolink, + dest_url: SharedString::from(link.as_str().to_string()), + title: SharedString::default(), + id: SharedString::default(), + }), + )); + events.extend(link_events); + events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link))); } - if range.start < range.end { - events.push((range, MarkdownEvent::Text)); - } + } + + for range in ranges { + events.push(event_for(text, range.source_range, &range.parsed)); } } pulldown_cmark::Event::Code(_) => { @@ -291,7 +367,7 @@ pub enum MarkdownEvent { Text, /// Text that differs from the markdown source - typically due to substitution of HTML entities /// and smart punctuation. - SubstitutedText(CompactStr), + SubstitutedText(String), /// An inline code node. Code, /// An HTML node. @@ -429,73 +505,6 @@ pub(crate) fn extract_code_block_content_range(text: &str) -> Range { range } -/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText` -/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases. -/// -/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case. -#[derive(Clone)] -pub enum CompactStr { - Boxed(Box), - Inlined(InlineStr), -} - -impl std::fmt::Debug for CompactStr { - fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { - self.deref().fmt(formatter) - } -} - -impl Deref for CompactStr { - type Target = str; - - fn deref(&self) -> &str { - match self { - CompactStr::Boxed(b) => b, - CompactStr::Inlined(i) => i, - } - } -} - -impl From<&str> for CompactStr { - fn from(s: &str) -> Self { - if let Ok(inlined) = s.try_into() { - CompactStr::Inlined(inlined) - } else { - CompactStr::Boxed(s.into()) - } - } -} - -impl From> for CompactStr { - fn from(cow_str: pulldown_cmark::CowStr) -> Self { - match cow_str { - pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b), - pulldown_cmark::CowStr::Borrowed(b) => b.into(), - pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i), - } - } -} - -impl PartialEq for CompactStr { - fn eq(&self, other: &Self) -> bool { - self.deref() == other.deref() - } -} - -fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool { - match substring { - pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container), - pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container), - pulldown_cmark::CowStr::Inlined(_) => false, - } -} - -fn str_points_inside(substring: &str, container: &str) -> bool { - let substring_ptr = substring.as_ptr(); - let container_ptr = container.as_ptr(); - unsafe { substring_ptr >= container_ptr && substring_ptr < container_ptr.add(container.len()) } -} - #[cfg(test)] mod tests { use super::MarkdownEvent::*; @@ -621,4 +630,70 @@ mod tests { let input = "```python\nprint('hello')\nprint('world')\n```"; assert_eq!(extract_code_block_content_range(input), 10..40); } + + #[test] + fn test_links_split_across_fragments() { + // This test verifies that links split across multiple text fragments due to escaping or other issues + // are correctly detected and processed + // Note: In real usage, pulldown_cmark creates separate text events for the escaped character + // We're verifying our parser can handle this correctly + assert_eq!( + parse_markdown("https:/\\/example.com is equivalent to https://example.com!").0, + vec![ + (0..62, Start(Paragraph)), + ( + 0..20, + Start(Link { + link_type: LinkType::Autolink, + dest_url: "https://example.com".into(), + title: "".into(), + id: "".into() + }) + ), + (0..7, Text), + (8..20, Text), + (0..20, End(MarkdownTagEnd::Link)), + (20..38, Text), + ( + 38..61, + Start(Link { + link_type: LinkType::Autolink, + dest_url: "https://example.com".into(), + title: "".into(), + id: "".into() + }) + ), + (38..53, Text), + (53..58, SubstitutedText(".".into())), + (58..61, Text), + (38..61, End(MarkdownTagEnd::Link)), + (61..62, Text), + (0..62, End(MarkdownTagEnd::Paragraph)) + ], + ); + + assert_eq!( + parse_markdown("Visit https://example.com/cat\\/é‍☕ for coffee!").0, + [ + (0..55, Start(Paragraph)), + (0..6, Text), + ( + 6..43, + Start(Link { + link_type: LinkType::Autolink, + dest_url: "https://example.com/cat/é\u{200d}☕".into(), + title: "".into(), + id: "".into() + }) + ), + (6..29, Text), + (30..33, Text), + (33..40, SubstitutedText("\u{200d}".into())), + (40..43, Text), + (6..43, End(MarkdownTagEnd::Link)), + (43..55, Text), + (0..55, End(MarkdownTagEnd::Paragraph)) + ] + ); + } }