Fix markdown escaping
Closes #29255 Release Notes: - Improved handling of markdown escape sequences
This commit is contained in:
parent
d23024609f
commit
053fafa90e
@ -21,7 +21,6 @@ function a(b: T) {
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
Remember, markdown processors may have slight differences and extensions, so always refer to the specific documentation or guides relevant to your platform or editor for the best practices and additional features.
|
||||
"#;
|
||||
|
||||
|
@ -221,7 +221,12 @@ impl Markdown {
|
||||
.count();
|
||||
if count > 0 {
|
||||
let mut output = String::with_capacity(s.len() + count);
|
||||
let mut is_newline = false;
|
||||
for c in s.chars() {
|
||||
if is_newline && c == ' ' {
|
||||
continue;
|
||||
}
|
||||
is_newline = c == '\n';
|
||||
if c == '\n' {
|
||||
output.push('\n')
|
||||
} else if c.is_ascii_punctuation() {
|
||||
@ -1722,6 +1727,15 @@ mod tests {
|
||||
rendered.text
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escape() {
|
||||
assert_eq!(Markdown::escape("hello `world`"), "hello \\`world\\`");
|
||||
assert_eq!(
|
||||
Markdown::escape("hello\n cool world"),
|
||||
"hello\n\ncool world"
|
||||
);
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn assert_mappings(rendered: &RenderedText, expected: Vec<Vec<(usize, usize)>>) {
|
||||
assert_eq!(rendered.lines.len(), expected.len(), "line count mismatch");
|
||||
|
@ -2,14 +2,9 @@ use gpui::SharedString;
|
||||
use linkify::LinkFinder;
|
||||
pub use pulldown_cmark::TagEnd as MarkdownTagEnd;
|
||||
use pulldown_cmark::{
|
||||
Alignment, HeadingLevel, InlineStr, LinkType, MetadataBlockKind, Options, Parser,
|
||||
};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
ops::{Deref, Range},
|
||||
path::Path,
|
||||
sync::Arc,
|
||||
Alignment, CowStr, HeadingLevel, LinkType, MetadataBlockKind, Options, Parser,
|
||||
};
|
||||
use std::{collections::HashSet, ops::Range, path::Path, sync::Arc};
|
||||
|
||||
use crate::path_range::PathWithRange;
|
||||
|
||||
@ -35,7 +30,10 @@ pub fn parse_markdown(
|
||||
let mut language_paths = HashSet::new();
|
||||
let mut within_link = false;
|
||||
let mut within_metadata = false;
|
||||
for (pulldown_event, mut range) in Parser::new_ext(text, PARSE_OPTIONS).into_offset_iter() {
|
||||
let mut parser = Parser::new_ext(text, PARSE_OPTIONS)
|
||||
.into_offset_iter()
|
||||
.peekable();
|
||||
while let Some((pulldown_event, mut range)) = parser.next() {
|
||||
if within_metadata {
|
||||
if let pulldown_cmark::Event::End(pulldown_cmark::TagEnd::MetadataBlock { .. }) =
|
||||
pulldown_event
|
||||
@ -175,47 +173,125 @@ pub fn parse_markdown(
|
||||
events.push((range, MarkdownEvent::End(tag)));
|
||||
}
|
||||
pulldown_cmark::Event::Text(parsed) => {
|
||||
// `parsed` will share bytes with the input unless a substitution like handling of
|
||||
// HTML entities or smart punctuation has occurred. When these substitutions occur,
|
||||
// `parsed` only consists of the result of a single substitution.
|
||||
if !cow_str_points_inside(&parsed, text) {
|
||||
events.push((range, MarkdownEvent::SubstitutedText(parsed.into())));
|
||||
} else {
|
||||
// Automatically detect links in text if not already within a markdown link.
|
||||
if !within_link {
|
||||
let mut finder = LinkFinder::new();
|
||||
finder.kinds(&[linkify::LinkKind::Url]);
|
||||
let text_range = range.clone();
|
||||
for link in finder.links(&text[text_range.clone()]) {
|
||||
let link_range =
|
||||
text_range.start + link.start()..text_range.start + link.end();
|
||||
fn event_for(
|
||||
text: &str,
|
||||
range: Range<usize>,
|
||||
str: &str,
|
||||
) -> (Range<usize>, MarkdownEvent) {
|
||||
if str == &text[range.clone()] {
|
||||
(range, MarkdownEvent::Text)
|
||||
} else {
|
||||
(range, MarkdownEvent::SubstitutedText(str.to_owned()))
|
||||
}
|
||||
}
|
||||
struct TextRange<'a> {
|
||||
source_range: Range<usize>,
|
||||
merged_range: Range<usize>,
|
||||
parsed: CowStr<'a>,
|
||||
}
|
||||
|
||||
if link_range.start > range.start {
|
||||
events.push((range.start..link_range.start, MarkdownEvent::Text));
|
||||
}
|
||||
let mut last_len = parsed.len();
|
||||
let mut ranges = vec![TextRange {
|
||||
source_range: range.clone(),
|
||||
merged_range: 0..last_len,
|
||||
parsed,
|
||||
}];
|
||||
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Start(MarkdownTag::Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: SharedString::from(link.as_str().to_string()),
|
||||
title: SharedString::default(),
|
||||
id: SharedString::default(),
|
||||
}),
|
||||
));
|
||||
while matches!(parser.peek(), Some((pulldown_cmark::Event::Text(_), _))) {
|
||||
let Some((pulldown_cmark::Event::Text(next_event), next_range)) = parser.next()
|
||||
else {
|
||||
unreachable!()
|
||||
};
|
||||
let next_len = last_len + next_event.len();
|
||||
ranges.push(TextRange {
|
||||
source_range: next_range.clone(),
|
||||
merged_range: last_len..next_len,
|
||||
parsed: next_event,
|
||||
});
|
||||
last_len = next_len;
|
||||
}
|
||||
|
||||
events.push((link_range.clone(), MarkdownEvent::Text));
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::End(MarkdownTagEnd::Link),
|
||||
));
|
||||
let mut merged_text =
|
||||
String::with_capacity(ranges.last().unwrap().merged_range.end);
|
||||
for range in &ranges {
|
||||
merged_text.push_str(&range.parsed);
|
||||
}
|
||||
|
||||
range.start = link_range.end;
|
||||
let mut ranges = ranges.into_iter().peekable();
|
||||
|
||||
if !within_link {
|
||||
let mut finder = LinkFinder::new();
|
||||
finder.kinds(&[linkify::LinkKind::Url]);
|
||||
|
||||
// Find links in the merged text
|
||||
for link in finder.links(&merged_text) {
|
||||
let link_start_in_merged = link.start();
|
||||
let link_end_in_merged = link.end();
|
||||
|
||||
while ranges
|
||||
.peek()
|
||||
.is_some_and(|range| range.merged_range.end <= link_start_in_merged)
|
||||
{
|
||||
let range = ranges.next().unwrap();
|
||||
events.push(event_for(text, range.source_range, &range.parsed));
|
||||
}
|
||||
|
||||
let range = ranges.peek_mut().unwrap();
|
||||
let prefix_len = link_start_in_merged - range.merged_range.start;
|
||||
if prefix_len > 0 {
|
||||
let (head, tail) = range.parsed.split_at(prefix_len);
|
||||
events.push(event_for(
|
||||
text,
|
||||
range.source_range.start..range.source_range.start + prefix_len,
|
||||
&head,
|
||||
));
|
||||
range.parsed = CowStr::Boxed(tail.into());
|
||||
range.merged_range.start += prefix_len;
|
||||
range.source_range.start += prefix_len;
|
||||
}
|
||||
|
||||
let link_start_in_source = range.source_range.start;
|
||||
let mut link_events = Vec::new();
|
||||
|
||||
while ranges
|
||||
.peek()
|
||||
.is_some_and(|range| range.merged_range.end <= link_end_in_merged)
|
||||
{
|
||||
let range = ranges.next().unwrap();
|
||||
link_events.push(event_for(text, range.source_range, &range.parsed));
|
||||
}
|
||||
|
||||
let range = ranges.peek_mut().unwrap();
|
||||
let prefix_len = link_end_in_merged - range.merged_range.start;
|
||||
if prefix_len > 0 {
|
||||
let (head, tail) = range.parsed.split_at(prefix_len);
|
||||
link_events.push(event_for(
|
||||
text,
|
||||
range.source_range.start..range.source_range.start + prefix_len,
|
||||
head,
|
||||
));
|
||||
range.parsed = CowStr::Boxed(tail.into());
|
||||
range.merged_range.start += prefix_len;
|
||||
range.source_range.start += prefix_len;
|
||||
}
|
||||
let link_range = link_start_in_source..range.source_range.start;
|
||||
|
||||
events.push((
|
||||
link_range.clone(),
|
||||
MarkdownEvent::Start(MarkdownTag::Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: SharedString::from(link.as_str().to_string()),
|
||||
title: SharedString::default(),
|
||||
id: SharedString::default(),
|
||||
}),
|
||||
));
|
||||
events.extend(link_events);
|
||||
events.push((link_range.clone(), MarkdownEvent::End(MarkdownTagEnd::Link)));
|
||||
}
|
||||
if range.start < range.end {
|
||||
events.push((range, MarkdownEvent::Text));
|
||||
}
|
||||
}
|
||||
|
||||
for range in ranges {
|
||||
events.push(event_for(text, range.source_range, &range.parsed));
|
||||
}
|
||||
}
|
||||
pulldown_cmark::Event::Code(_) => {
|
||||
@ -291,7 +367,7 @@ pub enum MarkdownEvent {
|
||||
Text,
|
||||
/// Text that differs from the markdown source - typically due to substitution of HTML entities
|
||||
/// and smart punctuation.
|
||||
SubstitutedText(CompactStr),
|
||||
SubstitutedText(String),
|
||||
/// An inline code node.
|
||||
Code,
|
||||
/// An HTML node.
|
||||
@ -429,73 +505,6 @@ pub(crate) fn extract_code_block_content_range(text: &str) -> Range<usize> {
|
||||
range
|
||||
}
|
||||
|
||||
/// Represents either an owned or inline string. Motivation for this is to make `SubstitutedText`
|
||||
/// more efficient - it fits within a `pulldown_cmark::InlineStr` in all known cases.
|
||||
///
|
||||
/// Same as `pulldown_cmark::CowStr` but without the `Borrow` case.
|
||||
#[derive(Clone)]
|
||||
pub enum CompactStr {
|
||||
Boxed(Box<str>),
|
||||
Inlined(InlineStr),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for CompactStr {
|
||||
fn fmt(&self, formatter: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
|
||||
self.deref().fmt(formatter)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for CompactStr {
|
||||
type Target = str;
|
||||
|
||||
fn deref(&self) -> &str {
|
||||
match self {
|
||||
CompactStr::Boxed(b) => b,
|
||||
CompactStr::Inlined(i) => i,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for CompactStr {
|
||||
fn from(s: &str) -> Self {
|
||||
if let Ok(inlined) = s.try_into() {
|
||||
CompactStr::Inlined(inlined)
|
||||
} else {
|
||||
CompactStr::Boxed(s.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<pulldown_cmark::CowStr<'_>> for CompactStr {
|
||||
fn from(cow_str: pulldown_cmark::CowStr) -> Self {
|
||||
match cow_str {
|
||||
pulldown_cmark::CowStr::Boxed(b) => CompactStr::Boxed(b),
|
||||
pulldown_cmark::CowStr::Borrowed(b) => b.into(),
|
||||
pulldown_cmark::CowStr::Inlined(i) => CompactStr::Inlined(i),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for CompactStr {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.deref() == other.deref()
|
||||
}
|
||||
}
|
||||
|
||||
fn cow_str_points_inside(substring: &pulldown_cmark::CowStr, container: &str) -> bool {
|
||||
match substring {
|
||||
pulldown_cmark::CowStr::Boxed(b) => str_points_inside(b, container),
|
||||
pulldown_cmark::CowStr::Borrowed(b) => str_points_inside(b, container),
|
||||
pulldown_cmark::CowStr::Inlined(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn str_points_inside(substring: &str, container: &str) -> bool {
|
||||
let substring_ptr = substring.as_ptr();
|
||||
let container_ptr = container.as_ptr();
|
||||
unsafe { substring_ptr >= container_ptr && substring_ptr < container_ptr.add(container.len()) }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::MarkdownEvent::*;
|
||||
@ -621,4 +630,70 @@ mod tests {
|
||||
let input = "```python\nprint('hello')\nprint('world')\n```";
|
||||
assert_eq!(extract_code_block_content_range(input), 10..40);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_links_split_across_fragments() {
|
||||
// This test verifies that links split across multiple text fragments due to escaping or other issues
|
||||
// are correctly detected and processed
|
||||
// Note: In real usage, pulldown_cmark creates separate text events for the escaped character
|
||||
// We're verifying our parser can handle this correctly
|
||||
assert_eq!(
|
||||
parse_markdown("https:/\\/example.com is equivalent to https://example.com!").0,
|
||||
vec![
|
||||
(0..62, Start(Paragraph)),
|
||||
(
|
||||
0..20,
|
||||
Start(Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: "https://example.com".into(),
|
||||
title: "".into(),
|
||||
id: "".into()
|
||||
})
|
||||
),
|
||||
(0..7, Text),
|
||||
(8..20, Text),
|
||||
(0..20, End(MarkdownTagEnd::Link)),
|
||||
(20..38, Text),
|
||||
(
|
||||
38..61,
|
||||
Start(Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: "https://example.com".into(),
|
||||
title: "".into(),
|
||||
id: "".into()
|
||||
})
|
||||
),
|
||||
(38..53, Text),
|
||||
(53..58, SubstitutedText(".".into())),
|
||||
(58..61, Text),
|
||||
(38..61, End(MarkdownTagEnd::Link)),
|
||||
(61..62, Text),
|
||||
(0..62, End(MarkdownTagEnd::Paragraph))
|
||||
],
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
parse_markdown("Visit https://example.com/cat\\/é‍☕ for coffee!").0,
|
||||
[
|
||||
(0..55, Start(Paragraph)),
|
||||
(0..6, Text),
|
||||
(
|
||||
6..43,
|
||||
Start(Link {
|
||||
link_type: LinkType::Autolink,
|
||||
dest_url: "https://example.com/cat/é\u{200d}☕".into(),
|
||||
title: "".into(),
|
||||
id: "".into()
|
||||
})
|
||||
),
|
||||
(6..29, Text),
|
||||
(30..33, Text),
|
||||
(33..40, SubstitutedText("\u{200d}".into())),
|
||||
(40..43, Text),
|
||||
(6..43, End(MarkdownTagEnd::Link)),
|
||||
(43..55, Text),
|
||||
(0..55, End(MarkdownTagEnd::Paragraph))
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user