diff --git a/quickpeep_html_charset_detection/src/lib.rs b/quickpeep_html_charset_detection/src/lib.rs index e5d21d6..7d3c6c2 100644 --- a/quickpeep_html_charset_detection/src/lib.rs +++ b/quickpeep_html_charset_detection/src/lib.rs @@ -76,9 +76,39 @@ pub fn extract_encoding_from_content_type_header( let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect(); let key = key_value.get(0).cloned().unwrap_or(""); if key.to_ascii_lowercase() == "charset" { - let value = key_value.get(0).cloned().unwrap_or(""); + let value = key_value.get(1).cloned().unwrap_or(""); return Encoding::for_label(value.as_bytes()); } } None } + +#[cfg(test)] +mod test { + use crate::sniff; + + #[test] + fn test_simple_cases() { + assert_eq!(sniff(b"hi", true, None).name(), "windows-1252"); + + assert_eq!( + sniff(b"hi", true, None).name(), + "UTF-8" + ); + + assert_eq!( + sniff( + b"hi", + true, + Some("text/html; charset=Shift-JIS".as_bytes()) + ) + .name(), + "Shift_JIS" + ); + + assert_eq!( + sniff(b"hi", true, None).name(), + "Shift_JIS" + ); + } +} diff --git a/quickpeep_html_charset_detection/src/steps.rs b/quickpeep_html_charset_detection/src/steps.rs index 72c2f1b..39dc63f 100644 --- a/quickpeep_html_charset_detection/src/steps.rs +++ b/quickpeep_html_charset_detection/src/steps.rs @@ -39,12 +39,12 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> { // Loop: If position points to: while position < bytes.len() { // A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`' sequence) and comes // after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in // the '") { Some(location_of_closer) => { position += location_of_closer + 3; continue; @@ -56,12 +56,15 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> { } // A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII ' Option<&'static Encoding> { // Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below. while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) { + println!("att {key:?} {value:?}"); // If the attribute's name is already in attribute list, then return to the step labeled attributes. if attributes.contains(&key) { continue; @@ -218,6 +222,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> { pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec, Vec)> { // If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), // or 0x2F (/) then advance position to the next byte and redo this step. + println!("A{position}"); while matches!( bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F) @@ -260,6 +265,8 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec< attribute_name.push(new_byte.to_ascii_lowercase()); // Advance position to the next byte and return to the previous step. + *position += 1; + continue; } // Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step. @@ -323,7 +330,10 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec< // Process the byte at position as follows: // // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) - if matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) { + if matches!( + bytes.get(*position), + Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E) + ) { // Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value. return Some((attribute_name, attribute_value)); }