diff --git a/quickpeep_html_charset_detection/src/lib.rs b/quickpeep_html_charset_detection/src/lib.rs
index e5d21d6..7d3c6c2 100644
--- a/quickpeep_html_charset_detection/src/lib.rs
+++ b/quickpeep_html_charset_detection/src/lib.rs
@@ -76,9 +76,39 @@ pub fn extract_encoding_from_content_type_header(
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
let key = key_value.get(0).cloned().unwrap_or("");
if key.to_ascii_lowercase() == "charset" {
- let value = key_value.get(0).cloned().unwrap_or("");
+ let value = key_value.get(1).cloned().unwrap_or("");
return Encoding::for_label(value.as_bytes());
}
}
None
}
+
+#[cfg(test)]
+mod test {
+    use crate::sniff;
+    /// Checks `sniff` with no declaration, a `<meta>` declaration and a Content-Type header.
+    #[test]
+    fn test_simple_cases() {
+        assert_eq!(sniff(b"hi", true, None).name(), "windows-1252");
+        // In-document `<meta charset>` declaration (NOTE(review): literal reconstructed; confirm).
+        assert_eq!(
+            sniff(b"<meta charset=\"utf-8\">hi", true, None).name(),
+            "UTF-8"
+        );
+        // The charset parameter of the Content-Type header is passed as the third argument.
+        assert_eq!(
+            sniff(
+                b"hi",
+                true,
+                Some("text/html; charset=Shift-JIS".as_bytes())
+            )
+            .name(),
+            "Shift_JIS"
+        );
+        // Same label, declared in the document (NOTE(review): literal reconstructed; confirm).
+        assert_eq!(
+            sniff(b"<meta charset=\"Shift-JIS\">hi", true, None).name(),
+            "Shift_JIS"
+        );
+    }
+}
diff --git a/quickpeep_html_charset_detection/src/steps.rs b/quickpeep_html_charset_detection/src/steps.rs
index 72c2f1b..39dc63f 100644
--- a/quickpeep_html_charset_detection/src/steps.rs
+++ b/quickpeep_html_charset_detection/src/steps.rs
@@ -39,12 +39,12 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
// Loop: If position points to:
while position < bytes.len() {
// A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`' sequence) and comes
// after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
// the '") {
Some(location_of_closer) => {
position += location_of_closer + 3;
continue;
@@ -56,12 +56,15 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
}
// A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII ' Option<&'static Encoding> {
// Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
+ println!("att {key:?} {value:?}");
// If the attribute's name is already in attribute list, then return to the step labeled attributes.
if attributes.contains(&key) {
continue;
@@ -218,6 +222,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec, Vec)> {
// If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
// or 0x2F (/) then advance position to the next byte and redo this step.
+ println!("A{position}");
while matches!(
bytes.get(*position),
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
@@ -260,6 +265,8 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
attribute_name.push(new_byte.to_ascii_lowercase());
// Advance position to the next byte and return to the previous step.
+ *position += 1;
+ continue;
}
// Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step.
@@ -323,7 +330,10 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
// Process the byte at position as follows:
//
// If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
- if matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
+ if matches!(
+ bytes.get(*position),
+ Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
+ ) {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
return Some((attribute_name, attribute_value));
}