Add tests and fixes

2022-06-12 16:30:36 +01:00 · 2022-06-12 16:30:36 +01:00 · 4896ecd426
commit 4896ecd426
parent aa4567c623
2 changed files with 45 additions and 5 deletions
--- a/quickpeep_html_charset_detection/src/lib.rs
+++ b/quickpeep_html_charset_detection/src/lib.rs
@ -76,9 +76,39 @@ pub fn extract_encoding_from_content_type_header(
        let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
        let key = key_value.get(0).cloned().unwrap_or("");
        if key.to_ascii_lowercase() == "charset" {
-            let value = key_value.get(0).cloned().unwrap_or("");
+            let value = key_value.get(1).cloned().unwrap_or("");
            return Encoding::for_label(value.as_bytes());
        }
    }
    None
 }
+
+// Smoke tests for `sniff`: each case feeds a small HTML byte string and checks the
+// `name()` of the encoding that comes back.
+// NOTE(review): the meaning of the second argument (`true` in every case) is not
+// visible from this module — confirm against `sniff`'s signature.
+#[cfg(test)]
+mod test {
+    use crate::sniff;
+
+    #[test]
+    fn test_simple_cases() {
+        // No charset declared in the markup and no third argument: expect windows-1252.
+        assert_eq!(sniff(b"<u>hi</u>", true, None).name(), "windows-1252");
+
+        // `<meta charset=UTF8>` in the markup: the label `UTF8` is expected to yield "UTF-8".
+        assert_eq!(
+            sniff(b"<meta charset=UTF8><u>hi</u>", true, None).name(),
+            "UTF-8"
+        );
+
+        // Same markup, but the third argument carries `text/html; charset=Shift-JIS`
+        // (presumably the Content-Type header value — confirm against `sniff`).
+        // The expected result is Shift_JIS, i.e. that argument wins over the <meta> tag.
+        assert_eq!(
+            sniff(
+                b"<meta charset=UTF8><u>hi</u>",
+                true,
+                Some("text/html; charset=Shift-JIS".as_bytes())
+            )
+            .name(),
+            "Shift_JIS"
+        );
+
+        // The `<meta charset=UTF8>` here sits inside an HTML comment and must be ignored;
+        // the later `http-equiv='content-type'` meta tag is the one expected to take effect.
+        assert_eq!(
+            sniff(b"<!-- haha we wish <meta charset=UTF8> --><meta http-equiv='content-type' content='text/html; charset=Shift-JIS'><u>hi</u>", true, None).name(),
+            "Shift_JIS"
+        );
+    }
+}
--- a/quickpeep_html_charset_detection/src/steps.rs
+++ b/quickpeep_html_charset_detection/src/steps.rs
@ -39,12 +39,12 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
    // Loop: If position points to:
    while position < bytes.len() {
        // A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
-        if bytes[position..].starts_with(&[0x3C, 0x21, 0x2D, 0x2D]) {
+        if bytes[position..].starts_with(b"<!--") {
            // Advance the position pointer so that it points at the first 0x3E byte which is
            // preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
            // after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
            // the '<!--' sequence.)
-            match bytes[position..].find(&[0x2D, 0x2D, 0x3C]) {
+            match bytes[position..].find(b"-->") {
                Some(location_of_closer) => {
                    position += location_of_closer + 3;
                    continue;
@ -56,12 +56,15 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
        }

        // A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
-        let check = bytes[position..position + b"<meta ".len()].to_ascii_lowercase();
+        let check = bytes
+            .get(position..position + b"<meta ".len())?
+            .to_ascii_lowercase();
        let ends_in_whitespace = bytes
            .get(position + b"<meta".len())
            .map(|c| matches!(c, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))
            .unwrap_or(false);
        if check.starts_with(b"<meta") && ends_in_whitespace {
+            println!("c");
            // Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F byte (the one in sequence of characters matched above).
            position += 6;

@ -80,6 +83,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {

            // Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
            while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
+                println!("att {key:?} {value:?}");
                // If the attribute's name is already in attribute list, then return to the step labeled attributes.
                if attributes.contains(&key) {
                    continue;
@ -218,6 +222,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
 pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
    // If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
    // or 0x2F (/) then advance position to the next byte and redo this step.
+    println!("A{position}");
    while matches!(
        bytes.get(*position),
        Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
@ -260,6 +265,8 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
                attribute_name.push(new_byte.to_ascii_lowercase());

                // Advance position to the next byte and return to the previous step.
+                *position += 1;
+                continue;
            }

            // Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step.
@ -323,7 +330,10 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
        // Process the byte at position as follows:
        //
        // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
-        if matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
+        if matches!(
+            bytes.get(*position),
+            Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
+        ) {
            // Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
            return Some((attribute_name, attribute_value));
        }