Add tests and fixes
Some checks are pending
ci/woodpecker/push/manual Pipeline is pending
ci/woodpecker/push/check Pipeline was successful
ci/woodpecker/push/release Pipeline was successful

This commit is contained in:
Olivier 'reivilibre' 2022-06-12 16:30:36 +01:00
parent aa4567c623
commit 4896ecd426
2 changed files with 45 additions and 5 deletions

View File

@ -76,9 +76,39 @@ pub fn extract_encoding_from_content_type_header(
let key_value: Vec<&str> = header_part_ascii_ish.trim().split("=").collect();
let key = key_value.get(0).cloned().unwrap_or("");
if key.to_ascii_lowercase() == "charset" {
let value = key_value.get(0).cloned().unwrap_or("");
let value = key_value.get(1).cloned().unwrap_or("");
return Encoding::for_label(value.as_bytes());
}
}
None
}
#[cfg(test)]
mod test {
    use crate::sniff;

    /// Smoke test for `sniff`: each case pins the encoding name expected for one
    /// small HTML snippet.
    ///
    /// NOTE(review): the second argument is passed as `true` throughout; its meaning
    /// is defined by `sniff` and is not visible from this module. The third argument
    /// looks like an optional Content-Type header value — confirm against `sniff`.
    #[test]
    fn test_simple_cases() {
        // Inputs shared between cases.
        let bare_markup = b"<u>hi</u>";
        let meta_utf8_markup = b"<meta charset=UTF8><u>hi</u>";
        let shift_jis_header = "text/html; charset=Shift-JIS".as_bytes();
        let commented_meta_markup = b"<!-- haha we wish <meta charset=UTF8> --><meta http-equiv='content-type' content='text/html; charset=Shift-JIS'><u>hi</u>";

        // Case 1: nothing declares an encoding, so the expected result is windows-1252.
        let undeclared = sniff(bare_markup, true, None);
        assert_eq!(undeclared.name(), "windows-1252");

        // Case 2: a `<meta charset=UTF8>` tag on its own is expected to yield UTF-8.
        let from_meta = sniff(meta_utf8_markup, true, None);
        assert_eq!(from_meta.name(), "UTF-8");

        // Case 3: same markup, but a Shift-JIS charset is also supplied through the
        // third argument; the expected result is Shift_JIS rather than UTF-8.
        let from_header = sniff(meta_utf8_markup, true, Some(shift_jis_header));
        assert_eq!(from_header.name(), "Shift_JIS");

        // Case 4: the `<meta charset=UTF8>` sits inside an HTML comment and must not
        // count; the later `http-equiv` meta tag declares Shift-JIS instead.
        let from_http_equiv = sniff(commented_meta_markup, true, None);
        assert_eq!(from_http_equiv.name(), "Shift_JIS");
    }
}

View File

@ -39,12 +39,12 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
// Loop: If position points to:
while position < bytes.len() {
// A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
if bytes[position..].starts_with(&[0x3C, 0x21, 0x2D, 0x2D]) {
if bytes[position..].starts_with(b"<!--") {
// Advance the position pointer so that it points at the first 0x3E byte which is
// preceded by two 0x2D bytes (i.e. at the end of an ASCII '-->' sequence) and comes
// after the 0x3C byte that was found. (The two 0x2D bytes can be the same as those in
// the '<!--' sequence.)
match bytes[position..].find(&[0x2D, 0x2D, 0x3C]) {
match bytes[position..].find(b"-->") {
Some(location_of_closer) => {
position += location_of_closer + 3;
continue;
@ -56,12 +56,15 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
}
// A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
let check = bytes[position..position + b"<meta ".len()].to_ascii_lowercase();
let check = bytes
.get(position..position + b"<meta ".len())?
.to_ascii_lowercase();
let ends_in_whitespace = bytes
.get(position + b"<meta".len())
.map(|c| matches!(c, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))
.unwrap_or(false);
if check.starts_with(b"<meta") && ends_in_whitespace {
println!("c");
// Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F byte (the one in sequence of characters matched above).
position += 6;
@ -80,6 +83,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
// Attributes: Get an attribute and its value. If no attribute was sniffed, then jump to the processing step below.
while let Some((key, value)) = prescan_get_attribute(&bytes, &mut position) {
println!("att {key:?} {value:?}");
// If the attribute's name is already in attribute list, then return to the step labeled attributes.
if attributes.contains(&key) {
continue;
@ -218,6 +222,7 @@ pub fn prescan(bytes: &[u8]) -> Option<&'static Encoding> {
pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<u8>, Vec<u8>)> {
// If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
// or 0x2F (/) then advance position to the next byte and redo this step.
println!("A{position}");
while matches!(
bytes.get(*position),
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
@ -260,6 +265,8 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
attribute_name.push(new_byte.to_ascii_lowercase());
// Advance position to the next byte and return to the previous step.
*position += 1;
continue;
}
// Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) then advance position to the next byte, then, repeat this step.
@ -323,7 +330,10 @@ pub fn prescan_get_attribute(bytes: &[u8], position: &mut usize) -> Option<(Vec<
// Process the byte at position as follows:
//
// If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
if matches!(bytes.get(*position), Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20)) {
if matches!(
bytes.get(*position),
Some(0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E)
) {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
return Some((attribute_name, attribute_value));
}