Use the sniffed encoding in page extraction

Author: Olivier 'reivilibre'
Date:   2022-06-12 15:49:02 +01:00
Parent: 5701b1e6d8
Commit: aa4567c623
2 changed files with 16 additions and 6 deletions

@@ -44,14 +44,14 @@ pub fn sniff(
     sniff_window: &[u8],
     is_sniff_window_the_entire_document: bool,
     content_type_header: Option<&[u8]>,
-) -> Option<&'static Encoding> {
+) -> &'static Encoding {
     if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
-        return Some(certain);
+        return certain;
     }
 
     // 5. Optionally prescan the byte stream to determine its encoding
     if let Some(prescan_tentative) = prescan(sniff_window) {
-        return Some(prescan_tentative);
+        return prescan_tentative;
     }
@@ -63,7 +63,7 @@ pub fn sniff(
     detector.feed(sniff_window, is_sniff_window_the_entire_document);
     // 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
     // We do that here, though I'm not sure this is what we actually want outside of a browser...
-    Some(detector.guess(None, false))
+    detector.guess(None, false)
 }
 
 pub fn extract_encoding_from_content_type_header(
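
The fall-through chain in sniff() now always produces an encoding: a certain match (e.g. from the Content-Type header) wins outright, a prescan of the byte stream is the tentative fallback, and frequency-based autodetection is the last resort. The feed()/guess() calls in step 8 match the signatures of chardetng's EncodingDetector, so, assuming the detector is chardetng and the returned type is encoding_rs's Encoding, a minimal sketch of that last step in isolation looks like this:

use chardetng::EncodingDetector;
use encoding_rs::Encoding;

fn autodetect(sniff_window: &[u8], is_entire_document: bool) -> &'static Encoding {
    let mut detector = EncodingDetector::new();
    // feed() takes the bytes seen so far; the flag says whether this is the
    // last chunk, letting the detector commit to a final answer.
    detector.feed(sniff_window, is_entire_document);
    // guess(tld, allow_utf8): no top-level-domain hint, and UTF-8 guessing
    // disabled, matching the non-file: URI behaviour noted in the comment.
    detector.guess(None, false)
}

Because guess() always returns an encoding, the Option wrapper on sniff()'s return type was dead weight; dropping it removes the "no encoding found" case entirely instead of pushing it onto every caller.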

@@ -12,11 +12,13 @@ use itertools::Itertools;
 use kuchiki::NodeRef;
 use log::{debug, error, trace, warn};
 use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
+use quickpeep_html_charset_detection::sniff;
 use quickpeep_structs::rake_entries::AnalysisAntifeatures;
 use quickpeep_utils::lazy::Lazy;
 use reqwest::header::HeaderMap;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use std::borrow::Borrow;
 use tokio::runtime;
 use tokio::sync::mpsc::Sender;
 use tokio::sync::{mpsc, oneshot};
@@ -116,8 +118,16 @@ impl PageExtractionServiceInternal {
         headers: HeaderMap,
         is_cf: bool,
     ) -> anyhow::Result<ExtractedPage> {
-        let content_str: &str = todo!();
-        let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
+        let encoding = sniff(
+            &content_bytes,
+            true,
+            headers.get("content-type").map(|hv| hv.as_bytes()),
+        );
+        let (content_text, _actual_codec_used, replacements_made) = encoding.decode(&content_bytes);
+        if replacements_made {
+            warn!("Character replacements made!");
+        }
+        let root_node: NodeRef = kuchiki::parse_html().one(content_text.borrow());
 
         // See whether this page is at the canonical URL for the page.
         // If it's not, then we redirect the raker to the canonical URL.
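
The three-element tuple destructured above matches encoding_rs's Encoding::decode(), which returns the decoded text as a Cow<str>, the encoding actually used (BOM sniffing can override the argument), and a flag set when malformed sequences were replaced with U+FFFD; that flag is what drives the warning. Assuming the type is indeed encoding_rs's Encoding, a small self-contained illustration of those semantics:

use encoding_rs::{UTF_8, WINDOWS_1252};

fn main() {
    // 0xE9 is 'é' in Windows-1252 but not a valid UTF-8 sequence.
    let bytes = b"caf\xE9";

    let (text, used, had_replacements) = WINDOWS_1252.decode(bytes);
    assert_eq!(text, "café");
    assert_eq!(used, WINDOWS_1252);
    assert!(!had_replacements);

    // Decoding the same bytes as UTF-8 is lossy: the stray byte becomes U+FFFD.
    let (text, _used, had_replacements) = UTF_8.decode(bytes);
    assert_eq!(text, "caf\u{FFFD}");
    assert!(had_replacements);
}

Since decode() applies BOM sniffing first, the codec actually used can differ from the one that was sniffed, which is presumably why the diff binds and ignores _actual_codec_used rather than asserting they agree.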