Use the sniffed encoding in page extraction

Olivier 'reivilibre' 2022-06-12 15:49:02 +01:00
parent 5701b1e6d8
commit aa4567c623
2 changed files with 16 additions and 6 deletions

File 1 of 2:

@@ -44,14 +44,14 @@ pub fn sniff(
     sniff_window: &[u8],
     is_sniff_window_the_entire_document: bool,
     content_type_header: Option<&[u8]>,
-) -> Option<&'static Encoding> {
+) -> &'static Encoding {
     if let Some(certain) = sniff_with_certain_confidence(sniff_window, content_type_header) {
-        return Some(certain);
+        return certain;
     }

     // 5. Optionally prescan the byte stream to determine its encoding
     if let Some(prescan_tentative) = prescan(sniff_window) {
-        return Some(prescan_tentative);
+        return prescan_tentative;
     }

     // 8. The user agent may attempt to autodetect the character encoding from applying frequency
@@ -63,7 +63,7 @@ pub fn sniff(
     detector.feed(sniff_window, is_sniff_window_the_entire_document);
     // 'Allow UTF-8' should be set to false for non-file: URIs, apparently.
     // We do that here, though I'm not sure this is what we actually want outside of a browser...
-    Some(detector.guess(None, false))
+    detector.guess(None, false)
 }

 pub fn extract_encoding_from_content_type_header(
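The Option wrapper can be dropped because the final fallback (step 8) always produces an answer. Judging by the feed/guess calls above, the detector is chardetng's EncodingDetector, whose guess method never fails; for inconclusive input it simply falls back on a legacy encoding. A minimal standalone sketch of that fallback step, assuming chardetng and using the hypothetical name detect_fallback:

use chardetng::EncodingDetector;
use encoding_rs::Encoding;

/// Hypothetical helper mirroring step 8 of the sniffing algorithm above.
fn detect_fallback(sniff_window: &[u8], is_entire_document: bool) -> &'static Encoding {
    let mut detector = EncodingDetector::new();
    // Feed the sniff window; the return value is ignored, we only need guess() below.
    detector.feed(sniff_window, is_entire_document);
    // tld = None, allow_utf8 = false: matches the non-file: URI behaviour noted above.
    detector.guess(None, false)
}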

File 2 of 2:

@@ -12,11 +12,13 @@ use itertools::Itertools;
 use kuchiki::NodeRef;
 use log::{debug, error, trace, warn};
 use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree};
+use quickpeep_html_charset_detection::sniff;
 use quickpeep_structs::rake_entries::AnalysisAntifeatures;
 use quickpeep_utils::lazy::Lazy;
 use reqwest::header::HeaderMap;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use std::borrow::Borrow;
 use tokio::runtime;
 use tokio::sync::mpsc::Sender;
 use tokio::sync::{mpsc, oneshot};
@@ -116,8 +118,16 @@ impl PageExtractionServiceInternal {
         headers: HeaderMap,
         is_cf: bool,
     ) -> anyhow::Result<ExtractedPage> {
-        let content_str: &str = todo!();
-        let root_node: NodeRef = kuchiki::parse_html().one(content_str.as_ref());
+        let encoding = sniff(
+            &content_bytes,
+            true,
+            headers.get("content-type").map(|hv| hv.as_bytes()),
+        );
+        let (content_text, _actual_codec_used, replacements_made) = encoding.decode(&content_bytes);
+        if replacements_made {
+            warn!("Character replacements made!");
+        }
+        let root_node: NodeRef = kuchiki::parse_html().one(content_text.borrow());

         // See whether this page is at the canonical URL for the page.
         // If it's not, then we redirect the raker to the canonical URL.
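For context, encoding_rs's Encoding::decode returns a (text, actual encoding, replacements-made) triple: a BOM in the input overrides the sniffed encoding (hence the _actual_codec_used binding), and the bool reports whether any undecodable bytes were replaced with U+FFFD. The .borrow() then turns the resulting Cow<str> into the &str that kuchiki's one() accepts, which is what the new std::borrow::Borrow import is for. A sketch of the same sniff-then-decode flow in isolation, with the hypothetical helper name decode_page:

use encoding_rs::Encoding;
use quickpeep_html_charset_detection::sniff;

/// Hypothetical helper showing the sniff-then-decode flow on its own.
fn decode_page(content_bytes: &[u8], content_type: Option<&[u8]>) -> String {
    // Sniff, treating the whole body as the sniff window (as the service does).
    let encoding: &'static Encoding = sniff(content_bytes, true, content_type);
    // A BOM, if present, takes precedence over `encoding`; `used` reports the
    // codec actually applied and `replaced` whether any U+FFFD was emitted.
    let (text, _used, replaced) = encoding.decode(content_bytes);
    if replaced {
        eprintln!("some input bytes could not be decoded and were replaced");
    }
    text.into_owned()
}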