Determine the language for raked HTML pages

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 20:33:09 +00:00
parent 60e906fefd
commit 04b94b16ed
4 changed files with 1085 additions and 2 deletions

1017
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -33,6 +33,8 @@ ipnetwork = "0.18.0"
futures-util = "0.3.21" futures-util = "0.3.21"
lingua = "1.3.3"
### Raking helpers ### Raking helpers
# HTTP Requests # HTTP Requests
reqwest = { version = "0.11.9", features = ["stream"] } reqwest = { version = "0.11.9", features = ["stream"] }

View File

@ -1,16 +1,20 @@
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet}; use crate::raking::analysis::{
analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
};
use adblock::engine::Engine; use adblock::engine::Engine;
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use chrono::{DateTime, FixedOffset, Utc}; use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon; use cylon::Cylon;
use futures_util::stream::StreamExt; use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice; use html5ever::tendril::fmt::Slice;
use itertools::Itertools;
use kuchiki::traits::TendrilSink; use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef; use kuchiki::NodeRef;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use log::debug; use log::debug;
use quickpeep_densedoc::DenseTree; use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::AnalysisAntifeatures; use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url}; use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity; use sitemap::reader::SiteMapEntity;
@ -214,11 +218,12 @@ impl Raker {
})); }));
}; };
let headers = response.headers().clone();
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?; let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{ {
match self.rake_html_page(&content, url, is_cf) { match self.rake_html_page(&content, url, is_cf, &headers) {
Ok(page_rake) => { Ok(page_rake) => {
return Ok(page_rake); return Ok(page_rake);
} }
@ -264,6 +269,7 @@ impl Raker {
content: &[u8], content: &[u8],
url: &Url, url: &Url,
is_cf: bool, is_cf: bool,
headers: &HeaderMap,
) -> anyhow::Result<RakeOutcome> { ) -> anyhow::Result<RakeOutcome> {
let content_str = std::str::from_utf8(content)?; let content_str = std::str::from_utf8(content)?;
@ -286,6 +292,32 @@ impl Raker {
} }
} }
// Try and dig up the page's language.
// First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
let mut language = None;
if let Ok(html_node) = root_node.select_first("html") {
if let Some(lang) = html_node.attributes.borrow().get("lang") {
language = Some(lang.trim().to_string());
}
}
if language.is_none() {
// Next fallback: prefer the content-language header baked into the page itself
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
if let Some(lang) = meta_node.attributes.borrow().get("content") {
language = Some(lang.trim().to_string());
}
}
}
if language.is_none() {
// Next fallback: prefer the content-language received as a header
if let Some(lang) = headers.get("content-language") {
language = Some(lang.to_str()?.to_owned());
}
}
let mut antifeature_flags = AnalysisAntifeatures::empty(); let mut antifeature_flags = AnalysisAntifeatures::empty();
if is_cf { if is_cf {
@ -311,6 +343,18 @@ impl Raker {
let dense_doc = DenseTree::from_body(root_node.clone()); let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = DenseTree::generate_textual_format(&dense_doc); let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
if language.is_none() {
// Final fallback: guess the language
language = guess_document_language(&dense_doc_text);
}
// Try and enforce some consistency in the language code;
// we want codes like en_US rather than en-us.
if let Some(language) = language.as_mut() {
normalise_language(language);
}
eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text); eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc); eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
@ -334,6 +378,18 @@ impl Raker {
} }
} }
/// Normalises a language code in place to the `ll_DD` convention
/// (for example `en-us` or `EN_us` both become `en_US`).
///
/// The whole string is lower-cased, hyphens are treated as underscores, and
/// the second `_`-separated piece (the dialect/region), if present, is
/// upper-cased. Any further pieces are kept, lower-cased, and re-joined with
/// underscores. A string with no separator is simply lower-cased.
pub fn normalise_language(lang_string: &mut String) {
    let lowered = lang_string.to_lowercase().replace('-', "_");

    let mut normalised = String::with_capacity(lowered.len());
    for (index, piece) in lowered.split('_').enumerate() {
        if index > 0 {
            normalised.push('_');
        }
        if index == 1 {
            // Only the piece directly after the primary language is the dialect.
            normalised.push_str(&piece.to_uppercase());
        } else {
            normalised.push_str(piece);
        }
    }

    // Write the result back: without this the caller only ever saw the
    // lower-cased input, never the underscore/upper-case normalisation.
    *lang_string = normalised;
}
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> { pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?; let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;

View File

@ -3,6 +3,7 @@ use adblock::lists::{ParseOptions, RuleTypes};
use anyhow::Context; use anyhow::Context;
use ipnetwork::IpNetwork; use ipnetwork::IpNetwork;
use kuchiki::NodeRef; use kuchiki::NodeRef;
use lingua::Language;
use std::collections::{BTreeSet, HashSet}; use std::collections::{BTreeSet, HashSet};
use std::net::IpAddr; use std::net::IpAddr;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader}; use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
@ -103,6 +104,13 @@ pub fn analyse_with_ad_block_cosmetic_filter(
Ok(matches > 0) Ok(matches > 0)
} }
/// Guesses the language of `text` using the `lingua` detector.
///
/// Returns the detected language's ISO 639-1 code rendered as a string, or
/// `None` when the detector does not settle on a language.
///
/// NOTE(review): a detector covering all languages is built on every call;
/// this is presumably costly — consider building it once and reusing it.
pub fn guess_document_language(text: &str) -> Option<String> {
    let language_detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
    let detected: Language = language_detector.detect_language_of(text)?;
    Some(detected.iso_code_639_1().to_string())
}
// TODO this isn't particularly efficient. Probably want a trie if it's important... // TODO this isn't particularly efficient. Probably want a trie if it's important...
pub struct IpSet { pub struct IpSet {
ips: BTreeSet<IpNetwork>, ips: BTreeSet<IpNetwork>,