Determine the language for raked HTML pages

This commit is contained in:
Olivier 'reivilibre' 2022-03-14 20:33:09 +00:00
parent 60e906fefd
commit 04b94b16ed
4 changed files with 1085 additions and 2 deletions

1017
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -33,6 +33,8 @@ ipnetwork = "0.18.0"
futures-util = "0.3.21"
lingua = "1.3.3"
### Raking helpers
# HTTP Requests
reqwest = { version = "0.11.9", features = ["stream"] }

View File

@ -1,16 +1,20 @@
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
use crate::raking::analysis::{
analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
};
use adblock::engine::Engine;
use anyhow::{bail, Context};
use chrono::{DateTime, FixedOffset, Utc};
use cylon::Cylon;
use futures_util::stream::StreamExt;
use html5ever::tendril::fmt::Slice;
use itertools::Itertools;
use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;
use lazy_static::lazy_static;
use log::debug;
use quickpeep_densedoc::DenseTree;
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
use reqwest::header::HeaderMap;
use reqwest::{Client, Response, Url};
use serde::{Deserialize, Serialize};
use sitemap::reader::SiteMapEntity;
@ -214,11 +218,12 @@ impl Raker {
}));
};
let headers = response.headers().clone();
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
{
match self.rake_html_page(&content, url, is_cf) {
match self.rake_html_page(&content, url, is_cf, &headers) {
Ok(page_rake) => {
return Ok(page_rake);
}
@ -264,6 +269,7 @@ impl Raker {
content: &[u8],
url: &Url,
is_cf: bool,
headers: &HeaderMap,
) -> anyhow::Result<RakeOutcome> {
let content_str = std::str::from_utf8(content)?;
@ -286,6 +292,32 @@ impl Raker {
}
}
// Try and dig up the page's language.
// First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
let mut language = None;
if let Ok(html_node) = root_node.select_first("html") {
if let Some(lang) = html_node.attributes.borrow().get("lang") {
language = Some(lang.trim().to_string());
}
}
if language.is_none() {
// Next fallback: prefer the content-language header baked into the page itself
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
if let Some(lang) = meta_node.attributes.borrow().get("content") {
language = Some(lang.trim().to_string());
}
}
}
if language.is_none() {
// Next fallback: prefer the content-language received as a header
if let Some(lang) = headers.get("content-language") {
language = Some(lang.to_str()?.to_owned());
}
}
let mut antifeature_flags = AnalysisAntifeatures::empty();
if is_cf {
@ -311,6 +343,18 @@ impl Raker {
let dense_doc = DenseTree::from_body(root_node.clone());
let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
if language.is_none() {
// Final fallback: guess the language
language = guess_document_language(&dense_doc_text);
}
// Try and enforce some consistency in the language code;
// we want codes like en_US rather than en-us.
if let Some(language) = language.as_mut() {
normalise_language(language);
}
eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
@ -334,6 +378,18 @@ impl Raker {
}
}
/// Normalise a language tag in place into a consistent `ll_CC` form:
/// primary subtag lowercased, dialect/region subtag uppercased, and `_`
/// used as the separator — e.g. "en-us" / "EN-US" both become "en_US".
pub fn normalise_language(lang_string: &mut String) {
    let mut pieces: Vec<String> = lang_string
        .to_lowercase()
        .replace('-', "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect();
    if let Some(dialect) = pieces.get_mut(1) {
        // Region/dialect subtags are conventionally uppercase (en_US, pt_BR).
        *dialect = dialect.to_uppercase();
    }
    // BUG FIX: write the normalised pieces back. The original built `pieces`
    // and uppercased the dialect but never reassembled the string, so only
    // the lowercasing ever took effect.
    *lang_string = pieces.join("_");
}
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;

View File

@ -3,6 +3,7 @@ use adblock::lists::{ParseOptions, RuleTypes};
use anyhow::Context;
use ipnetwork::IpNetwork;
use kuchiki::NodeRef;
use lingua::Language;
use std::collections::{BTreeSet, HashSet};
use std::net::IpAddr;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
@ -103,6 +104,13 @@ pub fn analyse_with_ad_block_cosmetic_filter(
Ok(matches > 0)
}
/// Attempt to detect the natural language of `text`, returning its
/// ISO 639-1 code (e.g. "en") when the detector picks a language,
/// or `None` when it cannot decide.
///
/// NOTE(review): the detector is rebuilt on every call; lingua model
/// construction is likely not free — consider caching it if this runs
/// per-page (confirm cost against lingua docs).
pub fn guess_document_language(text: &str) -> Option<String> {
    let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
    let detected: Option<Language> = detector.detect_language_of(text);
    detected.map(|language| language.iso_code_639_1().to_string())
}
// TODO this isn't particularly efficient. Probably want a trie if it's important...
pub struct IpSet {
ips: BTreeSet<IpNetwork>,