Determine the language for raked HTML pages
parent 60e906fefd
commit 04b94b16ed
File diff suppressed because it is too large
@@ -33,6 +33,8 @@ ipnetwork = "0.18.0"
 futures-util = "0.3.21"
+lingua = "1.3.3"
 
 ### Raking helpers
 
 # HTTP Requests
 reqwest = { version = "0.11.9", features = ["stream"] }
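The only dependency added here is the `lingua` crate, which does statistical language detection over raw text. As a rough, standalone illustration of its API (not part of this commit; the language set and example string are arbitrary), detection and the ISO 639-1 code it yields look like this with lingua 1.3:

use lingua::{Language, LanguageDetectorBuilder};

fn main() {
    // Restricting the candidate languages keeps the detector light;
    // the raker instead builds one from all supported languages.
    let detector =
        LanguageDetectorBuilder::from_languages(&[Language::English, Language::French]).build();
    if let Some(lang) = detector.detect_language_of("languages are awesome") {
        // The ISO 639-1 code implements Display, so this prints "en".
        println!("{}", lang.iso_code_639_1());
    }
}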
@@ -1,16 +1,20 @@
-use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
+use crate::raking::analysis::{
+    analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
+};
 use adblock::engine::Engine;
 use anyhow::{bail, Context};
 use chrono::{DateTime, FixedOffset, Utc};
 use cylon::Cylon;
 use futures_util::stream::StreamExt;
 use html5ever::tendril::fmt::Slice;
+use itertools::Itertools;
 use kuchiki::traits::TendrilSink;
 use kuchiki::NodeRef;
 use lazy_static::lazy_static;
 use log::debug;
 use quickpeep_densedoc::DenseTree;
 use quickpeep_structs::rake_entries::AnalysisAntifeatures;
+use reqwest::header::HeaderMap;
 use reqwest::{Client, Response, Url};
 use serde::{Deserialize, Serialize};
 use sitemap::reader::SiteMapEntity;
@@ -214,11 +218,12 @@ impl Raker {
             }));
         };
 
+        let headers = response.headers().clone();
         let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
 
         if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
         {
-            match self.rake_html_page(&content, url, is_cf) {
+            match self.rake_html_page(&content, url, is_cf, &headers) {
                 Ok(page_rake) => {
                     return Ok(page_rake);
                 }
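The header map is cloned up front because `response` is then passed by value to `response_to_bytes_limited`, so the headers are no longer reachable once the body has been read. A minimal sketch of the same pattern with plain reqwest (hypothetical URL and function name, not the raker's code):

use reqwest::Client;

async fn fetch_with_headers(client: &Client) -> anyhow::Result<()> {
    let response = client.get("https://example.org/").send().await?;
    // Copy the headers before the body read consumes the Response.
    let headers = response.headers().clone();
    let body = response.bytes().await?;
    println!(
        "{} bytes, content-language: {:?}",
        body.len(),
        headers.get("content-language")
    );
    Ok(())
}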
@@ -264,6 +269,7 @@ impl Raker {
         content: &[u8],
         url: &Url,
         is_cf: bool,
+        headers: &HeaderMap,
     ) -> anyhow::Result<RakeOutcome> {
         let content_str = std::str::from_utf8(content)?;
 
@@ -286,6 +292,32 @@ impl Raker {
             }
         }
 
+        // Try and dig up the page's language.
+        // First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
+        let mut language = None;
+
+        if let Ok(html_node) = root_node.select_first("html") {
+            if let Some(lang) = html_node.attributes.borrow().get("lang") {
+                language = Some(lang.trim().to_string());
+            }
+        }
+
+        if language.is_none() {
+            // Next fallback: prefer the content-language header baked into the page itself
+            if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
+                if let Some(lang) = meta_node.attributes.borrow().get("content") {
+                    language = Some(lang.trim().to_string());
+                }
+            }
+        }
+
+        if language.is_none() {
+            // Next fallback: prefer the content-language received as a header
+            if let Some(lang) = headers.get("content-language") {
+                language = Some(lang.to_str()?.to_owned());
+            }
+        }
+
         let mut antifeature_flags = AnalysisAntifeatures::empty();
 
         if is_cf {
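Extracted from the method, the metadata half of this fallback chain amounts to something like the helper below, a sketch assuming kuchiki's `NodeRef` and reqwest's `HeaderMap` (the function name is invented; the commit keeps the logic inline):

use kuchiki::NodeRef;
use reqwest::header::HeaderMap;

/// Look for a declared language: <html lang>, then
/// <meta http-equiv=content-language>, then the Content-Language header.
fn language_from_metadata(root_node: &NodeRef, headers: &HeaderMap) -> Option<String> {
    if let Ok(html_node) = root_node.select_first("html") {
        if let Some(lang) = html_node.attributes.borrow().get("lang") {
            return Some(lang.trim().to_string());
        }
    }
    if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
        if let Some(lang) = meta_node.attributes.borrow().get("content") {
            return Some(lang.trim().to_string());
        }
    }
    headers
        .get("content-language")
        .and_then(|value| value.to_str().ok())
        .map(|value| value.to_owned())
}

Unlike the inline version, this sketch silently skips a Content-Language header value that cannot be read as a string, where the method propagates an error instead.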
@@ -311,6 +343,18 @@ impl Raker {
 
         let dense_doc = DenseTree::from_body(root_node.clone());
         let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
+
+        if language.is_none() {
+            // Final fallback: guess the language
+            language = guess_document_language(&dense_doc_text);
+        }
+
+        // Try and enforce some consistency in the language code;
+        // we want codes like en_US rather than en-us.
+        if let Some(language) = language.as_mut() {
+            normalise_language(language);
+        }
+
         eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
         eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
 
@@ -334,6 +378,18 @@ impl Raker {
         }
     }
 }
 
+pub fn normalise_language(lang_string: &mut String) {
+    *lang_string = lang_string.to_lowercase();
+    let mut pieces = lang_string
+        .replace("-", "_")
+        .split('_')
+        .map(|s| s.to_owned())
+        .collect_vec();
+    if let Some(dialect) = pieces.get_mut(1) {
+        *dialect = dialect.to_uppercase();
+    }
+}
+
 pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
     let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
 
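As shown here, `normalise_language` lowercases the whole code and uppercases the dialect piece, but the `pieces` vector is never written back into `lang_string`, so the net effect is only the lowercasing. A self-contained sketch of the behaviour the comment describes ("en_US" rather than "en-us"), with an explicit join added as an assumption:

use itertools::Itertools;

// Sketch only: the final join is an assumption, not something the diff shows.
fn normalise_language_sketch(lang_string: &mut String) {
    *lang_string = lang_string.to_lowercase();
    let mut pieces = lang_string
        .replace("-", "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect_vec();
    if let Some(dialect) = pieces.get_mut(1) {
        *dialect = dialect.to_uppercase();
    }
    *lang_string = pieces.join("_");
}

fn main() {
    let mut code = "EN-us".to_string();
    normalise_language_sketch(&mut code);
    assert_eq!(code, "en_US");
}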
@@ -3,6 +3,7 @@ use adblock::lists::{ParseOptions, RuleTypes};
 use anyhow::Context;
 use ipnetwork::IpNetwork;
 use kuchiki::NodeRef;
+use lingua::Language;
 use std::collections::{BTreeSet, HashSet};
 use std::net::IpAddr;
 use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
@@ -103,6 +104,13 @@ pub fn analyse_with_ad_block_cosmetic_filter(
     Ok(matches > 0)
 }
 
+pub fn guess_document_language(text: &str) -> Option<String> {
+    let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
+    detector
+        .detect_language_of(text)
+        .map(|lang: Language| lang.iso_code_639_1().to_string())
+}
+
 // TODO this isn't particularly efficient. Probably want a trie if it's important...
 pub struct IpSet {
     ips: BTreeSet<IpNetwork>,
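One practical note on `guess_document_language`: `from_all_languages()` makes the detector consider every language lingua supports, and building it is not free, so constructing a fresh detector on every call is likely the most expensive part of this fallback. A hedged sketch of caching a single detector with `lazy_static` (already imported elsewhere in the raker); this is an illustration, not what the commit does:

use lazy_static::lazy_static;
use lingua::{Language, LanguageDetector, LanguageDetectorBuilder};

lazy_static! {
    // Built once and reused; the committed version builds a detector per call.
    static ref DETECTOR: LanguageDetector =
        LanguageDetectorBuilder::from_all_languages().build();
}

pub fn guess_document_language_cached(text: &str) -> Option<String> {
    DETECTOR
        .detect_language_of(text)
        .map(|lang: Language| lang.iso_code_639_1().to_string())
}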