Determine the language for raked HTML pages
This commit is contained in:
parent
60e906fefd
commit
04b94b16ed
File diff suppressed because it is too large
Load Diff
|
@ -33,6 +33,8 @@ ipnetwork = "0.18.0"
|
|||
|
||||
futures-util = "0.3.21"
|
||||
|
||||
lingua = "1.3.3"
|
||||
|
||||
### Raking helpers
|
||||
# HTTP Requests
|
||||
reqwest = { version = "0.11.9", features = ["stream"] }
|
||||
|
|
|
@ -1,16 +1,20 @@
|
|||
use crate::raking::analysis::{analyse_with_ad_block_cosmetic_filter, IpSet};
|
||||
use crate::raking::analysis::{
|
||||
analyse_with_ad_block_cosmetic_filter, guess_document_language, IpSet,
|
||||
};
|
||||
use adblock::engine::Engine;
|
||||
use anyhow::{bail, Context};
|
||||
use chrono::{DateTime, FixedOffset, Utc};
|
||||
use cylon::Cylon;
|
||||
use futures_util::stream::StreamExt;
|
||||
use html5ever::tendril::fmt::Slice;
|
||||
use itertools::Itertools;
|
||||
use kuchiki::traits::TendrilSink;
|
||||
use kuchiki::NodeRef;
|
||||
use lazy_static::lazy_static;
|
||||
use log::debug;
|
||||
use quickpeep_densedoc::DenseTree;
|
||||
use quickpeep_structs::rake_entries::AnalysisAntifeatures;
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::{Client, Response, Url};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sitemap::reader::SiteMapEntity;
|
||||
|
@ -214,11 +218,12 @@ impl Raker {
|
|||
}));
|
||||
};
|
||||
|
||||
let headers = response.headers().clone();
|
||||
let content = response_to_bytes_limited(response, SIZE_LIMIT, TIME_LIMIT).await?;
|
||||
|
||||
if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page)
|
||||
{
|
||||
match self.rake_html_page(&content, url, is_cf) {
|
||||
match self.rake_html_page(&content, url, is_cf, &headers) {
|
||||
Ok(page_rake) => {
|
||||
return Ok(page_rake);
|
||||
}
|
||||
|
@ -264,6 +269,7 @@ impl Raker {
|
|||
content: &[u8],
|
||||
url: &Url,
|
||||
is_cf: bool,
|
||||
headers: &HeaderMap,
|
||||
) -> anyhow::Result<RakeOutcome> {
|
||||
let content_str = std::str::from_utf8(content)?;
|
||||
|
||||
|
@ -286,6 +292,32 @@ impl Raker {
|
|||
}
|
||||
}
|
||||
|
||||
// Try and dig up the page's language.
|
||||
// First try <html lang=...> since this is the modern way, and potentially the most trustworthy...
|
||||
let mut language = None;
|
||||
|
||||
if let Ok(html_node) = root_node.select_first("html") {
|
||||
if let Some(lang) = html_node.attributes.borrow().get("lang") {
|
||||
language = Some(lang.trim().to_string());
|
||||
}
|
||||
}
|
||||
|
||||
if language.is_none() {
|
||||
// Next fallback: prefer the content-language header baked into the page itself
|
||||
if let Ok(meta_node) = root_node.select_first("meta[http-equiv=content-language]") {
|
||||
if let Some(lang) = meta_node.attributes.borrow().get("content") {
|
||||
language = Some(lang.trim().to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if language.is_none() {
|
||||
// Next fallback: prefer the content-language received as a header
|
||||
if let Some(lang) = headers.get("content-language") {
|
||||
language = Some(lang.to_str()?.to_owned());
|
||||
}
|
||||
}
|
||||
|
||||
let mut antifeature_flags = AnalysisAntifeatures::empty();
|
||||
|
||||
if is_cf {
|
||||
|
@ -311,6 +343,18 @@ impl Raker {
|
|||
|
||||
let dense_doc = DenseTree::from_body(root_node.clone());
|
||||
let dense_doc_text = DenseTree::generate_textual_format(&dense_doc);
|
||||
|
||||
if language.is_none() {
|
||||
// Final fallback: guess the language
|
||||
language = guess_document_language(&dense_doc_text);
|
||||
}
|
||||
|
||||
// Try and enforce some consistency in the language code;
|
||||
// we want codes like en_US rather than en-us.
|
||||
if let Some(language) = language.as_mut() {
|
||||
normalise_language(language);
|
||||
}
|
||||
|
||||
eprintln!("~~~~~\n{}\n~~~~~", dense_doc_text);
|
||||
eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc);
|
||||
|
||||
|
@ -334,6 +378,18 @@ impl Raker {
|
|||
}
|
||||
}
|
||||
|
||||
/// Normalise a language code into the `en_US` shape:
/// lowercase primary subtag, `_` separators, uppercase region/dialect subtag.
///
/// e.g. `"en-us"` → `"en_US"`, `"EN"` → `"en"`, `"pt-br"` → `"pt_BR"`.
pub fn normalise_language(lang_string: &mut String) {
    let mut pieces: Vec<String> = lang_string
        .to_lowercase()
        // Accept both `-` (BCP 47 style) and `_` as separators.
        .replace('-', "_")
        .split('_')
        .map(|s| s.to_owned())
        .collect();
    if let Some(dialect) = pieces.get_mut(1) {
        // The region/dialect subtag is conventionally uppercase (en_US, pt_BR).
        *dialect = dialect.to_uppercase();
    }
    // BUG FIX: the original computed `pieces` but never wrote the result back,
    // so only the lowercasing ever took effect and the dialect uppercasing
    // (and `-` → `_` substitution) were silently discarded.
    *lang_string = pieces.join("_");
}
|
||||
|
||||
pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
||||
let feed = feed_rs::parser::parse_with_uri(content, Some(url.as_str()))?;
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ use adblock::lists::{ParseOptions, RuleTypes};
|
|||
use anyhow::Context;
use ipnetwork::IpNetwork;
use kuchiki::NodeRef;
use lazy_static::lazy_static;
use lingua::Language;
use std::collections::{BTreeSet, HashSet};
use std::net::IpAddr;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||
|
@ -103,6 +104,13 @@ pub fn analyse_with_ad_block_cosmetic_filter(
|
|||
Ok(matches > 0)
|
||||
}
|
||||
|
||||
pub fn guess_document_language(text: &str) -> Option<String> {
|
||||
let detector = lingua::LanguageDetectorBuilder::from_all_languages().build();
|
||||
detector
|
||||
.detect_language_of(text)
|
||||
.map(|lang: Language| lang.iso_code_639_1().to_string())
|
||||
}
|
||||
|
||||
// TODO this isn't particularly efficient. Probably want a trie if it's important...
|
||||
pub struct IpSet {
|
||||
ips: BTreeSet<IpNetwork>,
|
||||
|
|
Loading…
Reference in New Issue