diff --git a/Cargo.lock b/Cargo.lock index 40e2fe9..079d0c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -86,6 +86,16 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags_serde_shim" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25c3d626f0280ec39b33a6fc5c6c1067432b4c41e94aee40ded197a6649bf025" +dependencies = [ + "bitflags", + "serde", +] + [[package]] name = "bumpalo" version = "3.9.1" @@ -159,6 +169,45 @@ dependencies = [ "chrono", ] +[[package]] +name = "clap" +version = "3.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8c93436c21e4698bacadf42917db28b23017027a4deccb35dbe47a7e7840123" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "indexmap", + "lazy_static", + "os_str_bytes", + "strsim", + "termcolor", + "textwrap", +] + +[[package]] +name = "clap_derive" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da95d038ede1a964ce99f49cbe27a7fb538d1da595e4b4f70b8c8f338d17bf16" +dependencies = [ + "heck 0.4.0", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "colour" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27e4532f26f510c24bb8477d963c0c3ef27e293c3b2c507cccb0536d493201a" +dependencies = [ + "crossterm", +] + [[package]] name = "convert_case" version = "0.4.0" @@ -234,6 +283,31 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossterm" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c36c10130df424b2f3552fcc2ddcd9b28a27b1e54b358b45874f88d1ca6888c" +dependencies = [ + "bitflags", + "crossterm_winapi", + "lazy_static", + "libc", + "mio 0.7.14", + "parking_lot 0.11.2", + "signal-hook", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0da8964ace4d3e4a044fd027919b2237000b24315a37c916f61809f1ff2140b9" +dependencies = [ + "winapi", +] + [[package]] name = "cssparser" version = "0.27.2" @@ -560,6 +634,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1635,6 +1715,19 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mio" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "winapi", +] + [[package]] name = "mio" version = "0.8.1" @@ -1823,6 +1916,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "os_str_bytes" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" +dependencies = [ + "memchr", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -1970,6 +2072,30 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -2018,6 +2144,8 @@ dependencies = [ "anyhow", "bytes", "chrono", + "clap", + "colour", "cylon", "env_logger", "feed-rs", @@ -2071,7 +2199,9 @@ name = "quickpeep_structs" version = "0.1.0" dependencies = [ "bitflags", + "bitflags_serde_shim", "quickpeep_densedoc", + "serde", ] [[package]] @@ -2459,6 +2589,17 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "signal-hook" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e31d442c16f047a671b5a71e2161d6e68814012b7f5379d269ebd915fac2729" +dependencies = [ + "libc", + "mio 0.7.14", + "signal-hook-registry", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -2546,6 +2687,12 @@ dependencies = [ "quote", ] +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "strum" version = "0.23.0" @@ -2558,7 +2705,7 @@ version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro2", "quote", "rustversion", @@ -2610,6 +2757,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "textwrap" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" + [[package]] name = "thin-slice" version = "0.1.1" @@ -2670,7 +2823,7 @@ dependencies = [ "bytes", "libc", "memchr", - "mio", + "mio 0.8.1", "num_cpus", "once_cell", "parking_lot 0.12.0", @@ -2841,6 +2994,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "want" version = "0.3.0" diff --git a/quickpeep/Cargo.toml b/quickpeep/Cargo.toml index 4dce7d1..3f59bb6 100644 --- a/quickpeep/Cargo.toml +++ b/quickpeep/Cargo.toml @@ -6,36 +6,35 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +### Subcrates +quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } +quickpeep_densedoc = { path = "../quickpeep_densedoc" } +quickpeep_structs = { path = "../quickpeep_structs" } +quickpeep_utils = { path = "../quickpeep_utils" } + +### CLI Helpers +clap = { version = "3.1.6", features = ["derive"] } +colour = "0.6.0" + +### Document Parsing +kuchiki = "0.8.1" +html5ever = "0.25.1" +serde = { version = "1.0.136", features = ["derive"] } +serde_bare = "0.5.0" + +### Dates +chrono = "0.4.19" + +### Utils +lazy_static = "1.4.0" +bytes = "1.1.0" +itertools = "0.10.3" +ipnetwork = "0.18.0" +futures-util = "0.3.21" tokio = { version = "1.17.0", features = ["full"] } anyhow = "1.0.55" log = "0.4.14" env_logger = "0.9.0" -quickpeep_moz_readability = { path = "../quickpeep_moz_readability" } -quickpeep_densedoc = { path = "../quickpeep_densedoc" } - -# TODO: why do we need these here? -kuchiki = "0.8.1" -html5ever = "0.25.1" - -serde = { version = "1.0.136", features = ["derive"] } -serde_bare = "0.5.0" - -chrono = "0.4.19" - -lazy_static = "1.4.0" - -bytes = "1.1.0" - -itertools = "0.10.3" - -quickpeep_structs = { path = "../quickpeep_structs" } -ipnetwork = "0.18.0" - -futures-util = "0.3.21" - -lingua = "1.3.3" - -quickpeep_utils = { path = "../quickpeep_utils" } ### Raking helpers # HTTP Requests @@ -53,3 +52,5 @@ sitemap = "0.4.1" ### Filtering helpers # AdBlock adblock = "0.5.0" +# Language detection +lingua = "1.3.3" \ No newline at end of file diff --git a/quickpeep/src/bin/qp-rake1.rs b/quickpeep/src/bin/qp-rake1.rs index d72f389..1139d27 100644 --- a/quickpeep/src/bin/qp-rake1.rs +++ b/quickpeep/src/bin/qp-rake1.rs @@ -1,15 +1,17 @@ use adblock::lists::RuleTypes; use anyhow::Context; +use clap::Parser; +use colour::{blue_ln, green_ln, red_ln, yellow_ln}; +use env_logger::Env; use log::warn; use quickpeep::raking::analysis::{load_adblock_engine, IpSet}; -use quickpeep::raking::RakeIntent; +use quickpeep::raking::{RakeIntent, RakeOutcome}; use quickpeep::raking::{Raker, RAKER_USER_AGENT, TIME_LIMIT}; use quickpeep_structs::rake_entries::AnalysisAntifeatures; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use reqwest::redirect::Policy; use reqwest::Url; use std::path::PathBuf; -use std::str::FromStr; use tokio::fs::File; pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [ @@ -19,8 +21,18 @@ pub const ADBLOCK_FILTER_PATHS: [(AnalysisAntifeatures, &'static str); 4] = [ (AnalysisAntifeatures::ADVERTS, "adverts"), ]; +/// Rakes one URL and prints out the description of it. +#[derive(Clone, Debug, Parser)] +pub struct Opts { + url: Url, +} + #[tokio::main] pub async fn main() -> anyhow::Result<()> { + env_logger::Builder::from_env(Env::default().default_filter_or("info,quickpeep=debug")).init(); + + let opts: Opts = Opts::parse(); + let mut header_map = HeaderMap::new(); header_map.insert(USER_AGENT, HeaderValue::from_static(RAKER_USER_AGENT)); @@ -59,35 +71,33 @@ pub async fn main() -> anyhow::Result<()> { antifeature_ip_set, }; - // raker.rake( - // &Url::from_str("http://nothings.org/gamedev/ssao/")?, - // RakeIntent::Page, - // &client, - // ) - // .await?; - // - // raker.rake( - // &Url::from_str("https://github.com/kuchiki-rs/kuchiki")?, - // RakeIntent::Page, - // &client, - // ) - // .await?; + let outcome = raker.rake(&opts.url, RakeIntent::Any, &client).await?; - raker - .rake( - &Url::from_str("https://www.thesprucepets.com/")?, - RakeIntent::Page, - &client, - ) - .await?; - - raker - .rake( - &Url::from_str("https://matrix.org/")?, - RakeIntent::Page, - &client, - ) - .await?; + match outcome { + RakeOutcome::RakedPage(page) => { + let content_size = serde_bare::to_vec(&page)?.len(); + green_ln!("Page ({} bytes)", content_size); + // TODO + } + RakeOutcome::RakedFeed(feed) => { + green_ln!("Feed"); + // TODO + } + RakeOutcome::RakedSitemap(sitemap) => { + green_ln!("Sitemap"); + // TODO + } + RakeOutcome::Redirect { reason, new_url } => { + blue_ln!("Redirect ({:?})", reason); + println!(" → {}", new_url.as_str()); + } + RakeOutcome::TemporaryFailure(fail) => { + yellow_ln!("Temporary Failure\n\t{:?}", &fail.reason); + } + RakeOutcome::PermanentFailure(fail) => { + red_ln!("Permanent Failure\n\t{:?}", &fail.reason) + } + } Ok(()) } diff --git a/quickpeep/src/raking.rs b/quickpeep/src/raking.rs index bbe2cde..1f39b95 100644 --- a/quickpeep/src/raking.rs +++ b/quickpeep/src/raking.rs @@ -12,8 +12,10 @@ use kuchiki::traits::TendrilSink; use kuchiki::NodeRef; use lazy_static::lazy_static; use log::debug; -use quickpeep_densedoc::DenseTree; -use quickpeep_structs::rake_entries::AnalysisAntifeatures; +use quickpeep_densedoc::{DenseDocument, DenseHead, DenseTree}; +use quickpeep_structs::rake_entries::{ + AnalysisAntifeatures, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, +}; use quickpeep_utils::Lazy; use reqwest::header::HeaderMap; use reqwest::{Client, Response, Url}; @@ -44,6 +46,7 @@ pub enum RakeOutcome { PermanentFailure(PermanentFailure), } +#[derive(Debug)] pub enum RedirectReason { /// The page redirected somewhere else. Redirected { @@ -61,27 +64,35 @@ pub struct UrlRaked { pub intent: RakeIntent, } -pub struct RakedPage {} +#[derive(Serialize)] +pub struct RakedPage { + page_entry: RakedPageEntry, + referrer_entry: RakedReferrerEntry, +} pub struct RobotsTxt { pub sitemaps: Vec, pub rules: Cylon, } +#[derive(Debug)] pub struct TemporaryFailure { pub reason: TemporaryFailureReason, pub backoff_sec: u32, } +#[derive(Debug)] pub struct PermanentFailure { pub reason: PermanentFailureReason, } +#[derive(Debug)] pub enum TemporaryFailureReason { MissingInformation(String), ServerError(u16), } +#[derive(Debug)] pub enum PermanentFailureReason { ResourceDenied(u16), WrongLanguage(String), @@ -99,6 +110,8 @@ pub enum RakeIntent { lazy_static! { static ref SITEMAP_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec!["text/xml", "application/xml",]); + + /// MIME types we might expect in content-type headers static ref FEED_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![ "text/xml", "application/xml", @@ -108,6 +121,14 @@ lazy_static! { "application/json", "application/feed+json" ]); + + /// MIME types we might expect in tags + static ref FEED_LINK_MIME_TYPES: HashSet<&'static str> = HashSet::from_iter(vec![ + "application/atom+xml", + "application/rss+xml", + "application/rdf+xml", + "application/feed+json" + ]); } async fn response_to_bytes_limited( @@ -224,14 +245,10 @@ impl Raker { if content_type == "text/html" && (intent == RakeIntent::Any || intent == RakeIntent::Page) { - match self.rake_html_page(&content, url, is_cf, &headers) { - Ok(page_rake) => { - return Ok(page_rake); - } - Err(error) => { - debug!("Failed to rake HTML page: {:?}", error); - } - } + // We don't try any fallbacks for an HTML page + return Ok(self + .rake_html_page(&content, url, is_cf, &headers) + .context("Raking HTML page")?); } if FEED_MIME_TYPES.contains(content_type.as_str()) @@ -344,6 +361,7 @@ impl Raker { let dense_doc = DenseTree::from_body(root_node.clone()); let dense_doc_text = Lazy::new(Box::new(|| DenseTree::generate_textual_format(&dense_doc))); + //eprintln!("^^^^^\n{}\n^^^^^", *dense_doc_text); if language.is_none() { // Final fallback: guess the language @@ -356,29 +374,149 @@ impl Raker { normalise_language(language); } - eprintln!("~~~~~\n{}\n~~~~~", *dense_doc_text); - eprintln!("^^^^^\n{:#?}\n^^^^^", dense_doc); + let mut title = "".to_owned(); - let mut readability = quickpeep_moz_readability::Readability::new_from_node(root_node); - readability - .parse(url.as_str()) - .context("failed to analyse readability")?; + if let Ok(title_node) = root_node.select_first("head title") { + title = title_node.text_contents(); + } + + let mut feeds = Vec::new(); + let mut icon = None; + + for link_node in root_node.select("head link").into_iter().flatten() { + if let Some(rel) = link_node.attributes.borrow().get("rel") { + let rels = rel.split_whitespace().collect_vec(); + if rels.contains(&"icon") { + // This is an icon + if let Some(href) = link_node.attributes.borrow().get("href") { + let icon_url = url + .join(href) + .context("Failed to resolve or parse canonical URL to icon")?; + + icon = Some(icon_url); + } + } else if rels.contains(&"alternate") { + if let Some(rel_type) = link_node.attributes.borrow().get("type") { + if FEED_LINK_MIME_TYPES.contains(rel_type) { + if let Some(href) = link_node.attributes.borrow().get("href") { + let feed_url = url + .join(href) + .context("Failed to resolve or parse canonical URL to feed")?; + + feeds.push(feed_url); + } + } + } + } + } + } + + let mut readability = + quickpeep_moz_readability::Readability::new_from_node(root_node.clone()); + if let Err(err) = readability.parse(url.as_str()) { + debug!("Failed to analyse readability: {:?}", err); + } eprintln!("{:#?}", readability.metadata); - if let Some(_node) = readability.article_node { - //eprintln!("{}", node.to_string()); + if title.is_empty() && !readability.metadata.title().is_empty() { + // Fall back to the readability-derived page title + title = readability.metadata.title().to_owned(); + } + + let mut document = DenseDocument { + head: DenseHead { + title, + language: language.unwrap_or(String::with_capacity(0)), + icon: icon + .map(|url| url.as_str().to_owned()) + .unwrap_or(String::with_capacity(0)), + }, + body_content: Vec::with_capacity(0), + body_remainder: Vec::with_capacity(0), + }; + + if let Some(article_node) = readability.article_node { + document.body_remainder = DenseTree::from_body(root_node.clone()); + document.body_content = DenseTree::from_body(article_node); } let bare_size = serde_bare::to_vec(&dense_doc)?.len(); eprintln!("CS {:?} → {:?}", content.len(), bare_size); + let references = find_references(&document, &feeds, url); Ok(RakeOutcome::RakedPage(RakedPage { - // TODO + page_entry: RakedPageEntry { + analysed_antifeatures: antifeature_flags, + document, + }, + referrer_entry: RakedReferrerEntry { references }, })) } } +pub fn find_references( + doc: &DenseDocument, + feeds: &Vec, + page_url: &Url, +) -> Vec { + let mut refs = Vec::new(); + + fn add_link_refs(tree: &Vec, refs: &mut Vec, page_url: &Url) { + for node in tree { + match node { + DenseTree::Heading1(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading2(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading3(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading4(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading5(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Heading6(children) => { + add_link_refs(children, refs, page_url); + } + DenseTree::Link { + children, + href, + nofollow, + } => { + if !nofollow { + if let Ok(full_url) = page_url.join(&href) { + refs.push(RakedReference { + target: full_url.to_string(), + kind: ReferenceKind::CanonicalUrl, + }) + } + } + add_link_refs(children, refs, page_url); + } + DenseTree::Image { .. } => {} + DenseTree::Text(_) => {} + } + } + } + + add_link_refs(&doc.body_content, &mut refs, &page_url); + add_link_refs(&doc.body_remainder, &mut refs, &page_url); + + for feed in feeds { + refs.push(RakedReference { + target: feed.as_str().to_owned(), + kind: ReferenceKind::HeaderLinkedFeed, + }); + } + + refs +} + pub fn normalise_language(lang_string: &mut String) { *lang_string = lang_string.to_lowercase(); let mut pieces = lang_string diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index 5ad9c27..a65062a 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -7,24 +7,18 @@ use std::ops::Deref; #[derive(Serialize, Deserialize, Clone, Debug)] pub struct DenseDocument { - head: DenseHead, - body: Vec, -} - -impl DenseDocument { - pub fn from_document(_root_node: NodeRef) { - todo!() - } + pub head: DenseHead, + pub body_content: Vec, + pub body_remainder: Vec, } #[derive(Serialize, Deserialize, Clone, Debug)] pub struct DenseHead { - title: String, - feed_urls: Vec, + pub title: String, /// Language of the page. May be empty if not discovered. - language: String, + pub language: String, /// URL to icon of the page. May be empty if none were discovered. - icon: String, + pub icon: String, } #[derive(Serialize, Deserialize, Clone, Debug)] diff --git a/quickpeep_structs/Cargo.toml b/quickpeep_structs/Cargo.toml index fee54ad..358d454 100644 --- a/quickpeep_structs/Cargo.toml +++ b/quickpeep_structs/Cargo.toml @@ -7,5 +7,7 @@ edition = "2021" [dependencies] bitflags = "1.3.2" +bitflags_serde_shim = "0.2.2" #arc-interner = "0.7.0" -quickpeep_densedoc = { path = "../quickpeep_densedoc" } \ No newline at end of file +quickpeep_densedoc = { path = "../quickpeep_densedoc" } +serde = { version = "1.0.136", features = ["derive"] } diff --git a/quickpeep_structs/src/rake_entries.rs b/quickpeep_structs/src/rake_entries.rs index 7389b70..47d90f2 100644 --- a/quickpeep_structs/src/rake_entries.rs +++ b/quickpeep_structs/src/rake_entries.rs @@ -1,4 +1,7 @@ use bitflags::bitflags; +use bitflags_serde_shim::impl_serde_for_bitflags; +use quickpeep_densedoc::DenseDocument; +use serde::{Deserialize, Serialize}; bitflags! { pub struct AnalysisAntifeatures: u8 { @@ -17,8 +20,31 @@ bitflags! { } } +impl_serde_for_bitflags!(AnalysisAntifeatures); + +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct RakedPageEntry { pub analysed_antifeatures: AnalysisAntifeatures, - //pub article: Option, - //pub non_article: Option, + pub document: DenseDocument, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct RakedReferrerEntry { + pub references: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct RakedReference { + pub target: String, + pub kind: ReferenceKind, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub enum ReferenceKind { + CanonicalUrl, + Redirect, + Link, + HeaderLinkedFeed, + FeedEntry, + SitemapEntry, } diff --git a/scripts/get_psl.sh b/scripts/get_psl.sh new file mode 100755 index 0000000..0376a8c --- /dev/null +++ b/scripts/get_psl.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +set -eu + +dir_path="$(dirname "$0")" + +mkdir -p "$dir_path/../data" +wget -O "$dir_path/../data/public_suffices.list" https://publicsuffix.org/list/public_suffix_list.dat +