From 6f596b54dccdee774f56e21c285726784cacf97c Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Sun, 27 Mar 2022 20:49:20 +0100 Subject: [PATCH] Move get_reduced_domain --- Cargo.lock | 18 ++++++++++++++++-- quickpeep_raker/src/bin/qp-seedrake.rs | 2 +- quickpeep_raker/src/raking/analysis.rs | 11 ----------- quickpeep_raker/src/raking/task.rs | 2 +- quickpeep_raker/src/storage.rs | 2 +- quickpeep_utils/Cargo.toml | 3 ++- quickpeep_utils/src/lib.rs | 1 + quickpeep_utils/src/urls.rs | 12 ++++++++++++ 8 files changed, 34 insertions(+), 17 deletions(-) create mode 100644 quickpeep_utils/src/urls.rs diff --git a/Cargo.lock b/Cargo.lock index 6ee129a..37ad332 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3280,6 +3280,15 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5" +[[package]] +name = "patricia_tree" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c4b8ef84caee22395fa083b7d8ee9351e71cdf69a46c832528acdcac402117" +dependencies = [ + "bitflags", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -3649,6 +3658,7 @@ dependencies = [ "colour", "env_logger", "log", + "patricia_tree", "quickpeep_densedoc", "quickpeep_index", "quickpeep_seed_parser", @@ -3656,6 +3666,7 @@ dependencies = [ "serde", "serde_bare", "serde_json", + "smartstring", "tokio", "toml", "zstd", @@ -3757,6 +3768,7 @@ dependencies = [ "lazy_static", "log", "tokio", + "url", ] [[package]] @@ -4328,11 +4340,13 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "smartstring" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea958ad90cacc8ece7f238fde3671e1b350ee1741964edf2a22fd16f60224163" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" dependencies = [ + "autocfg", 
"static_assertions", + "version_check", ] [[package]] diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index 6c6bda1..0a38b5e 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -13,7 +13,6 @@ use tokio::sync::mpsc::Receiver; use quickpeep_raker::config; use quickpeep_raker::config::RakerConfig; -use quickpeep_raker::raking::analysis::get_reduced_domain; use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent}; use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord}; use quickpeep_raker::storage::{maintenance, RakerStore}; @@ -21,6 +20,7 @@ use quickpeep_seed_parser::loader::{ find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION, }; use quickpeep_utils::dirty::DirtyTracker; +use quickpeep_utils::urls::get_reduced_domain; /// Seeds a raker's queue with URLs #[derive(Clone, Debug, Parser)] diff --git a/quickpeep_raker/src/raking/analysis.rs b/quickpeep_raker/src/raking/analysis.rs index d8d04a1..8b75598 100644 --- a/quickpeep_raker/src/raking/analysis.rs +++ b/quickpeep_raker/src/raking/analysis.rs @@ -4,8 +4,6 @@ use anyhow::Context; use ipnetwork::IpNetwork; use kuchiki::NodeRef; use lingua::Language; -use reqwest::Url; -use std::borrow::Cow; use std::collections::{BTreeSet, HashSet}; use std::net::IpAddr; use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader}; @@ -192,15 +190,6 @@ impl IpSet { } } -pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<str>> { - let domain = url.domain().context("URLs must have domains")?; - - Ok(Cow::Borrowed(match domain.strip_prefix("www.") { - Some(stripped) => stripped, - None => domain, - })) -} - #[cfg(test)] mod test { use crate::raking::analysis::IpSet; diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index 280e9be..56e0b6f 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -1,4 +1,3 @@ -use
crate::raking::analysis::get_reduced_domain; use crate::raking::references::references_from_urlrakes; use crate::raking::{ get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent, @@ -15,6 +14,7 @@ use quickpeep_structs::rake_entries::{ IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind, }; use quickpeep_utils::dates::date_to_quickpeep_days; +use quickpeep_utils::urls::get_reduced_domain; use reqwest::{Client, Url}; use std::borrow::{Borrow, Cow}; use std::collections::HashSet; diff --git a/quickpeep_raker/src/storage.rs b/quickpeep_raker/src/storage.rs index 1a92c43..d7fe180 100644 --- a/quickpeep_raker/src/storage.rs +++ b/quickpeep_raker/src/storage.rs @@ -1,4 +1,3 @@ -use crate::raking::analysis::get_reduced_domain; use crate::raking::{RakeIntent, TemporaryFailure}; use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64}; use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION}; @@ -14,6 +13,7 @@ use libmdbx::{ use log::info; use metrics::{describe_gauge, gauge, Unit}; use ouroboros::self_referencing; +use quickpeep_utils::urls::get_reduced_domain; use reqwest::Url; use std::borrow::{Borrow, Cow}; use std::collections::HashSet; diff --git a/quickpeep_utils/Cargo.toml b/quickpeep_utils/Cargo.toml index e3f01e4..37c8b54 100644 --- a/quickpeep_utils/Cargo.toml +++ b/quickpeep_utils/Cargo.toml @@ -10,4 +10,5 @@ chrono = "0.4.19" lazy_static = "1.4.0" anyhow = "1.0.56" tokio = { version = "1.17.0", features = ["sync"] } -log = "0.4.14" \ No newline at end of file +log = "0.4.14" +url = "2.2.2" diff --git a/quickpeep_utils/src/lib.rs b/quickpeep_utils/src/lib.rs index ae342d9..3e80b73 100644 --- a/quickpeep_utils/src/lib.rs +++ b/quickpeep_utils/src/lib.rs @@ -1,3 +1,4 @@ pub mod dates; pub mod dirty; pub mod lazy; +pub mod urls; diff --git a/quickpeep_utils/src/urls.rs b/quickpeep_utils/src/urls.rs new file mode 100644 index 0000000..a8ef851 --- /dev/null +++ 
b/quickpeep_utils/src/urls.rs @@ -0,0 +1,12 @@ +use anyhow::Context; +use std::borrow::Cow; +use url::Url; + +pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<str>> { + let domain = url.domain().context("URLs must have domains")?; + + Ok(Cow::Borrowed(match domain.strip_prefix("www.") { + Some(stripped) => stripped, + None => domain, + })) +}