Move get_reduced_domain
This commit is contained in:
parent
ad10b9eb38
commit
6f596b54dc
18
Cargo.lock
generated
18
Cargo.lock
generated
@ -3280,6 +3280,15 @@ version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
|
||||
|
||||
[[package]]
|
||||
name = "patricia_tree"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52c4b8ef84caee22395fa083b7d8ee9351e71cdf69a46c832528acdcac402117"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
@ -3649,6 +3658,7 @@ dependencies = [
|
||||
"colour",
|
||||
"env_logger",
|
||||
"log",
|
||||
"patricia_tree",
|
||||
"quickpeep_densedoc",
|
||||
"quickpeep_index",
|
||||
"quickpeep_seed_parser",
|
||||
@ -3656,6 +3666,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_bare",
|
||||
"serde_json",
|
||||
"smartstring",
|
||||
"tokio",
|
||||
"toml",
|
||||
"zstd",
|
||||
@ -3757,6 +3768,7 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -4328,11 +4340,13 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
|
||||
|
||||
[[package]]
|
||||
name = "smartstring"
|
||||
version = "1.0.0"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea958ad90cacc8ece7f238fde3671e1b350ee1741964edf2a22fd16f60224163"
|
||||
checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"static_assertions",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -13,7 +13,6 @@ use tokio::sync::mpsc::Receiver;
|
||||
|
||||
use quickpeep_raker::config;
|
||||
use quickpeep_raker::config::RakerConfig;
|
||||
use quickpeep_raker::raking::analysis::get_reduced_domain;
|
||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||
@ -21,6 +20,7 @@ use quickpeep_seed_parser::loader::{
|
||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
||||
};
|
||||
use quickpeep_utils::dirty::DirtyTracker;
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
|
||||
/// Seeds a raker's queue with URLs
|
||||
#[derive(Clone, Debug, Parser)]
|
||||
|
@ -4,8 +4,6 @@ use anyhow::Context;
|
||||
use ipnetwork::IpNetwork;
|
||||
use kuchiki::NodeRef;
|
||||
use lingua::Language;
|
||||
use reqwest::Url;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::net::IpAddr;
|
||||
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||
@ -192,15 +190,6 @@ impl IpSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Yields the domain of `url`, minus one leading `www.` label when there is one.
///
/// The returned `Cow` is always the `Borrowed` variant and points into `url`.
///
/// # Errors
///
/// Returns an error carrying the context "URLs must have domains" if
/// `url.domain()` is `None`.
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
    let host = url.domain().context("URLs must have domains")?;

    // Only a prefix is removed; a `www.` appearing elsewhere is left alone.
    let trimmed = if let Some(rest) = host.strip_prefix("www.") {
        rest
    } else {
        host
    };

    Ok(Cow::Borrowed(trimmed))
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::raking::analysis::IpSet;
|
||||
|
@ -1,4 +1,3 @@
|
||||
use crate::raking::analysis::get_reduced_domain;
|
||||
use crate::raking::references::references_from_urlrakes;
|
||||
use crate::raking::{
|
||||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
||||
@ -15,6 +14,7 @@ use quickpeep_structs::rake_entries::{
|
||||
IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
||||
};
|
||||
use quickpeep_utils::dates::date_to_quickpeep_days;
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
use reqwest::{Client, Url};
|
||||
use std::borrow::{Borrow, Cow};
|
||||
use std::collections::HashSet;
|
||||
|
@ -1,4 +1,3 @@
|
||||
use crate::raking::analysis::get_reduced_domain;
|
||||
use crate::raking::{RakeIntent, TemporaryFailure};
|
||||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64};
|
||||
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
|
||||
@ -14,6 +13,7 @@ use libmdbx::{
|
||||
use log::info;
|
||||
use metrics::{describe_gauge, gauge, Unit};
|
||||
use ouroboros::self_referencing;
|
||||
use quickpeep_utils::urls::get_reduced_domain;
|
||||
use reqwest::Url;
|
||||
use std::borrow::{Borrow, Cow};
|
||||
use std::collections::HashSet;
|
||||
|
@ -11,3 +11,4 @@ lazy_static = "1.4.0"
|
||||
anyhow = "1.0.56"
|
||||
tokio = { version = "1.17.0", features = ["sync"] }
|
||||
log = "0.4.14"
|
||||
url = "2.2.2"
|
||||
|
@ -1,3 +1,4 @@
|
||||
pub mod dates;
|
||||
pub mod dirty;
|
||||
pub mod lazy;
|
||||
pub mod urls;
|
||||
|
12
quickpeep_utils/src/urls.rs
Normal file
12
quickpeep_utils/src/urls.rs
Normal file
@ -0,0 +1,12 @@
|
||||
use anyhow::Context;
|
||||
use std::borrow::Cow;
|
||||
use url::Url;
|
||||
|
||||
/// Returns the domain of `url` with a single leading `www.` removed, if present.
///
/// The result always borrows from `url`; this function never allocates a new
/// string.
///
/// # Errors
///
/// Fails with the context message "URLs must have domains" when
/// `url.domain()` returns `None`.
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
    let full_domain = url.domain().context("URLs must have domains")?;
    // `strip_prefix` removes at most one occurrence, so `www.www.example.org`
    // reduces to `www.example.org`.
    let reduced = full_domain.strip_prefix("www.").unwrap_or(full_domain);
    Ok(Cow::Borrowed(reduced))
}
|
Loading…
Reference in New Issue
Block a user