Move get_reduced_domain
This commit is contained in:
parent
ad10b9eb38
commit
6f596b54dc
18
Cargo.lock
generated
18
Cargo.lock
generated
@ -3280,6 +3280,15 @@ version = "1.0.6"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
|
checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "patricia_tree"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "52c4b8ef84caee22395fa083b7d8ee9351e71cdf69a46c832528acdcac402117"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "peeking_take_while"
|
name = "peeking_take_while"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
@ -3649,6 +3658,7 @@ dependencies = [
|
|||||||
"colour",
|
"colour",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"log",
|
"log",
|
||||||
|
"patricia_tree",
|
||||||
"quickpeep_densedoc",
|
"quickpeep_densedoc",
|
||||||
"quickpeep_index",
|
"quickpeep_index",
|
||||||
"quickpeep_seed_parser",
|
"quickpeep_seed_parser",
|
||||||
@ -3656,6 +3666,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_bare",
|
"serde_bare",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"smartstring",
|
||||||
"tokio",
|
"tokio",
|
||||||
"toml",
|
"toml",
|
||||||
"zstd",
|
"zstd",
|
||||||
@ -3757,6 +3768,7 @@ dependencies = [
|
|||||||
"lazy_static",
|
"lazy_static",
|
||||||
"log",
|
"log",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -4328,11 +4340,13 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "smartstring"
|
name = "smartstring"
|
||||||
version = "1.0.0"
|
version = "1.0.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ea958ad90cacc8ece7f238fde3671e1b350ee1741964edf2a22fd16f60224163"
|
checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
"static_assertions",
|
"static_assertions",
|
||||||
|
"version_check",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -13,7 +13,6 @@ use tokio::sync::mpsc::Receiver;
|
|||||||
|
|
||||||
use quickpeep_raker::config;
|
use quickpeep_raker::config;
|
||||||
use quickpeep_raker::config::RakerConfig;
|
use quickpeep_raker::config::RakerConfig;
|
||||||
use quickpeep_raker::raking::analysis::get_reduced_domain;
|
|
||||||
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
|
||||||
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
|
||||||
use quickpeep_raker::storage::{maintenance, RakerStore};
|
use quickpeep_raker::storage::{maintenance, RakerStore};
|
||||||
@ -21,6 +20,7 @@ use quickpeep_seed_parser::loader::{
|
|||||||
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
|
||||||
};
|
};
|
||||||
use quickpeep_utils::dirty::DirtyTracker;
|
use quickpeep_utils::dirty::DirtyTracker;
|
||||||
|
use quickpeep_utils::urls::get_reduced_domain;
|
||||||
|
|
||||||
/// Seeds a raker's queue with URLs
|
/// Seeds a raker's queue with URLs
|
||||||
#[derive(Clone, Debug, Parser)]
|
#[derive(Clone, Debug, Parser)]
|
||||||
|
@ -4,8 +4,6 @@ use anyhow::Context;
|
|||||||
use ipnetwork::IpNetwork;
|
use ipnetwork::IpNetwork;
|
||||||
use kuchiki::NodeRef;
|
use kuchiki::NodeRef;
|
||||||
use lingua::Language;
|
use lingua::Language;
|
||||||
use reqwest::Url;
|
|
||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::{BTreeSet, HashSet};
|
use std::collections::{BTreeSet, HashSet};
|
||||||
use std::net::IpAddr;
|
use std::net::IpAddr;
|
||||||
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
|
||||||
@ -192,15 +190,6 @@ impl IpSet {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
|
|
||||||
let domain = url.domain().context("URLs must have domains")?;
|
|
||||||
|
|
||||||
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
|
|
||||||
Some(stripped) => stripped,
|
|
||||||
None => domain,
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use crate::raking::analysis::IpSet;
|
use crate::raking::analysis::IpSet;
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
use crate::raking::analysis::get_reduced_domain;
|
|
||||||
use crate::raking::references::references_from_urlrakes;
|
use crate::raking::references::references_from_urlrakes;
|
||||||
use crate::raking::{
|
use crate::raking::{
|
||||||
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
|
||||||
@ -15,6 +14,7 @@ use quickpeep_structs::rake_entries::{
|
|||||||
IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
|
||||||
};
|
};
|
||||||
use quickpeep_utils::dates::date_to_quickpeep_days;
|
use quickpeep_utils::dates::date_to_quickpeep_days;
|
||||||
|
use quickpeep_utils::urls::get_reduced_domain;
|
||||||
use reqwest::{Client, Url};
|
use reqwest::{Client, Url};
|
||||||
use std::borrow::{Borrow, Cow};
|
use std::borrow::{Borrow, Cow};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
use crate::raking::analysis::get_reduced_domain;
|
|
||||||
use crate::raking::{RakeIntent, TemporaryFailure};
|
use crate::raking::{RakeIntent, TemporaryFailure};
|
||||||
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64};
|
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64};
|
||||||
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
|
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
|
||||||
@ -14,6 +13,7 @@ use libmdbx::{
|
|||||||
use log::info;
|
use log::info;
|
||||||
use metrics::{describe_gauge, gauge, Unit};
|
use metrics::{describe_gauge, gauge, Unit};
|
||||||
use ouroboros::self_referencing;
|
use ouroboros::self_referencing;
|
||||||
|
use quickpeep_utils::urls::get_reduced_domain;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use std::borrow::{Borrow, Cow};
|
use std::borrow::{Borrow, Cow};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
@ -11,3 +11,4 @@ lazy_static = "1.4.0"
|
|||||||
anyhow = "1.0.56"
|
anyhow = "1.0.56"
|
||||||
tokio = { version = "1.17.0", features = ["sync"] }
|
tokio = { version = "1.17.0", features = ["sync"] }
|
||||||
log = "0.4.14"
|
log = "0.4.14"
|
||||||
|
url = "2.2.2"
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
pub mod dates;
|
pub mod dates;
|
||||||
pub mod dirty;
|
pub mod dirty;
|
||||||
pub mod lazy;
|
pub mod lazy;
|
||||||
|
pub mod urls;
|
||||||
|
12
quickpeep_utils/src/urls.rs
Normal file
12
quickpeep_utils/src/urls.rs
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
use anyhow::Context;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
/// Returns the domain of `url` with one leading `"www."` prefix removed.
///
/// The result always borrows from `url` (`Cow::Borrowed`); this function never allocates.
/// A domain that does not start with `"www."` is returned unchanged.
///
/// # Errors
///
/// Returns an error with the context message "URLs must have domains" when
/// `url.domain()` yields `None`.
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
|
||||||
|
let domain = url.domain().context("URLs must have domains")?;
|
||||||
|
|
||||||
|
// `strip_prefix` removes at most one occurrence, so only the first "www." is dropped.
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
|
||||||
|
Some(stripped) => stripped,
|
||||||
|
None => domain,
|
||||||
|
}))
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user