Move get_reduced_domain

Olivier 'reivilibre' 2022-03-27 20:49:20 +01:00
parent ad10b9eb38
commit 6f596b54dc
8 changed files with 34 additions and 17 deletions
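In effect, the helper moves out of the raker crate into the shared utils crate. A minimal sketch of the import change at call sites, using only the module paths visible in the diffs below:

// Before this commit (removed import):
use quickpeep_raker::raking::analysis::get_reduced_domain;

// After this commit (added import):
use quickpeep_utils::urls::get_reduced_domain;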

Cargo.lock (generated)

@@ -3280,6 +3280,15 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
[[package]]
name = "patricia_tree"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52c4b8ef84caee22395fa083b7d8ee9351e71cdf69a46c832528acdcac402117"
dependencies = [
"bitflags",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -3649,6 +3658,7 @@ dependencies = [
"colour",
"env_logger",
"log",
"patricia_tree",
"quickpeep_densedoc",
"quickpeep_index",
"quickpeep_seed_parser",
@@ -3656,6 +3666,7 @@ dependencies = [
"serde",
"serde_bare",
"serde_json",
"smartstring",
"tokio",
"toml",
"zstd",
@@ -3757,6 +3768,7 @@ dependencies = [
"lazy_static",
"log",
"tokio",
"url",
]
[[package]]
@@ -4328,11 +4340,13 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
[[package]]
name = "smartstring"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea958ad90cacc8ece7f238fde3671e1b350ee1741964edf2a22fd16f60224163"
checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29"
dependencies = [
"autocfg",
"static_assertions",
"version_check",
]
[[package]]


@@ -13,7 +13,6 @@ use tokio::sync::mpsc::Receiver;
use quickpeep_raker::config;
use quickpeep_raker::config::RakerConfig;
use quickpeep_raker::raking::analysis::get_reduced_domain;
use quickpeep_raker::raking::{get_robots_txt_for, RakeIntent};
use quickpeep_raker::storage::records::{AllowedDomainRecord, WeedDomainRecord};
use quickpeep_raker::storage::{maintenance, RakerStore};
@@ -21,6 +20,7 @@ use quickpeep_seed_parser::loader::{
find_seed_files, seed_loader, Seed, UrlOrUrlPattern, SEED_EXTENSION, WEED_EXTENSION,
};
use quickpeep_utils::dirty::DirtyTracker;
use quickpeep_utils::urls::get_reduced_domain;
/// Seeds a raker's queue with URLs
#[derive(Clone, Debug, Parser)]


@@ -4,8 +4,6 @@ use anyhow::Context;
use ipnetwork::IpNetwork;
use kuchiki::NodeRef;
use lingua::Language;
use reqwest::Url;
use std::borrow::Cow;
use std::collections::{BTreeSet, HashSet};
use std::net::IpAddr;
use tokio::io::{AsyncBufReadExt, AsyncRead, BufReader};
@@ -192,15 +190,6 @@ impl IpSet {
}
}
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
let domain = url.domain().context("URLs must have domains")?;
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
Some(stripped) => stripped,
None => domain,
}))
}
#[cfg(test)]
mod test {
use crate::raking::analysis::IpSet;


@@ -1,4 +1,3 @@
use crate::raking::analysis::get_reduced_domain;
use crate::raking::references::references_from_urlrakes;
use crate::raking::{
get_robots_txt_for, robots_txt_url_for, PermanentFailure, PermanentFailureReason, RakeIntent,
@@ -15,6 +14,7 @@ use quickpeep_structs::rake_entries::{
IconEntry, RakedPageEntry, RakedReference, RakedReferrerEntry, ReferenceKind,
};
use quickpeep_utils::dates::date_to_quickpeep_days;
use quickpeep_utils::urls::get_reduced_domain;
use reqwest::{Client, Url};
use std::borrow::{Borrow, Cow};
use std::collections::HashSet;


@@ -1,4 +1,3 @@
use crate::raking::analysis::get_reduced_domain;
use crate::raking::{RakeIntent, TemporaryFailure};
use crate::storage::mdbx_helper_types::{MdbxBare, MdbxString, MdbxU32, MdbxU64};
use crate::storage::migrations::{MIGRATION_KEY, MIGRATION_VERSION};
@@ -14,6 +13,7 @@ use libmdbx::{
use log::info;
use metrics::{describe_gauge, gauge, Unit};
use ouroboros::self_referencing;
use quickpeep_utils::urls::get_reduced_domain;
use reqwest::Url;
use std::borrow::{Borrow, Cow};
use std::collections::HashSet;


@@ -11,3 +11,4 @@ lazy_static = "1.4.0"
anyhow = "1.0.56"
tokio = { version = "1.17.0", features = ["sync"] }
log = "0.4.14"
url = "2.2.2"


@@ -1,3 +1,4 @@
pub mod dates;
pub mod dirty;
pub mod lazy;
pub mod urls;


@@ -0,0 +1,12 @@
use anyhow::Context;
use std::borrow::Cow;
use url::Url;
pub fn get_reduced_domain(url: &Url) -> anyhow::Result<Cow<'_, str>> {
let domain = url.domain().context("URLs must have domains")?;
Ok(Cow::Borrowed(match domain.strip_prefix("www.") {
Some(stripped) => stripped,
None => domain,
}))
}
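For illustration only, a small sketch of how the relocated helper behaves; the example URLs are made up, but the crate path and the function's signature come from the new file above:

use anyhow::Result;
use quickpeep_utils::urls::get_reduced_domain;
use url::Url;

fn main() -> Result<()> {
    // The "www." prefix is stripped, so both URLs reduce to the same domain.
    let with_www = Url::parse("https://www.example.org/page")?;
    let without_www = Url::parse("https://example.org/other")?;
    assert_eq!(get_reduced_domain(&with_www)?, get_reduced_domain(&without_www)?);
    assert_eq!(get_reduced_domain(&with_www)?, "example.org");

    // URLs whose host is not a domain name (e.g. an IP address) yield an error.
    let ip_url = Url::parse("http://127.0.0.1/")?;
    assert!(get_reduced_domain(&ip_url).is_err());
    Ok(())
}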