STASH notes about seeds
continuous-integration/drone the build failed
Details
continuous-integration/drone the build failed
Details
This commit is contained in:
parent
c3ccd64d5f
commit
5be6cade11
|
@ -0,0 +1,32 @@
|
||||||
|
# QuickPeep Seed Formats
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
The QuickPeep seed format is a simple textual format that can be used to house
|
||||||
|
seeds (initial URLs for the raker and categories for the indexer).
|
||||||
|
|
||||||
|
The main seed pack will be tracked in Git and released under an open data licence.
|
||||||
|
|
||||||
|
It may be useful to other projects to have such a data set available.
|
||||||
|
Contributions to the set of seeds will help the search engine gain results.
|
||||||
|
|
||||||
|
|
||||||
|
## The format
|
||||||
|
|
||||||
|
```quickpeepseed
|
||||||
|
# Remark
|
||||||
|
Category1, Category2:
|
||||||
|
https://example.org
|
||||||
|
https://example.com/blah/* [Tag3, Tag4]
|
||||||
|
|
||||||
|
Category4, Category5:
|
||||||
|
https://blahblahblah.com
|
||||||
|
```
|
||||||
|
|
||||||
|
A file consists of blocks (perhaps only one block).
|
||||||
|
A block starts with a header line: a comma-separated list of tags (usually broad categories) followed by a colon.
|
||||||
|
The block then continues with one URL or URL pattern per line.
|
||||||
|
A URL or URL pattern may optionally be followed by a square-bracketed list of additional tags.
|
||||||
|
|
||||||
|
A block should ideally end on a blank line, but this is not required.
|
||||||
|
Blank lines and lines beginning with `#` are ignored.
|
|
@ -1,11 +1,14 @@
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
use env_logger::Env;
|
use env_logger::Env;
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use std::path::PathBuf;
|
use std::path::{Path, PathBuf};
|
||||||
|
use tokio::sync::mpsc::{Receiver, Sender};
|
||||||
|
|
||||||
use quickpeep_raker::config;
|
use quickpeep_raker::config;
|
||||||
|
use quickpeep_raker::storage::RakerStore;
|
||||||
|
|
||||||
/// Seeds a raker's queue with URLs
|
/// Seeds a raker's queue with URLs
|
||||||
#[derive(Clone, Debug, Parser)]
|
#[derive(Clone, Debug, Parser)]
|
||||||
|
@ -39,9 +42,73 @@ pub async fn main() -> anyhow::Result<()> {
|
||||||
bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir);
|
bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
|
||||||
|
|
||||||
|
// TODO progress bar?
|
||||||
|
|
||||||
// TODO discover sitemaps at the same time as digging up robots.txt files
|
// TODO discover sitemaps at the same time as digging up robots.txt files
|
||||||
|
|
||||||
eprintln!("{:#?}", config);
|
eprintln!("{:#?}", config);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: swap this alias for a real small-string type.
// Until then, `SmartString` is nothing more than a plain `String`.
type SmartString = String;

// TODO: swap this alias for a real `Arc`-based interner.
// Until then, `ArcIntern<T>` is a transparent alias for `T` itself (no interning happens).
type ArcIntern<T> = T;
|
||||||
|
|
||||||
|
pub struct Seed {
|
||||||
|
url: UrlOrUrlPattern,
|
||||||
|
tags: ArcIntern<BTreeSet<SmartString>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The target of a seed: either one URL or a URL prefix.
#[derive(Clone, Debug)]
pub enum UrlOrUrlPattern {
    /// A single URL.
    Url(String),
    /// A URL prefix.
    ///
    /// NOTE(review): presumably built from seed-file patterns such as
    /// `https://example.com/blah/*` — confirm once the parser exists.
    UrlPrefix(String),
}
|
||||||
|
|
||||||
|
/// Task that loads seeds from the filesystem
|
||||||
|
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||||
|
for seed_file in seed_files {
|
||||||
|
// TODO parse the seed file
|
||||||
|
|
||||||
|
// TODO send out seeds
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> {
|
||||||
|
let mut dirs = vec![seed_dir];
|
||||||
|
let mut seedfiles = Vec::new();
|
||||||
|
|
||||||
|
while let Some(dir_to_scan) = dirs.pop() {
|
||||||
|
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
|
||||||
|
|
||||||
|
while let Some(entry) = dir.next_entry().await? {
|
||||||
|
let path = entry.path();
|
||||||
|
if path.starts_with(".") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if path.ends_with(".seed") {
|
||||||
|
seedfiles.push(path);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if path.is_dir() {
|
||||||
|
// Recurse into this directory later.
|
||||||
|
dirs.push(path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(seedfiles)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Task that imports seeds into the store
|
||||||
|
async fn importer(store: RakerStore, recv: Receiver<Seed>) -> anyhow::Result<()> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
|
@ -34,3 +34,9 @@ pub struct BackingOffDomainRecord {
|
||||||
/// MUST match the timestamp present in the reinstatements table.
|
/// MUST match the timestamp present in the reinstatements table.
|
||||||
pub reinstate_at: u64,
|
pub reinstate_at: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
pub struct DomainMaskRestriction {
|
||||||
|
/// TODO List of acceptable URL patterns...
|
||||||
|
pub patterns: Vec<String>,
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue