This commit is contained in:
parent
c3ccd64d5f
commit
5be6cade11
32
docs/internals/formats/seeds.md
Normal file
32
docs/internals/formats/seeds.md
Normal file
@ -0,0 +1,32 @@
|
||||
# QuickPeep Seed Formats
|
||||
|
||||
## Motivation
|
||||
|
||||
The QuickPeep seed format is a simple textual format that can be used to house
|
||||
seeds (initial URLs for the raker and categories for the indexer).
|
||||
|
||||
The main seed pack will be tracked in Git and released under an open data licence.
|
||||
|
||||
It may be useful to other projects to have such a data set available.
|
||||
Contributions to the set of seeds will help the search engine gain results.
|
||||
|
||||
|
||||
## The format
|
||||
|
||||
```quickpeepseed
|
||||
# Remark
|
||||
Category1, Category2:
|
||||
https://example.org
|
||||
https://example.com/blah/* [Tag3, Tag4]
|
||||
|
||||
Category4, Category5:
|
||||
https://blahblahblah.com
|
||||
```
|
||||
|
||||
A file consists of blocks (perhaps only one block).
|
||||
A block starts with a header line: a comma-separated list of tags (usually broad categories) followed by a colon.
|
||||
The block then continues with one URL or URL pattern per line.
|
||||
A URL or URL pattern may optionally be followed by a square-bracketed list of additional tags.
|
||||
|
||||
A block should ideally end on a blank line, but this is not required.
|
||||
Blank lines and lines beginning with `#` are ignored.
|
@ -1,11 +1,14 @@
|
||||
use clap::Parser;
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use env_logger::Env;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tokio::sync::mpsc::{Receiver, Sender};
|
||||
|
||||
use quickpeep_raker::config;
|
||||
use quickpeep_raker::storage::RakerStore;
|
||||
|
||||
/// Seeds a raker's queue with URLs
|
||||
#[derive(Clone, Debug, Parser)]
|
||||
@ -39,9 +42,73 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
bail!("Seed directory ({:?}) doesn't exist.", config.seed_dir);
|
||||
}
|
||||
|
||||
let store = RakerStore::open(&config.workbench_dir.join("raker.mdbx"))?;
|
||||
|
||||
// TODO progress bar?
|
||||
|
||||
// TODO discover sitemaps at the same time as digging up robots.txt files
|
||||
|
||||
eprintln!("{:#?}", config);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO use the smart string
// Placeholder alias: for now `SmartString` is just the standard `String`.
|
||||
type SmartString = String;
|
||||
|
||||
// TODO use the arc interner
// Placeholder alias: for now `ArcIntern<T>` is simply `T` itself (no interning is performed).
|
||||
type ArcIntern<T> = T;
|
||||
|
||||
pub struct Seed {
|
||||
url: UrlOrUrlPattern,
|
||||
tags: ArcIntern<BTreeSet<SmartString>>,
|
||||
}
|
||||
|
||||
/// Either a URL or a URL prefix.
///
/// Two values are equal only when they are the same variant and carry the same string.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum UrlOrUrlPattern {
    /// A single, complete URL.
    Url(String),
    /// A URL prefix.
    /// NOTE(review): presumably corresponds to seed-file patterns such as
    /// `https://example.com/blah/*` — confirm against the parser once it exists.
    UrlPrefix(String),
}
|
||||
|
||||
/// Task that loads seeds from the filesystem
|
||||
async fn seed_loader(seed_files: Vec<PathBuf>, send: &Sender<Seed>) -> anyhow::Result<()> {
|
||||
for seed_file in seed_files {
|
||||
// TODO parse the seed file
|
||||
|
||||
// TODO send out seeds
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn find_seed_files(seed_dir: PathBuf) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut dirs = vec![seed_dir];
|
||||
let mut seedfiles = Vec::new();
|
||||
|
||||
while let Some(dir_to_scan) = dirs.pop() {
|
||||
let mut dir = tokio::fs::read_dir(&dir_to_scan).await?;
|
||||
|
||||
while let Some(entry) = dir.next_entry().await? {
|
||||
let path = entry.path();
|
||||
if path.starts_with(".") {
|
||||
continue;
|
||||
}
|
||||
if path.ends_with(".seed") {
|
||||
seedfiles.push(path);
|
||||
continue;
|
||||
}
|
||||
if path.is_dir() {
|
||||
// Recurse into this directory later.
|
||||
dirs.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(seedfiles)
|
||||
}
|
||||
|
||||
/// Task that imports seeds into the store.
///
/// Not implemented yet: the body is a bare `todo!()`, so awaiting this function panics.
/// NOTE(review): presumably it will read seeds from `recv` and write them into `store`
/// — confirm once the implementation lands.
|
||||
async fn importer(store: RakerStore, recv: Receiver<Seed>) -> anyhow::Result<()> {
|
||||
todo!()
|
||||
}
|
||||
|
@ -34,3 +34,9 @@ pub struct BackingOffDomainRecord {
|
||||
/// MUST match the timestamp present in the reinstatements table.
|
||||
pub reinstate_at: u64,
|
||||
}
|
||||
|
||||
/// Serialisable record holding a list of URL patterns for a domain.
/// NOTE(review): how and where these patterns are enforced is not visible here — TODO confirm.
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct DomainMaskRestriction {
|
||||
/// TODO List of acceptable URL patterns...
/// Stored as plain strings; the pattern syntax is not specified at this point.
|
||||
pub patterns: Vec<String>,
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user