Add seed file parser
This commit is contained in:
parent
5e61386a83
commit
39aa4eb9b7
|
@ -309,13 +309,34 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "block-buffer"
|
||||||
|
version = "0.7.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b"
|
||||||
|
dependencies = [
|
||||||
|
"block-padding",
|
||||||
|
"byte-tools",
|
||||||
|
"byteorder",
|
||||||
|
"generic-array 0.12.4",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "block-buffer"
|
name = "block-buffer"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
|
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"generic-array",
|
"generic-array 0.14.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "block-padding"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5"
|
||||||
|
dependencies = [
|
||||||
|
"byte-tools",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -324,6 +345,12 @@ version = "3.9.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899"
|
checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "byte-tools"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byteorder"
|
name = "byteorder"
|
||||||
version = "1.4.3"
|
version = "1.4.3"
|
||||||
|
@ -682,13 +709,22 @@ dependencies = [
|
||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "digest"
|
||||||
|
version = "0.8.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5"
|
||||||
|
dependencies = [
|
||||||
|
"generic-array 0.12.4",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "digest"
|
name = "digest"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
|
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"generic-array",
|
"generic-array 0.14.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -750,6 +786,12 @@ dependencies = [
|
||||||
"termcolor",
|
"termcolor",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fake-simd"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fastrand"
|
name = "fastrand"
|
||||||
version = "1.7.0"
|
version = "1.7.0"
|
||||||
|
@ -963,6 +1005,15 @@ dependencies = [
|
||||||
"x509-signature",
|
"x509-signature",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "generic-array"
|
||||||
|
version = "0.12.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd"
|
||||||
|
dependencies = [
|
||||||
|
"typenum",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "generic-array"
|
name = "generic-array"
|
||||||
version = "0.14.5"
|
version = "0.14.5"
|
||||||
|
@ -2511,6 +2562,12 @@ version = "1.10.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
|
checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "opaque-debug"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opaque-debug"
|
name = "opaque-debug"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
|
@ -2649,6 +2706,49 @@ version = "2.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
||||||
|
dependencies = [
|
||||||
|
"ucd-trie",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_derive"
|
||||||
|
version = "2.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0"
|
||||||
|
dependencies = [
|
||||||
|
"pest",
|
||||||
|
"pest_generator",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_generator"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55"
|
||||||
|
dependencies = [
|
||||||
|
"pest",
|
||||||
|
"pest_meta",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pest_meta"
|
||||||
|
version = "2.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d"
|
||||||
|
dependencies = [
|
||||||
|
"maplit",
|
||||||
|
"pest",
|
||||||
|
"sha-1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf"
|
name = "phf"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
|
@ -2946,6 +3046,16 @@ dependencies = [
|
||||||
"toml",
|
"toml",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quickpeep_seed_parser"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"pest",
|
||||||
|
"pest_derive",
|
||||||
|
"smartstring",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quickpeep_structs"
|
name = "quickpeep_structs"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
@ -3404,17 +3514,29 @@ dependencies = [
|
||||||
"stable_deref_trait",
|
"stable_deref_trait",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sha-1"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df"
|
||||||
|
dependencies = [
|
||||||
|
"block-buffer 0.7.3",
|
||||||
|
"digest 0.8.1",
|
||||||
|
"fake-simd",
|
||||||
|
"opaque-debug 0.2.3",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha2"
|
name = "sha2"
|
||||||
version = "0.9.9"
|
version = "0.9.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800"
|
checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"block-buffer",
|
"block-buffer 0.9.0",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"cpufeatures",
|
"cpufeatures",
|
||||||
"digest",
|
"digest 0.9.0",
|
||||||
"opaque-debug",
|
"opaque-debug 0.3.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -3996,6 +4118,12 @@ version = "1.15.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
|
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ucd-trie"
|
||||||
|
version = "0.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unchecked-index"
|
name = "unchecked-index"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
|
|
|
@ -5,6 +5,7 @@ members = [
|
||||||
"quickpeep_raker",
|
"quickpeep_raker",
|
||||||
"quickpeep_densedoc",
|
"quickpeep_densedoc",
|
||||||
"quickpeep_moz_readability",
|
"quickpeep_moz_readability",
|
||||||
|
"quickpeep_seed_parser",
|
||||||
"quickpeep_structs",
|
"quickpeep_structs",
|
||||||
"quickpeep_utils"
|
"quickpeep_utils"
|
||||||
]
|
]
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
[package]
|
||||||
|
name = "quickpeep_seed_parser"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
pest = "2.1.3"
|
||||||
|
pest_derive = "2.1.0"
|
||||||
|
smartstring = "1.0.0"
|
||||||
|
anyhow = "1.0.56"
|
|
@ -0,0 +1,80 @@
|
||||||
|
extern crate pest;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate pest_derive;
|
||||||
|
|
||||||
|
use anyhow::{bail, ensure, Context};
|
||||||
|
use smartstring::alias::CompactString;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test;
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[grammar = "seed.pest"]
|
||||||
|
struct SeedParser;
|
||||||
|
|
||||||
|
use pest::Parser;
|
||||||
|
|
||||||
|
#[derive(Eq, PartialEq, Clone, Debug)]
|
||||||
|
pub struct SeedBlock {
|
||||||
|
pub tags: Vec<CompactString>,
|
||||||
|
pub seeds: Vec<Seed>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Eq, PartialEq, Clone, Debug)]
|
||||||
|
pub struct Seed {
|
||||||
|
pub url: String,
|
||||||
|
pub extra_tags: Vec<CompactString>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_seeds(input: &str) -> anyhow::Result<Vec<SeedBlock>> {
|
||||||
|
use pest::iterators::Pair;
|
||||||
|
|
||||||
|
pub fn parse_tag(pair: Pair<Rule>) -> anyhow::Result<CompactString> {
|
||||||
|
ensure!(matches!(pair.as_rule(), Rule::tagName));
|
||||||
|
Ok(CompactString::from(pair.as_str()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_url(pair: Pair<Rule>) -> anyhow::Result<String> {
|
||||||
|
ensure!(matches!(pair.as_rule(), Rule::url));
|
||||||
|
Ok(String::from(pair.as_str()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_seedblock_header(pair: Pair<Rule>) -> anyhow::Result<Vec<CompactString>> {
|
||||||
|
pair.into_inner().map(parse_tag).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_seed(pair: Pair<Rule>) -> anyhow::Result<Seed> {
|
||||||
|
let mut children = pair.into_inner();
|
||||||
|
let url = parse_url(children.next().context("Expecting URL")?)?;
|
||||||
|
|
||||||
|
let extra_tags: anyhow::Result<Vec<CompactString>> = children.map(parse_tag).collect();
|
||||||
|
let extra_tags = extra_tags?;
|
||||||
|
|
||||||
|
Ok(Seed { url, extra_tags })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_seedblock(pair: Pair<Rule>) -> anyhow::Result<SeedBlock> {
|
||||||
|
match pair.as_rule() {
|
||||||
|
Rule::seedblock => {
|
||||||
|
let mut children = pair.into_inner();
|
||||||
|
let header = children.next().context("No seedblock header")?;
|
||||||
|
let tags = parse_seedblock_header(header)?;
|
||||||
|
|
||||||
|
let urls: anyhow::Result<Vec<Seed>> = children.map(parse_seed).collect();
|
||||||
|
let seeds = urls?;
|
||||||
|
|
||||||
|
Ok(SeedBlock { tags, seeds })
|
||||||
|
}
|
||||||
|
other => {
|
||||||
|
bail!("Looking for seedblock; unexpected {:#?}", other);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let pairs = SeedParser::parse(Rule::main, input)?;
|
||||||
|
pairs
|
||||||
|
.into_iter()
|
||||||
|
.filter(|pair| pair.as_rule() != Rule::EOI)
|
||||||
|
.map(parse_seedblock)
|
||||||
|
.collect()
|
||||||
|
}
|
|
@ -0,0 +1,20 @@
|
||||||
|
// Entry point: optional blank space, any number of seed blocks, optional blank
// space, then end of input.
main = _{ SOI ~ (WHITESPACE | newline)* ~ seedblock* ~ (WHITESPACE | newline)* ~ EOI }

// A header is a comma-separated list of tag names terminated by a colon.
// The negative lookahead stops a URL from being read as a header.
seedheader = { !urlPrefix ~ tagName ~ ("," ~ tagName)* ~ ":" }
// A block is a header, at least one newline, then one or more URL entries.
seedblock = { seedheader ~ newline+ ~ urlEntry+ }

// One URL, optionally followed by bracketed tags, ending in at least one newline.
// Compound-atomic, so whitespace inside an entry must be matched explicitly.
urlEntry = ${ url ~ (WHITESPACE+ ~ oneUrlTags)? ~ WHITESPACE* ~ newline+ }
urlPrefix = _{ "http://" | "https://" | "gemini://" }
url = @{ urlPrefix ~ nonwhitespace+ }

tagName = @{ ALPHABETIC+ }

// Bracketed, comma-separated tags that apply to a single URL.
oneUrlTags = _{ "[" ~ tagName ~ ("," ~ tagName)* ~ "]" }


// A comment runs from `#` to the end of the line. The newline is optional so that
// a comment on the last line of a file without a trailing newline is accepted.
COMMENT = _{ "#" ~ (!newline ~ ANY)* ~ newline? }

WHITESPACE = _{ " " | "\t" }
nonwhitespace = @{ !(" " | "\t" | "\n") ~ ANY }

newline = _{ "\n" }
|
|
@ -0,0 +1,66 @@
|
||||||
|
use crate::{parse_seeds, Seed, SeedBlock};
|
||||||
|
|
||||||
|
/// A single block with two header tags and one URL without extra tags.
#[test]
fn test_simple_seeds() {
    let input = r#"
Blog, Example:
https://example.org
"#;

    let expected = vec![SeedBlock {
        tags: vec!["Blog".into(), "Example".into()],
        seeds: vec![Seed {
            url: String::from("https://example.org"),
            extra_tags: Vec::new(),
        }],
    }];

    let parsed = parse_seeds(input).unwrap();
    assert_eq!(parsed, expected);
}
|
||||||
|
|
||||||
|
/// Two blocks, with comments between entries and a URL carrying an extra tag.
#[test]
fn test_complicated_seeds() {
    let input = r#"
Blog, Example:
https://example.org
# This demonstrates fancy extra tags
https://example.com [Commercial]

# This demonstrates another seed block
Software:
https://www2.example.org
"#;

    let first_block = SeedBlock {
        tags: vec!["Blog".into(), "Example".into()],
        seeds: vec![
            Seed {
                url: String::from("https://example.org"),
                extra_tags: Vec::new(),
            },
            Seed {
                url: String::from("https://example.com"),
                extra_tags: vec!["Commercial".into()],
            },
        ],
    };
    let second_block = SeedBlock {
        tags: vec!["Software".into()],
        seeds: vec![Seed {
            url: String::from("https://www2.example.org"),
            extra_tags: Vec::new(),
        }],
    };

    let parsed = parse_seeds(input).unwrap();
    assert_eq!(parsed, vec![first_block, second_block]);
}
|
Loading…
Reference in New Issue