diff --git a/quickpeep_densedoc/src/lib.rs b/quickpeep_densedoc/src/lib.rs index a65062a..32cce5d 100644 --- a/quickpeep_densedoc/src/lib.rs +++ b/quickpeep_densedoc/src/lib.rs @@ -272,7 +272,7 @@ impl DenseTreeBuilder { self.preceding_newlines = 0; } "img" => { - // TODO Decide if this is worth the space... + // TODO(future) Decide if this is worth the space... let attrs = element.attributes.borrow(); let src = attrs.get("src").unwrap_or("").to_owned(); diff --git a/quickpeep_moz_readability/src/lib.rs b/quickpeep_moz_readability/src/lib.rs index 4b6aa1a..646b927 100644 --- a/quickpeep_moz_readability/src/lib.rs +++ b/quickpeep_moz_readability/src/lib.rs @@ -26,17 +26,17 @@ const FLAG_WEIGHT_CLASSES: u32 = 0x2; const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4; const READABILITY_SCORE: &'static str = "readability-score"; const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml"; -// TODO: Change to HashSet +// TODO(perf): Change to HashSet const PHRASING_ELEMS: [&str; 39] = [ "abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object", "output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", "sub", "sup", "textarea", "time", "var", "wbr", ]; -// TODO: Change to HashSet +// TODO(perf): Change to HashSet const DEFAULT_TAGS_TO_SCORE: [&str; 9] = ["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"]; -// TODO: Change to HashSet +// TODO(perf): Change to HashSet const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"]; const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ "align", @@ -54,7 +54,7 @@ const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [ ]; const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"]; -// TODO: Change to HashSet +// TODO(perf): Change to HashSet const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"]; pub mod regexes; @@ -168,7 +168,7 @@ impl Readability { continue; } if let Some(mut prev_elem) = noscript.as_node().previous_sibling() { - // TODO: Fix this to have a better way of extracting nodes that are elements + // TODO(style): Fix this to have a better way of extracting nodes that are elements while prev_elem.as_element().is_none() { match prev_elem.previous_sibling() { Some(new_prev) => prev_elem = new_prev, @@ -389,7 +389,7 @@ impl Readability { /// The must_be_element argument ensure the next element is actually an element node. /// This is likely to factored out into a new function. fn next_element(node_ref: Option, must_be_element: bool) -> Option { - // TODO: Could probably be refactored to use the elements method + // TODO(style): Could probably be refactored to use the elements method let mut node_ref = node_ref; while node_ref.is_some() { match node_ref.as_ref().unwrap().data() { @@ -544,7 +544,7 @@ impl Readability { /// Converts some of the common HTML entities in string to their corresponding characters. fn unescape_html_entities(value: &mut String) { if !value.is_empty() { - // TODO: Extract this + // TODO(style): Extract this let mut html_escape_map: HashMap<&str, &str> = HashMap::new(); html_escape_map.insert("lt", "<"); html_escape_map.insert("gt", ">"); @@ -637,7 +637,7 @@ impl Readability { /// Removes the class="" attribute from every element in the given subtree, except those that /// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object. fn clean_classes(&mut self) { - // TODO: This should accessed from Self + // TODO(style): This should accessed from Self let classes_to_preserve: HashSet<&str> = HashSet::new(); if let Some(article_node) = &mut self.article_node { for elem in article_node.inclusive_descendants().elements() { @@ -790,7 +790,7 @@ impl Readability { /// Run any post-process modifications to article content as necessary. fn post_process_content(&mut self, url: &str) { self.fix_relative_uris(url); - // TODO: Add flag check + // TODO(style): Add flag check self.clean_classes(); self.clean_readability_attrs(); } @@ -1041,7 +1041,7 @@ impl Readability { /// Determine whether element has any children block level elements. fn has_child_block_element(node_ref: &NodeRef) -> bool { - // TODO: Refer to a static HashSet + // TODO(perf): Refer to a static HashSet let block_level_elems: [&str; 32] = [ "address", "article", @@ -1622,13 +1622,13 @@ impl Readability { let page = self.root_node.select_first("body"); if page.is_err() { bail!("Document has no "); - // TODO return Err(ErrorKind::ReadabilityError("Document has no ".into()).into()); + // TODO(style) return Err(ErrorKind::ReadabilityError("Document has no ".into()).into()); } let page = page.unwrap(); let mut attempts: Vec = Vec::new(); // var pageCacheHtml = page.innerHTML; - //TODO: Add page cache + //TODO(???): Add page cache loop { // var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); @@ -2197,7 +2197,7 @@ mod test { use kuchiki::traits::*; use kuchiki::NodeRef; - // TODO: Refactor not to use test file possibly + // TODO(style): Refactor not to use test file possibly const TEST_HTML: &'static str = include_str!("../test_html/simple.html"); #[test] @@ -3867,7 +3867,7 @@ characters. For that reason, this

tag could not be a byline because it's too #[test] fn test_clean_classes() { - // TODO: This test will later be edited to ensure it checks to only remove certain classes + // TODO(task): This test will later be edited to ensure it checks to only remove certain classes let html_str = r#" diff --git a/quickpeep_raker/src/bin/qp-rake1.rs b/quickpeep_raker/src/bin/qp-rake1.rs index 2531dc4..e67cd57 100644 --- a/quickpeep_raker/src/bin/qp-rake1.rs +++ b/quickpeep_raker/src/bin/qp-rake1.rs @@ -42,14 +42,14 @@ pub async fn main() -> anyhow::Result<()> { let client = reqwest::ClientBuilder::new() .timeout(TIME_LIMIT) .default_headers(header_map) - // TODO We want to handle redirects ourselves so we can track them... + // We want to handle redirects ourselves so we can track them... .redirect(Policy::none()) .build()?; let mut adblock_engines = Vec::new(); for (antifeature, name) in &ADBLOCK_FILTER_PATHS { - // TODO Don't hardcode these paths in quite as bad a way... + // TODO(not important) Don't hardcode these paths in quite as bad a way... let path = PathBuf::from(format!("./data/{}.adblock", name)); if !path.exists() { warn!("Missing adblock rules: {:?}.", path); @@ -94,16 +94,12 @@ pub async fn main() -> anyhow::Result<()> { } RakeOutcome::RakedFeed(feed) => { green_ln!("Feed"); - // TODO - println!(); let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry); print_references(&refs); } RakeOutcome::RakedSitemap(sitemap) => { green_ln!("Sitemap"); - // TODO - println!(); let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry); print_references(&refs); diff --git a/quickpeep_raker/src/bin/qp-seedrake.rs b/quickpeep_raker/src/bin/qp-seedrake.rs index e4e4a49..8566a4c 100644 --- a/quickpeep_raker/src/bin/qp-seedrake.rs +++ b/quickpeep_raker/src/bin/qp-seedrake.rs @@ -84,7 +84,7 @@ pub async fn main() -> anyhow::Result<()> { pub struct Seed { url: UrlOrUrlPattern, - // TODO These make more sense at the indexer stage. tags: BTreeSet, + // TODO(later) These make more sense at the indexer stage. tags: BTreeSet, } /// Either a URL or a URL prefix. @@ -237,6 +237,10 @@ async fn import_and_flush_batch( let mut allowed_domain_record = DirtyTracker::new( allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()), ); + if is_domain_new { + // Mark it as dirty + let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut(); + } // Register the domain. This is a no-op if it's already active or backing off. txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?; diff --git a/quickpeep_raker/src/raking.rs b/quickpeep_raker/src/raking.rs index 0287384..24d27d4 100644 --- a/quickpeep_raker/src/raking.rs +++ b/quickpeep_raker/src/raking.rs @@ -361,7 +361,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result> { } else { continue; }; - let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...? + let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...? let last_changed = entry.updated.or(entry.published); @@ -372,7 +372,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result> { }); } - // TODO paginated feeds (e.g. JSON Feed next_url) + // TODO(feature) paginated feeds (e.g. JSON Feed next_url) Ok(urls) } diff --git a/quickpeep_raker/src/raking/analysis.rs b/quickpeep_raker/src/raking/analysis.rs index eb4a61e..d8d04a1 100644 --- a/quickpeep_raker/src/raking/analysis.rs +++ b/quickpeep_raker/src/raking/analysis.rs @@ -126,7 +126,7 @@ pub fn guess_document_language(text: &str) -> Option { .map(|lang: Language| lang.iso_code_639_1().to_string()) } -// TODO this isn't particularly efficient. Probably want a trie if it's important... +// TODO(perf) this isn't particularly efficient. Probably want a trie if it's important... pub struct IpSet { ips: BTreeSet, } diff --git a/quickpeep_raker/src/raking/page_extraction.rs b/quickpeep_raker/src/raking/page_extraction.rs index 71ce17b..54baa0f 100644 --- a/quickpeep_raker/src/raking/page_extraction.rs +++ b/quickpeep_raker/src/raking/page_extraction.rs @@ -261,7 +261,7 @@ impl PageExtractionServiceInternal { document.body_remainder = DenseTree::from_body(root_node.clone()); document.body_content = DenseTree::from_body(article_node); } else { - todo!() + todo!() // NEEDED } Ok(ExtractedPage::Success { diff --git a/quickpeep_raker/src/raking/task.rs b/quickpeep_raker/src/raking/task.rs index c54f5dc..c3b2682 100644 --- a/quickpeep_raker/src/raking/task.rs +++ b/quickpeep_raker/src/raking/task.rs @@ -94,13 +94,13 @@ impl TaskContext { self.process_domain(domain).await?; } else { // Already in use; need to pick again - // TODO be smarter about this. + // TODO(perf) be smarter about this. tokio::time::sleep(Duration::from_secs(60)).await; } } None => { // No domains left! Woot. - // TODO come up with some stopping conditions + // TODO(perf, feature) come up with some stopping conditions tokio::time::sleep(Duration::from_secs(60)).await; } } @@ -143,8 +143,8 @@ impl TaskContext { return Ok(false); } - // TODO delete the domain from the store - todo!(); + // TODO(needed) delete the domain from the store + todo!(); // NEEDED txn.commit()?; Ok(true) @@ -366,12 +366,13 @@ impl TaskContext { Ok(NextAction::Continue) } RakeOutcome::TemporaryFailure(failure) => { - // TODO do we want to log this somewhere? + // TODO(future) do we want to log this somewhere? + // or at least a metric let domain = get_reduced_domain(url)?; let url = url.clone(); - // TODO add 1.1× the previous backoff, if there was one. + // TODO(feature) add 1.1× the previous backoff, if there was one. let new_backoff = failure.backoff_sec; let domain = domain.into_owned(); diff --git a/quickpeep_raker/src/storage.rs b/quickpeep_raker/src/storage.rs index 59a0a79..6b56af8 100644 --- a/quickpeep_raker/src/storage.rs +++ b/quickpeep_raker/src/storage.rs @@ -178,7 +178,7 @@ impl RakerStore { F: FnOnce(RakerTxn<'_, RW>) -> anyhow::Result + Send + 'static, R: Send + 'static, { - // TODO consider adding a lock here to prevent all the async executors getting stuck here... + // TODO(robustness) consider adding a lock here to prevent all the async executors getting stuck here... let this = self.clone(); Ok(tokio::task::spawn_blocking(move || -> anyhow::Result { let txn = this.rw_txn()?; diff --git a/quickpeep_raker/src/storage/records.rs b/quickpeep_raker/src/storage/records.rs index 745e219..2cafdbe 100644 --- a/quickpeep_raker/src/storage/records.rs +++ b/quickpeep_raker/src/storage/records.rs @@ -20,7 +20,7 @@ pub struct UrlVisitedRecord { #[derive(Clone, Debug, Deserialize, Serialize)] pub struct QueueUrlRecord { - pub intent: RakeIntent, // TODO + pub intent: RakeIntent, // TODO CONSIDER } #[derive(Clone, Debug, Deserialize, Serialize)]