Sort through TODO items

rei/raker_storage
Olivier 'reivilibre' 2022-03-20 22:23:12 +00:00
parent 120702ce0e
commit 6d109632a3
10 changed files with 35 additions and 34 deletions

View File

@@ -272,7 +272,7 @@ impl DenseTreeBuilder {
self.preceding_newlines = 0;
}
"img" => {
// TODO Decide if this is worth the space...
// TODO(future) Decide if this is worth the space...
let attrs = element.attributes.borrow();
let src = attrs.get("src").unwrap_or("").to_owned();

View File

@@ -26,17 +26,17 @@ const FLAG_WEIGHT_CLASSES: u32 = 0x2;
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
const READABILITY_SCORE: &'static str = "readability-score";
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const PHRASING_ELEMS: [&str; 39] = [
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
"sub", "sup", "textarea", "time", "var", "wbr",
];
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
"align",
@@ -54,7 +54,7 @@ const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
];
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
// TODO: Change to HashSet
// TODO(perf): Change to HashSet
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
pub mod regexes;
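
A note on the recurring TODO(perf) markers above: these arrays are only ever used for membership tests, so a lazily built HashSet would turn each linear scan into an O(1) lookup. A minimal sketch for one of them, assuming the once_cell crate (any lazy-initialisation mechanism would do; only a few entries shown):

    use once_cell::sync::Lazy;
    use std::collections::HashSet;

    // Built once on first use; lookups no longer scan all 39 strings.
    static PHRASING_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
        ["abbr", "audio", "b", "bdo", "br", /* ...remaining entries... */ "wbr"]
            .into_iter()
            .collect()
    });

    fn is_phrasing(tag_name: &str) -> bool {
        PHRASING_ELEMS.contains(tag_name)
    }
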
@@ -168,7 +168,7 @@ impl Readability {
continue;
}
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
// TODO: Fix this to have a better way of extracting nodes that are elements
// TODO(style): Fix this to have a better way of extracting nodes that are elements
while prev_elem.as_element().is_none() {
match prev_elem.previous_sibling() {
Some(new_prev) => prev_elem = new_prev,
@@ -389,7 +389,7 @@ impl Readability {
/// The must_be_element argument ensures the next element is actually an element node.
/// This is likely to be factored out into a new function.
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
// TODO: Could probably be refactored to use the elements method
// TODO(style): Could probably be refactored to use the elements method
let mut node_ref = node_ref;
while node_ref.is_some() {
match node_ref.as_ref().unwrap().data() {
@@ -544,7 +544,7 @@ impl Readability {
/// Converts some of the common HTML entities in string to their corresponding characters.
fn unescape_html_entities(value: &mut String) {
if !value.is_empty() {
// TODO: Extract this
// TODO(style): Extract this
let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
html_escape_map.insert("lt", "<");
html_escape_map.insert("gt", ">");
@@ -637,7 +637,7 @@ impl Readability {
/// Removes the class="" attribute from every element in the given subtree, except those that
/// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
fn clean_classes(&mut self) {
// TODO: This should be accessed from Self
// TODO(style): This should be accessed from Self
let classes_to_preserve: HashSet<&str> = HashSet::new();
if let Some(article_node) = &mut self.article_node {
for elem in article_node.inclusive_descendants().elements() {
@@ -790,7 +790,7 @@ impl Readability {
/// Run any post-process modifications to article content as necessary.
fn post_process_content(&mut self, url: &str) {
self.fix_relative_uris(url);
// TODO: Add flag check
// TODO(style): Add flag check
self.clean_classes();
self.clean_readability_attrs();
}
@@ -1041,7 +1041,7 @@ impl Readability {
/// Determine whether the element has any child block-level elements.
fn has_child_block_element(node_ref: &NodeRef) -> bool {
// TODO: Refer to a static HashSet
// TODO(perf): Refer to a static HashSet
let block_level_elems: [&str; 32] = [
"address",
"article",
@@ -1622,13 +1622,13 @@ impl Readability {
let page = self.root_node.select_first("body");
if page.is_err() {
bail!("Document has no <body>");
// TODO return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
// TODO(style) return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
}
let page = page.unwrap();
let mut attempts: Vec<ExtractAttempt> = Vec::new();
// var pageCacheHtml = page.innerHTML;
//TODO: Add page cache
//TODO(???): Add page cache
loop {
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
@@ -2197,7 +2197,7 @@ mod test {
use kuchiki::traits::*;
use kuchiki::NodeRef;
// TODO: Refactor not to use test file possibly
// TODO(style): Refactor not to use test file possibly
const TEST_HTML: &'static str = include_str!("../test_html/simple.html");
#[test]
@@ -3867,7 +3867,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
#[test]
fn test_clean_classes() {
// TODO: This test will later be edited to ensure it checks to only remove certain classes
// TODO(task): This test will later be edited to ensure it checks to only remove certain classes
let html_str = r#"
<!DOCTYPE html>
<html>

View File

@@ -42,14 +42,14 @@ pub async fn main() -> anyhow::Result<()> {
let client = reqwest::ClientBuilder::new()
.timeout(TIME_LIMIT)
.default_headers(header_map)
// TODO We want to handle redirects ourselves so we can track them...
// We want to handle redirects ourselves so we can track them...
.redirect(Policy::none())
.build()?;
let mut adblock_engines = Vec::new();
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
// TODO Don't hardcode these paths in quite as bad a way...
// TODO(not important) Don't hardcode these paths in quite as bad a way...
let path = PathBuf::from(format!("./data/{}.adblock", name));
if !path.exists() {
warn!("Missing adblock rules: {:?}.", path);
@@ -94,16 +94,12 @@ pub async fn main() -> anyhow::Result<()> {
}
RakeOutcome::RakedFeed(feed) => {
green_ln!("Feed");
// TODO
println!();
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
print_references(&refs);
}
RakeOutcome::RakedSitemap(sitemap) => {
green_ln!("Sitemap");
// TODO
println!();
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
print_references(&refs);

View File

@@ -84,7 +84,7 @@ pub async fn main() -> anyhow::Result<()> {
pub struct Seed {
url: UrlOrUrlPattern,
// TODO These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
}
/// Either a URL or a URL prefix.
@@ -237,6 +237,10 @@ async fn import_and_flush_batch(
let mut allowed_domain_record = DirtyTracker::new(
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
);
if is_domain_new {
// Mark it as dirty
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
}
// Register the domain. This is a no-op if it's already active or backing off.
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
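
The discarded borrow_mut() above is there purely for its side effect. Reconstructing from this hunk alone (names and details assumed), DirtyTracker presumably behaves something like this, with any mutable borrow conservatively marking the value as needing a flush:

    struct DirtyTracker<T> {
        inner: T,
        dirty: bool,
    }

    impl<T> DirtyTracker<T> {
        fn new(inner: T) -> Self {
            Self { inner, dirty: false }
        }

        /// A mutable borrow *may* mutate, so assume it did.
        fn borrow_mut(&mut self) -> &mut T {
            self.dirty = true;
            &mut self.inner
        }

        fn is_dirty(&self) -> bool {
            self.dirty
        }
    }
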

View File

@@ -361,7 +361,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
} else {
continue;
};
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...?
let last_changed = entry.updated.or(entry.published);
@@ -372,7 +372,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
});
}
// TODO paginated feeds (e.g. JSON Feed next_url)
// TODO(feature) paginated feeds (e.g. JSON Feed next_url)
Ok(urls)
}
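
The pagination TODO(feature) amounts to looping while the feed advertises a next page. A rough shape, where FeedPage and fetch_page are hypothetical stand-ins for whatever the feed parser actually exposes:

    use url::Url;

    struct FeedPage {
        urls: Vec<Url>,
        next_url: Option<Url>, // e.g. JSON Feed's next_url field
    }

    fn rake_feed_paginated(
        mut page_url: Url,
        fetch_page: impl Fn(&Url) -> anyhow::Result<FeedPage>,
    ) -> anyhow::Result<Vec<Url>> {
        let mut all = Vec::new();
        loop {
            let page = fetch_page(&page_url)?;
            all.extend(page.urls);
            match page.next_url {
                // Guard against a page that points at itself.
                Some(next) if next != page_url => page_url = next,
                _ => break,
            }
        }
        Ok(all)
    }
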

View File

@@ -126,7 +126,7 @@ pub fn guess_document_language(text: &str) -> Option<String> {
.map(|lang: Language| lang.iso_code_639_1().to_string())
}
// TODO this isn't particularly efficient. Probably want a trie if it's important...
// TODO(perf) this isn't particularly efficient. Probably want a trie if it's important...
pub struct IpSet {
ips: BTreeSet<IpNetwork>,
}
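
For context on that TODO(perf): with a BTreeSet, containment is a scan over every stored network, as in the sketch below (assuming the ipnetwork crate provides IpNetwork; the real dependency may differ). A bitwise prefix trie would make the lookup proportional to the address length rather than to the number of networks:

    use ipnetwork::IpNetwork; // assumed; the hunk only shows the type name
    use std::collections::BTreeSet;
    use std::net::IpAddr;

    pub struct IpSet {
        ips: BTreeSet<IpNetwork>,
    }

    impl IpSet {
        /// The linear scan the TODO is complaining about: O(number of networks).
        pub fn contains(&self, addr: IpAddr) -> bool {
            self.ips.iter().any(|net| net.contains(addr))
        }
    }
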

View File

@@ -261,7 +261,7 @@ impl PageExtractionServiceInternal {
document.body_remainder = DenseTree::from_body(root_node.clone());
document.body_content = DenseTree::from_body(article_node);
} else {
todo!()
todo!() // NEEDED
}
Ok(ExtractedPage::Success {

View File

@@ -94,13 +94,13 @@ impl TaskContext {
self.process_domain(domain).await?;
} else {
// Already in use; need to pick again
// TODO be smarter about this.
// TODO(perf) be smarter about this.
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
None => {
// No domains left! Woot.
// TODO come up with some stopping conditions
// TODO(perf, feature) come up with some stopping conditions
tokio::time::sleep(Duration::from_secs(60)).await;
}
}
@@ -143,8 +143,8 @@ impl TaskContext {
return Ok(false);
}
// TODO delete the domain from the store
todo!();
// TODO(needed) delete the domain from the store
todo!(); // NEEDED
txn.commit()?;
Ok(true)
@@ -366,12 +366,13 @@ impl TaskContext {
Ok(NextAction::Continue)
}
RakeOutcome::TemporaryFailure(failure) => {
// TODO do we want to log this somewhere?
// TODO(future) do we want to log this somewhere?
// or at least a metric
let domain = get_reduced_domain(url)?;
let url = url.clone();
// TODO add 1.1× the previous backoff, if there was one.
// TODO(feature) add 1.1× the previous backoff, if there was one.
let new_backoff = failure.backoff_sec;
let domain = domain.into_owned();
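
The backoff TODO(feature) is nearly a one-liner once the previous backoff can be read from the store. A hypothetical helper (the u64 seconds type is assumed) that never undercuts the freshly suggested delay:

    /// Sketch only: grow the retry delay 1.1× across consecutive failures.
    /// `prev` would come from the existing backing-off record, if any.
    fn next_backoff_sec(suggested: u64, prev: Option<u64>) -> u64 {
        match prev {
            Some(p) => suggested.max((p as f64 * 1.1).ceil() as u64),
            None => suggested,
        }
    }
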

View File

@@ -178,7 +178,7 @@ impl RakerStore {
F: FnOnce(RakerTxn<'_, RW>) -> anyhow::Result<R> + Send + 'static,
R: Send + 'static,
{
// TODO consider adding a lock here to prevent all the async executors getting stuck here...
// TODO(robustness) consider adding a lock here to prevent all the async executors getting stuck here...
let this = self.clone();
Ok(tokio::task::spawn_blocking(move || -> anyhow::Result<R> {
let txn = this.rw_txn()?;
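
One way to act on that TODO(robustness) would be to gate entry to spawn_blocking with an async semaphore, so waiting writers queue in the async world instead of each pinning a blocking-pool thread. A sketch under that assumption, using tokio's Semaphore with an arbitrary limit:

    use std::sync::Arc;
    use tokio::sync::Semaphore;

    struct WriteGate {
        permits: Arc<Semaphore>,
    }

    impl WriteGate {
        fn new(max_waiting_writers: usize) -> Self {
            Self { permits: Arc::new(Semaphore::new(max_waiting_writers)) }
        }

        async fn write<F, R>(&self, f: F) -> anyhow::Result<R>
        where
            F: FnOnce() -> anyhow::Result<R> + Send + 'static,
            R: Send + 'static,
        {
            // Wait asynchronously *before* occupying a blocking-pool thread.
            let _permit = self.permits.acquire().await?;
            tokio::task::spawn_blocking(f).await?
        }
    }
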

View File

@@ -20,7 +20,7 @@ pub struct UrlVisitedRecord {
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct QueueUrlRecord {
pub intent: RakeIntent, // TODO
pub intent: RakeIntent, // TODO CONSIDER
}
#[derive(Clone, Debug, Deserialize, Serialize)]