Sort through TODO items
This commit is contained in:
parent
120702ce0e
commit
6d109632a3
@ -272,7 +272,7 @@ impl DenseTreeBuilder {
|
||||
self.preceding_newlines = 0;
|
||||
}
|
||||
"img" => {
|
||||
// TODO Decide if this is worth the space...
|
||||
// TODO(future) Decide if this is worth the space...
|
||||
let attrs = element.attributes.borrow();
|
||||
let src = attrs.get("src").unwrap_or("").to_owned();
|
||||
|
||||
|
@ -26,17 +26,17 @@ const FLAG_WEIGHT_CLASSES: u32 = 0x2;
|
||||
const FLAG_CLEAN_CONDITIONALLY: u32 = 0x4;
|
||||
const READABILITY_SCORE: &'static str = "readability-score";
|
||||
const HTML_NS: &'static str = "http://www.w3.org/1999/xhtml";
|
||||
// TODO: Change to HashSet
|
||||
// TODO(perf): Change to HashSet
|
||||
const PHRASING_ELEMS: [&str; 39] = [
|
||||
"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data", "datalist", "dfn", "em",
|
||||
"embed", "i", "img", "input", "kbd", "label", "mark", "math", "meter", "noscript", "object",
|
||||
"output", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong",
|
||||
"sub", "sup", "textarea", "time", "var", "wbr",
|
||||
];
|
||||
// TODO: Change to HashSet
|
||||
// TODO(perf): Change to HashSet
|
||||
const DEFAULT_TAGS_TO_SCORE: [&str; 9] =
|
||||
["section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"];
|
||||
// TODO: Change to HashSet
|
||||
// TODO(perf): Change to HashSet
|
||||
const ALTER_TO_DIV_EXCEPTIONS: [&str; 4] = ["div", "article", "section", "p"];
|
||||
const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
|
||||
"align",
|
||||
@ -54,7 +54,7 @@ const PRESENTATIONAL_ATTRIBUTES: [&str; 12] = [
|
||||
];
|
||||
|
||||
const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];
|
||||
// TODO: Change to HashSet
|
||||
// TODO(perf): Change to HashSet
|
||||
const DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [&str; 5] = ["table", "th", "td", "hr", "pre"];
|
||||
|
||||
pub mod regexes;
|
||||
@ -168,7 +168,7 @@ impl Readability {
|
||||
continue;
|
||||
}
|
||||
if let Some(mut prev_elem) = noscript.as_node().previous_sibling() {
|
||||
// TODO: Fix this to have a better way of extracting nodes that are elements
|
||||
// TODO(style): Fix this to have a better way of extracting nodes that are elements
|
||||
while prev_elem.as_element().is_none() {
|
||||
match prev_elem.previous_sibling() {
|
||||
Some(new_prev) => prev_elem = new_prev,
|
||||
@ -389,7 +389,7 @@ impl Readability {
|
||||
/// The must_be_element argument ensure the next element is actually an element node.
|
||||
/// This is likely to factored out into a new function.
|
||||
fn next_element(node_ref: Option<NodeRef>, must_be_element: bool) -> Option<NodeRef> {
|
||||
// TODO: Could probably be refactored to use the elements method
|
||||
// TODO(style): Could probably be refactored to use the elements method
|
||||
let mut node_ref = node_ref;
|
||||
while node_ref.is_some() {
|
||||
match node_ref.as_ref().unwrap().data() {
|
||||
@ -544,7 +544,7 @@ impl Readability {
|
||||
/// Converts some of the common HTML entities in string to their corresponding characters.
|
||||
fn unescape_html_entities(value: &mut String) {
|
||||
if !value.is_empty() {
|
||||
// TODO: Extract this
|
||||
// TODO(style): Extract this
|
||||
let mut html_escape_map: HashMap<&str, &str> = HashMap::new();
|
||||
html_escape_map.insert("lt", "<");
|
||||
html_escape_map.insert("gt", ">");
|
||||
@ -637,7 +637,7 @@ impl Readability {
|
||||
/// Removes the class="" attribute from every element in the given subtree, except those that
|
||||
/// match CLASSES_TO_PRESERVE and the classesToPreserve array from the options object.
|
||||
fn clean_classes(&mut self) {
|
||||
// TODO: This should accessed from Self
|
||||
// TODO(style): This should accessed from Self
|
||||
let classes_to_preserve: HashSet<&str> = HashSet::new();
|
||||
if let Some(article_node) = &mut self.article_node {
|
||||
for elem in article_node.inclusive_descendants().elements() {
|
||||
@ -790,7 +790,7 @@ impl Readability {
|
||||
/// Run any post-process modifications to article content as necessary.
|
||||
fn post_process_content(&mut self, url: &str) {
|
||||
self.fix_relative_uris(url);
|
||||
// TODO: Add flag check
|
||||
// TODO(style): Add flag check
|
||||
self.clean_classes();
|
||||
self.clean_readability_attrs();
|
||||
}
|
||||
@ -1041,7 +1041,7 @@ impl Readability {
|
||||
|
||||
/// Determine whether element has any children block level elements.
|
||||
fn has_child_block_element(node_ref: &NodeRef) -> bool {
|
||||
// TODO: Refer to a static HashSet
|
||||
// TODO(perf): Refer to a static HashSet
|
||||
let block_level_elems: [&str; 32] = [
|
||||
"address",
|
||||
"article",
|
||||
@ -1622,13 +1622,13 @@ impl Readability {
|
||||
let page = self.root_node.select_first("body");
|
||||
if page.is_err() {
|
||||
bail!("Document has no <body>");
|
||||
// TODO return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
|
||||
// TODO(style) return Err(ErrorKind::ReadabilityError("Document has no <body>".into()).into());
|
||||
}
|
||||
let page = page.unwrap();
|
||||
let mut attempts: Vec<ExtractAttempt> = Vec::new();
|
||||
|
||||
// var pageCacheHtml = page.innerHTML;
|
||||
//TODO: Add page cache
|
||||
//TODO(???): Add page cache
|
||||
|
||||
loop {
|
||||
// var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
|
||||
@ -2197,7 +2197,7 @@ mod test {
|
||||
use kuchiki::traits::*;
|
||||
use kuchiki::NodeRef;
|
||||
|
||||
// TODO: Refactor not to use test file possibly
|
||||
// TODO(style): Refactor not to use test file possibly
|
||||
const TEST_HTML: &'static str = include_str!("../test_html/simple.html");
|
||||
|
||||
#[test]
|
||||
@ -3867,7 +3867,7 @@ characters. For that reason, this <p> tag could not be a byline because it's too
|
||||
|
||||
#[test]
|
||||
fn test_clean_classes() {
|
||||
// TODO: This test will later be edited to ensure it checks to only remove certain classes
|
||||
// TODO(task): This test will later be edited to ensure it checks to only remove certain classes
|
||||
let html_str = r#"
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
|
@ -42,14 +42,14 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.timeout(TIME_LIMIT)
|
||||
.default_headers(header_map)
|
||||
// TODO We want to handle redirects ourselves so we can track them...
|
||||
// We want to handle redirects ourselves so we can track them...
|
||||
.redirect(Policy::none())
|
||||
.build()?;
|
||||
|
||||
let mut adblock_engines = Vec::new();
|
||||
|
||||
for (antifeature, name) in &ADBLOCK_FILTER_PATHS {
|
||||
// TODO Don't hardcode these paths in quite as bad a way...
|
||||
// TODO(not important) Don't hardcode these paths in quite as bad a way...
|
||||
let path = PathBuf::from(format!("./data/{}.adblock", name));
|
||||
if !path.exists() {
|
||||
warn!("Missing adblock rules: {:?}.", path);
|
||||
@ -94,16 +94,12 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
RakeOutcome::RakedFeed(feed) => {
|
||||
green_ln!("Feed");
|
||||
// TODO
|
||||
|
||||
println!();
|
||||
let refs = references_from_urlrakes(&feed, ReferenceKind::FeedEntry);
|
||||
print_references(&refs);
|
||||
}
|
||||
RakeOutcome::RakedSitemap(sitemap) => {
|
||||
green_ln!("Sitemap");
|
||||
// TODO
|
||||
|
||||
println!();
|
||||
let refs = references_from_urlrakes(&sitemap, ReferenceKind::SitemapEntry);
|
||||
print_references(&refs);
|
||||
|
@ -84,7 +84,7 @@ pub async fn main() -> anyhow::Result<()> {
|
||||
|
||||
pub struct Seed {
|
||||
url: UrlOrUrlPattern,
|
||||
// TODO These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||
// TODO(later) These make more sense at the indexer stage. tags: BTreeSet<CompactString>,
|
||||
}
|
||||
|
||||
/// Either a URL or a URL prefix.
|
||||
@ -237,6 +237,10 @@ async fn import_and_flush_batch(
|
||||
let mut allowed_domain_record = DirtyTracker::new(
|
||||
allowed_domain_record.unwrap_or_else(|| AllowedDomainRecord::default()),
|
||||
);
|
||||
if is_domain_new {
|
||||
// Mark it as dirty
|
||||
let _: &mut AllowedDomainRecord = allowed_domain_record.borrow_mut();
|
||||
}
|
||||
|
||||
// Register the domain. This is a no-op if it's already active or backing off.
|
||||
txn.insert_active_domain_with_new_raffle_ticket(domain.clone().into_owned())?;
|
||||
|
@ -361,7 +361,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO ignore failure here...?
|
||||
let url = Url::parse(&link.href).context("parsing URL in feed")?; // TODO(robustness) ignore failure here...?
|
||||
|
||||
let last_changed = entry.updated.or(entry.published);
|
||||
|
||||
@ -372,7 +372,7 @@ pub fn rake_feed(content: &[u8], url: &Url) -> anyhow::Result<Vec<UrlRaked>> {
|
||||
});
|
||||
}
|
||||
|
||||
// TODO paginated feeds (e.g. JSON Feed next_url)
|
||||
// TODO(feature) paginated feeds (e.g. JSON Feed next_url)
|
||||
|
||||
Ok(urls)
|
||||
}
|
||||
|
@ -126,7 +126,7 @@ pub fn guess_document_language(text: &str) -> Option<String> {
|
||||
.map(|lang: Language| lang.iso_code_639_1().to_string())
|
||||
}
|
||||
|
||||
// TODO this isn't particularly efficient. Probably want a trie if it's important...
|
||||
// TODO(perf) this isn't particularly efficient. Probably want a trie if it's important...
|
||||
pub struct IpSet {
|
||||
ips: BTreeSet<IpNetwork>,
|
||||
}
|
||||
|
@ -261,7 +261,7 @@ impl PageExtractionServiceInternal {
|
||||
document.body_remainder = DenseTree::from_body(root_node.clone());
|
||||
document.body_content = DenseTree::from_body(article_node);
|
||||
} else {
|
||||
todo!()
|
||||
todo!() // NEEDED
|
||||
}
|
||||
|
||||
Ok(ExtractedPage::Success {
|
||||
|
@ -94,13 +94,13 @@ impl TaskContext {
|
||||
self.process_domain(domain).await?;
|
||||
} else {
|
||||
// Already in use; need to pick again
|
||||
// TODO be smarter about this.
|
||||
// TODO(perf) be smarter about this.
|
||||
tokio::time::sleep(Duration::from_secs(60)).await;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// No domains left! Woot.
|
||||
// TODO come up with some stopping conditions
|
||||
// TODO(perf, feature) come up with some stopping conditions
|
||||
tokio::time::sleep(Duration::from_secs(60)).await;
|
||||
}
|
||||
}
|
||||
@ -143,8 +143,8 @@ impl TaskContext {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// TODO delete the domain from the store
|
||||
todo!();
|
||||
// TODO(needed) delete the domain from the store
|
||||
todo!(); // NEEDED
|
||||
|
||||
txn.commit()?;
|
||||
Ok(true)
|
||||
@ -366,12 +366,13 @@ impl TaskContext {
|
||||
Ok(NextAction::Continue)
|
||||
}
|
||||
RakeOutcome::TemporaryFailure(failure) => {
|
||||
// TODO do we want to log this somewhere?
|
||||
// TODO(future) do we want to log this somewhere?
|
||||
// or at least a metric
|
||||
|
||||
let domain = get_reduced_domain(url)?;
|
||||
let url = url.clone();
|
||||
|
||||
// TODO add 1.1× the previous backoff, if there was one.
|
||||
// TODO(feature) add 1.1× the previous backoff, if there was one.
|
||||
let new_backoff = failure.backoff_sec;
|
||||
let domain = domain.into_owned();
|
||||
|
||||
|
@ -178,7 +178,7 @@ impl RakerStore {
|
||||
F: FnOnce(RakerTxn<'_, RW>) -> anyhow::Result<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
// TODO consider adding a lock here to prevent all the async executors getting stuck here...
|
||||
// TODO(robustness) consider adding a lock here to prevent all the async executors getting stuck here...
|
||||
let this = self.clone();
|
||||
Ok(tokio::task::spawn_blocking(move || -> anyhow::Result<R> {
|
||||
let txn = this.rw_txn()?;
|
||||
|
@ -20,7 +20,7 @@ pub struct UrlVisitedRecord {
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct QueueUrlRecord {
|
||||
pub intent: RakeIntent, // TODO
|
||||
pub intent: RakeIntent, // TODO CONSIDER
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
|
Loading…
Reference in New Issue
Block a user