Add Rust number parsing implementation
author Jakob Cornell <jakob+gpg@jcornell.net>
Sat, 23 Sep 2023 04:26:06 +0000 (23:26 -0500)
committer Jakob Cornell <jakob+gpg@jcornell.net>
Tue, 26 Sep 2023 22:38:09 +0000 (17:38 -0500)
This includes ports of tests that don't require HTML parsing; the rest are coming later.

sharedmodel/Cargo.toml
sharedmodel/src/lib.rs
sharedmodel/src/update_parse.rs [new file with mode: 0644]

index 5113d4a10e88370e17105c345558a8c3e5f601b6..730305955f73a6b7bafe14597794aa40f730ffb8 100644 (file)
@@ -18,3 +18,7 @@ edition = "2018"
 uuid = "0.8"
 html5ever = "0.25"
 string_cache = "0.8"
+anyhow = "1.0.71"
+
+# no need for Unicode support
+fancy-regex = { version = "0.11.0", default-features = false, features = ["perf"] }
index c0430fd65bee994e9eb5dac793c580fb24794081..a1dc2f4dd9e7537f9bbb1ef96f979a6ec48a76b1 100644 (file)
@@ -1,6 +1,7 @@
 mod live_event_buffer;
 mod html;
 mod html_html5ever;
+mod update_parse;
 
 pub use live_event_buffer::LiveUpdateBuffer;
 pub use html::{HtmlNode, HtmlElement};
diff --git a/sharedmodel/src/update_parse.rs b/sharedmodel/src/update_parse.rs
new file mode 100644 (file)
index 0000000..9c0c8eb
--- /dev/null
@@ -0,0 +1,341 @@
+use std::borrow::Borrow;
+use std::collections::{BTreeSet, VecDeque};
+use std::slice;
+
+use anyhow::Context;
+
+use crate::html::{HtmlElement, HtmlNode};
+
+/// Integer type used for counts. It is signed because the parser below accepts a leading `-`
+/// and produces negative numbers.
+type Count = i64;
+
+/// Command recognized by `parse_command`: `Reset` for the line "/u/<bot_user> reset", `Report`
+/// for the lines "sidebar count" and "current count" (all compared in lowercase).
+#[derive(Debug, PartialEq)]
+pub enum Command { Reset, Report }
+
+pub struct ParsedUpdate {
+       number: Option<Count>,
+       command: Option<Command>,
+       count_attempt: bool,
+       deletable: bool,
+}
+
+pub enum ParseError {
+       InvalidElement(String),
+}
+
+fn parse_command(line: &str, bot_user: &str) -> Option<Command> {
+       let lower_line = line.to_lowercase();
+       if lower_line == format!("/u/{} reset", bot_user) {
+               return Some(Command::Reset);
+       } else if lower_line == "sidebar count" || lower_line == "current count" {
+               return Some(Command::Report);
+       } else {
+               return None;
+       }
+}
+
+/// `curr_count` is the next number up, one more than the last count
+pub fn parse_update(
+       body: impl HtmlElement,
+       curr_count: Option<Count>,
+       bot_user: &str
+) -> Result<ParsedUpdate, ParseError> {
+
+       // TextNode is HtmlElementImpl::Node containing HtmlNode::Text
+       enum Text<TextNode> {
+               FromTree(TextNode),
+               Other(String),
+       }
+
+       // Node is HtmlElementImpl::Node
+       enum WorklistEntry<Node> {
+               Space,
+               NewLine,
+               Node(Node),
+               Text(String),
+       }
+
+       // flatten the update content to plain text
+       let mut worklist: VecDeque<_> = body.contents().map(WorklistEntry::Node).collect();
+       let mut out = vec![vec![]];
+
+       while let Some(entry) = worklist.pop_back() {
+               match entry {
+                       WorklistEntry::Space => {
+                               out.last_mut().unwrap().push(Text::Other(" ".to_string()));
+                       },
+                       WorklistEntry::NewLine => {
+                               if !out.last().unwrap().is_empty() {
+                                       out.push(vec![]);
+                               }
+                       },
+                       WorklistEntry::Node(node) => match node {
+                               HtmlNode::Element(element) => match element.name() {
+                                       "br" | "hr" => {
+                                               if !out.last().unwrap().is_empty() {
+                                                       out.push(vec![]);
+                                               }
+                                       },
+                                       "em" | "strong" | "del" | "span" | "sup" | "code" | "a" | "th" | "td"
+                                       | "ul" | "ol" | "table" | "thead" | "tbody" => {
+                                               worklist.extend(element.contents().rev().map(WorklistEntry::Node));
+                                       },
+                                       "li" | "p" | "div" | "blockquote" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
+                                               worklist.push_back(WorklistEntry::NewLine);
+                                               worklist.extend(element.contents().rev().map(WorklistEntry::Node));
+                                               worklist.push_back(WorklistEntry::NewLine);
+                                       },
+                                       "pre" => {
+                                               out.extend(
+                                                       element.text().as_ref()
+                                                       .split('\n').map(str::to_string)
+                                                       .map(Text::Other).map(|s| vec![s]));
+                                               out.push(vec![]);
+                                       },
+                                       "tr" => {
+                                               worklist.push_back(WorklistEntry::NewLine);
+
+                                               // intersperse space markers
+                                               for el in element.contents().rev() {
+                                                       worklist.push_back(WorklistEntry::Node(el));
+                                                       worklist.push_back(WorklistEntry::Space);
+                                               }
+                                               worklist.pop_back();
+
+                                               worklist.push_back(WorklistEntry::NewLine);
+                                       },
+                                       name => return Err(ParseError::InvalidElement(name.to_string())),
+                               },
+                               HtmlNode::Text(_) => out.last_mut().unwrap().push(Text::FromTree(node)),
+                       },
+                       WorklistEntry::Text(t) => out.last_mut().unwrap().push(Text::Other(t)),
+               }
+       }
+
+       let pre_strip_lines = out.iter()
+               .filter(|line| !line.is_empty())
+               .map(|line|
+                       line.iter().map(|text|
+                               match text {
+                                       Text::FromTree(node) => match node {
+                                               HtmlNode::Text(t) => t.as_ref(),
+                                               _ => panic!("element in out vec"),
+                                       },
+                                       Text::Other(t) => t,
+                       }).collect::<Vec<_>>()
+                       .join("")
+       );
+
+       // normalize whitespace according to HTML rendering rules
+       // https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace#explanation
+       let stripped_lines: Vec<_> = pre_strip_lines.map(|line|
+               fancy_regex::Regex::new(" +").unwrap().replace_all(
+                       &line.replace('\t', " ").replace('\n', " "),
+                       " ")
+               .trim().to_string()
+       ).collect();
+
+       Ok(parse_from_lines(&stripped_lines, curr_count, bot_user))
+}
+
+fn parse_from_lines(lines: &[impl Borrow<str>], curr_count: Option<Count>, bot_user: &str) -> ParsedUpdate {
+       let command = lines.iter().find_map(|l| parse_command(l.borrow(), bot_user));
+
+       let number_out: Option<Count>;
+       let mut count_attempt;
+       let mut deletable;
+
+       if lines.is_empty() {
+               number_out = None;
+               count_attempt = false;
+               deletable = true;
+       } else {
+               // look for groups of ASCII digits (as many as possible) separated by a uniform separator
+               // from the valid set
+               let pattern = fancy_regex::Regex::new(
+                       "(?-u)^(?P<v>v)?(?P<neg>-)?(?P<num>\\d+((?P<sep>[,. \u{2009}]|, )\\d+((?P=sep)\\d+)*)?)"
+               ).unwrap();
+               let match_result = pattern
+                       .captures(lines[0].borrow())
+                       .with_context(|| format!("parsing line {:?}", lines[0].borrow()))
+                       .unwrap();
+               if let Some(ref match_) = match_result {
+                       let sep_opt = match_.name("sep").map(|m| m.as_str());
+                       let post = &lines[0].borrow()[match_.get(0).unwrap().end()..];
+
+                       // strip leading zeros
+                       let (raw_digits, zeros) = {
+                               let mut zeros = false;
+                               let mut raw_digits = &match_["num"];
+                               while raw_digits.len() > 1 && raw_digits.starts_with('0') {
+                                       zeros = true;
+                                       raw_digits = raw_digits.strip_prefix('0').unwrap();
+                                       if let Some(ref sep) = sep_opt {
+                                               raw_digits = raw_digits.strip_prefix(sep).unwrap_or(raw_digits);
+                                       }
+                               }
+                               (raw_digits, zeros)
+                       };
+
+                       let parts = match sep_opt {
+                               Some(ref sep) => raw_digits.split(sep).collect(),
+                               None => vec![raw_digits],
+                       };
+                       // TODO look into cleaner way to handle parts below (char slices?)
+                       let raw_parts: Vec<&[u8]> = parts.iter().cloned().map(str::as_bytes).collect();
+
+                       let all_parts_valid = sep_opt.is_none()
+                               || ((1..=3).contains(&raw_parts[0].len()) && raw_parts[1..].iter().all(|p| p.len() == 3));
+                       let mut lone = lines.len() == 1 && post.chars().all(char::is_whitespace);
+
+                       let has_v = match_.name("v").is_some();
+                       let typo =
+                               // failed paste of leading digits
+                               (has_v && raw_parts.len() == 1 && raw_parts[0].len() <= 2)
+                               || (has_v && all_parts_valid)  // v followed by count
+                               || match curr_count {
+                                       Some(count_val) => {
+                                               count_val.abs() >= 100 && match_.name("neg").is_some() == (count_val < 0) && {
+                                                       let goal_string = count_val.abs().to_string();
+                                                       let goal_parts: Vec<&[u8]> = goal_string.as_bytes().rchunks(3).rev().collect();
+                                                       let (&last, front) = goal_parts.split_last().unwrap();
+                                                       let partials: Vec<Vec<&[u8]>> = vec![
+                                                               // missing last digit
+                                                               front.iter().cloned().chain([last.get(..last.len() - 1).unwrap()]).collect(),
+
+                                                               // missing last two digits
+                                                               front.iter().cloned().chain([last.get(..last.len() - 2).unwrap()]).collect(),
+
+                                                               // missing second-last digit
+                                                               front.iter().cloned().chain([
+                                                                       last.get(..last.len() - 2).unwrap(),
+                                                                       slice::from_ref(&last[last.len() - 1]),
+                                                               ]).collect(),
+                                                       ];
+
+                                                       // missing any of last two digits or double paste
+                                                       partials.contains(&raw_parts)
+                                                               || partials.iter().any(
+                                                                       |p| raw_parts == p[..p.len() - 1].iter().cloned()
+                                                                               .chain([p[p.len() - 1], goal_parts[0]])
+                                                                               .chain(goal_parts[1..].iter().cloned())
+                                                                               .collect::<Vec<_>>()
+                                                               )
+                                               }
+                                       },
+                                       None => false
+                               };
+                       if has_v || zeros || typo || (parts == ["0"] && match_.name("neg").is_some()) {
+                               number_out = None;
+                               count_attempt = true;
+                               deletable = lone;
+                       } else {
+                               let mut groups_okay = all_parts_valid;
+                               let mut use_parts = &parts;
+                               if let Some(count_val) = curr_count {
+                                       if sep_opt.is_some() && sep_opt.unwrap().chars().all(char::is_whitespace) {
+                                               // Presume that the intended count consists of as many valid digit groups as
+                                               // necessary to match the number of digits in the expected count, if
+                                               // possible.
+                                               let digit_count = format!("{}", count_val.abs()).len();
+                                               let mut use_parts = vec![];
+                                               let mut accum = 0;
+                                               for (i, part) in parts.iter().cloned().enumerate() {
+                                                       let part_valid = if i == 0 { part.len() <= 3 } else { part.len() == 3 };
+                                                       if part_valid && accum < digit_count {
+                                                               use_parts.push(part);
+                                                               accum += part.len();
+                                                       } else {
+                                                               break;
+                                                       }
+                                               }
+
+                                               // Could still be a no-separator count with some extra digit groups on the
+                                               // same line.
+                                               if use_parts.is_empty() {
+                                                       use_parts = vec![parts[0]];
+                                               }
+
+                                               lone = lone && use_parts.len() == parts.len();
+                                       }
+                               }
+
+                               let digits = use_parts.join("");
+                               let number = {
+                                       let magnitude: Count = digits.parse().unwrap();
+                                       if match_.name("neg").is_some() { -magnitude } else { magnitude }
+                               };
+                               let special = match curr_count {
+                                       Some(count_val) => (number - count_val).abs() <= 25 && is_special(number),
+                                       None => false,
+                               };
+                               if groups_okay {
+                                       deletable = lone && !special;
+                                       if use_parts.len() == parts.len()
+                                               && !post.is_empty() && !post.chars().next().unwrap().is_whitespace() {
+                                               count_attempt = match curr_count {
+                                                       Some(count_val) => (number - count_val).abs() <= 25,
+                                                       None => false,
+                                               };
+                                               number_out = None;
+                                       } else {
+                                               count_attempt = true;
+                                               number_out = Some(number);
+                                       }
+                               } else {
+                                       number_out = None;
+                                       count_attempt = true;
+                                       deletable = false;
+                               }
+                       }
+
+                       // TODO consider integrating this into logic above
+                       if lines[0].borrow().chars().next().unwrap().is_digit(10) && !count_attempt {
+                               count_attempt = true;
+                               deletable = false;
+                       }
+               } else {
+                       // no count attempt found
+                       number_out = None;
+                       count_attempt = false;
+                       deletable = false;
+               }
+       }
+       ParsedUpdate {
+               number: number_out,
+               command: command,
+               count_attempt: count_attempt,
+               deletable: deletable,
+       }
+}
+
+fn is_special(count: Count) -> bool {
+       let count_str = format!("{}", count);
+
+       [0, 1, 333, 999].contains(&(count % 1000))
+               // palindrome
+               || (count > 10_000_000 && count_str.chars().rev().collect::<String>() == count_str)
+               // repeated sequence
+               || (1..=count_str.len() / 2).any(
+                       |len| count_str.len() % len == 0
+                               && count_str.as_bytes().chunks(len).collect::<BTreeSet<_>>().len() == 1)
+}
+
+// Unit tests for the helpers that don't need an HTML tree.
+#[cfg(test)]
+mod tests {
+       use super::*;
+
+       // the line is matched case-insensitively; unknown lines give no command
+       #[test]
+       fn test_parse_command() {
+               assert_eq!(parse_command("/U/foo reset", "foo"), Some(Command::Reset));
+               assert_eq!(parse_command("current count", ""), Some(Command::Report));
+               assert_eq!(parse_command("foo", ""), None);
+       }
+
+       // one positive example for the 333 ending, the 001 ending and the repeated-sequence rule,
+       // and one number that matches none of the rules
+       #[test]
+       fn test_is_special() {
+               assert!(is_special(10_333));
+               assert!(is_special(10_000_001));
+               assert!(is_special(12341234));
+               assert!(!is_special(1937592));
+       }
+}