From: Jakob Cornell Date: Sat, 23 Sep 2023 04:26:06 +0000 (-0500) Subject: Add Rust number parsing implementation X-Git-Url: https://jcornell.net/gitweb/gitweb.cgi?a=commitdiff_plain;h=cf8a86bd3c597eaffa1f00899212f9a67db0ae57;p=counting.git Add Rust number parsing implementation This includes ports of tests that don't require HTML parsing; the rest are coming later. --- diff --git a/sharedmodel/Cargo.toml b/sharedmodel/Cargo.toml index 5113d4a..7303059 100644 --- a/sharedmodel/Cargo.toml +++ b/sharedmodel/Cargo.toml @@ -18,3 +18,7 @@ edition = "2018" uuid = "0.8" html5ever = "0.25" string_cache = "0.8" +anyhow = "1.0.71" + +# no need for Unicode support +fancy-regex = { version = "0.11.0", default-features = false, features = ["perf"] } diff --git a/sharedmodel/src/lib.rs b/sharedmodel/src/lib.rs index c0430fd..a1dc2f4 100644 --- a/sharedmodel/src/lib.rs +++ b/sharedmodel/src/lib.rs @@ -1,6 +1,7 @@ mod live_event_buffer; mod html; mod html_html5ever; +mod update_parse; pub use live_event_buffer::LiveUpdateBuffer; pub use html::{HtmlNode, HtmlElement}; diff --git a/sharedmodel/src/update_parse.rs b/sharedmodel/src/update_parse.rs new file mode 100644 index 0000000..9c0c8eb --- /dev/null +++ b/sharedmodel/src/update_parse.rs @@ -0,0 +1,341 @@ +use std::borrow::Borrow; +use std::collections::{BTreeSet, VecDeque}; +use std::slice; + +use anyhow::Context; + +use crate::html::{HtmlElement, HtmlNode}; + +type Count = i64; + +#[derive(Debug, PartialEq)] +pub enum Command { Reset, Report } + +pub struct ParsedUpdate { + number: Option, + command: Option, + count_attempt: bool, + deletable: bool, +} + +pub enum ParseError { + InvalidElement(String), +} + +fn parse_command(line: &str, bot_user: &str) -> Option { + let lower_line = line.to_lowercase(); + if lower_line == format!("/u/{} reset", bot_user) { + return Some(Command::Reset); + } else if lower_line == "sidebar count" || lower_line == "current count" { + return Some(Command::Report); + } else { + return None; + } +} + +/// `curr_count` is the next number up, one more than the last count +pub fn parse_update( + body: impl HtmlElement, + curr_count: Option, + bot_user: &str +) -> Result { + + // TextNode is HtmlElementImpl::Node containing HtmlNode::Text + enum Text { + FromTree(TextNode), + Other(String), + } + + // Node is HtmlElementImpl::Node + enum WorklistEntry { + Space, + NewLine, + Node(Node), + Text(String), + } + + // flatten the update content to plain text + let mut worklist: VecDeque<_> = body.contents().map(WorklistEntry::Node).collect(); + let mut out = vec![vec![]]; + + while let Some(entry) = worklist.pop_back() { + match entry { + WorklistEntry::Space => { + out.last_mut().unwrap().push(Text::Other(" ".to_string())); + }, + WorklistEntry::NewLine => { + if !out.last().unwrap().is_empty() { + out.push(vec![]); + } + }, + WorklistEntry::Node(node) => match node { + HtmlNode::Element(element) => match element.name() { + "br" | "hr" => { + if !out.last().unwrap().is_empty() { + out.push(vec![]); + } + }, + "em" | "strong" | "del" | "span" | "sup" | "code" | "a" | "th" | "td" + | "ul" | "ol" | "table" | "thead" | "tbody" => { + worklist.extend(element.contents().rev().map(WorklistEntry::Node)); + }, + "li" | "p" | "div" | "blockquote" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => { + worklist.push_back(WorklistEntry::NewLine); + worklist.extend(element.contents().rev().map(WorklistEntry::Node)); + worklist.push_back(WorklistEntry::NewLine); + }, + "pre" => { + out.extend( + element.text().as_ref() + .split('\n').map(str::to_string) + .map(Text::Other).map(|s| vec![s])); + out.push(vec![]); + }, + "tr" => { + worklist.push_back(WorklistEntry::NewLine); + + // intersperse space markers + for el in element.contents().rev() { + worklist.push_back(WorklistEntry::Node(el)); + worklist.push_back(WorklistEntry::Space); + } + worklist.pop_back(); + + worklist.push_back(WorklistEntry::NewLine); + }, + name => return Err(ParseError::InvalidElement(name.to_string())), + }, + HtmlNode::Text(_) => out.last_mut().unwrap().push(Text::FromTree(node)), + }, + WorklistEntry::Text(t) => out.last_mut().unwrap().push(Text::Other(t)), + } + } + + let pre_strip_lines = out.iter() + .filter(|line| !line.is_empty()) + .map(|line| + line.iter().map(|text| + match text { + Text::FromTree(node) => match node { + HtmlNode::Text(t) => t.as_ref(), + _ => panic!("element in out vec"), + }, + Text::Other(t) => t, + }).collect::>() + .join("") + ); + + // normalize whitespace according to HTML rendering rules + // https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace#explanation + let stripped_lines: Vec<_> = pre_strip_lines.map(|line| + fancy_regex::Regex::new(" +").unwrap().replace_all( + &line.replace('\t', " ").replace('\n', " "), + " ") + .trim().to_string() + ).collect(); + + Ok(parse_from_lines(&stripped_lines, curr_count, bot_user)) +} + +fn parse_from_lines(lines: &[impl Borrow], curr_count: Option, bot_user: &str) -> ParsedUpdate { + let command = lines.iter().find_map(|l| parse_command(l.borrow(), bot_user)); + + let number_out: Option; + let mut count_attempt; + let mut deletable; + + if lines.is_empty() { + number_out = None; + count_attempt = false; + deletable = true; + } else { + // look for groups of ASCII digits (as many as possible) separated by a uniform separator + // from the valid set + let pattern = fancy_regex::Regex::new( + "(?-u)^(?Pv)?(?P-)?(?P\\d+((?P[,. \u{2009}]|, )\\d+((?P=sep)\\d+)*)?)" + ).unwrap(); + let match_result = pattern + .captures(lines[0].borrow()) + .with_context(|| format!("parsing line {:?}", lines[0].borrow())) + .unwrap(); + if let Some(ref match_) = match_result { + let sep_opt = match_.name("sep").map(|m| m.as_str()); + let post = &lines[0].borrow()[match_.get(0).unwrap().end()..]; + + // strip leading zeros + let (raw_digits, zeros) = { + let mut zeros = false; + let mut raw_digits = &match_["num"]; + while raw_digits.len() > 1 && raw_digits.starts_with('0') { + zeros = true; + raw_digits = raw_digits.strip_prefix('0').unwrap(); + if let Some(ref sep) = sep_opt { + raw_digits = raw_digits.strip_prefix(sep).unwrap_or(raw_digits); + } + } + (raw_digits, zeros) + }; + + let parts = match sep_opt { + Some(ref sep) => raw_digits.split(sep).collect(), + None => vec![raw_digits], + }; + // TODO look into cleaner way to handle parts below (char slices?) + let raw_parts: Vec<&[u8]> = parts.iter().cloned().map(str::as_bytes).collect(); + + let all_parts_valid = sep_opt.is_none() + || ((1..=3).contains(&raw_parts[0].len()) && raw_parts[1..].iter().all(|p| p.len() == 3)); + let mut lone = lines.len() == 1 && post.chars().all(char::is_whitespace); + + let has_v = match_.name("v").is_some(); + let typo = + // failed paste of leading digits + (has_v && raw_parts.len() == 1 && raw_parts[0].len() <= 2) + || (has_v && all_parts_valid) // v followed by count + || match curr_count { + Some(count_val) => { + count_val.abs() >= 100 && match_.name("neg").is_some() == (count_val < 0) && { + let goal_string = count_val.abs().to_string(); + let goal_parts: Vec<&[u8]> = goal_string.as_bytes().rchunks(3).rev().collect(); + let (&last, front) = goal_parts.split_last().unwrap(); + let partials: Vec> = vec![ + // missing last digit + front.iter().cloned().chain([last.get(..last.len() - 1).unwrap()]).collect(), + + // missing last two digits + front.iter().cloned().chain([last.get(..last.len() - 2).unwrap()]).collect(), + + // missing second-last digit + front.iter().cloned().chain([ + last.get(..last.len() - 2).unwrap(), + slice::from_ref(&last[last.len() - 1]), + ]).collect(), + ]; + + // missing any of last two digits or double paste + partials.contains(&raw_parts) + || partials.iter().any( + |p| raw_parts == p[..p.len() - 1].iter().cloned() + .chain([p[p.len() - 1], goal_parts[0]]) + .chain(goal_parts[1..].iter().cloned()) + .collect::>() + ) + } + }, + None => false + }; + if has_v || zeros || typo || (parts == ["0"] && match_.name("neg").is_some()) { + number_out = None; + count_attempt = true; + deletable = lone; + } else { + let mut groups_okay = all_parts_valid; + let mut use_parts = &parts; + if let Some(count_val) = curr_count { + if sep_opt.is_some() && sep_opt.unwrap().chars().all(char::is_whitespace) { + // Presume that the intended count consists of as many valid digit groups as + // necessary to match the number of digits in the expected count, if + // possible. + let digit_count = format!("{}", count_val.abs()).len(); + let mut use_parts = vec![]; + let mut accum = 0; + for (i, part) in parts.iter().cloned().enumerate() { + let part_valid = if i == 0 { part.len() <= 3 } else { part.len() == 3 }; + if part_valid && accum < digit_count { + use_parts.push(part); + accum += part.len(); + } else { + break; + } + } + + // Could still be a no-separator count with some extra digit groups on the + // same line. + if use_parts.is_empty() { + use_parts = vec![parts[0]]; + } + + lone = lone && use_parts.len() == parts.len(); + } + } + + let digits = use_parts.join(""); + let number = { + let magnitude: Count = digits.parse().unwrap(); + if match_.name("neg").is_some() { -magnitude } else { magnitude } + }; + let special = match curr_count { + Some(count_val) => (number - count_val).abs() <= 25 && is_special(number), + None => false, + }; + if groups_okay { + deletable = lone && !special; + if use_parts.len() == parts.len() + && !post.is_empty() && !post.chars().next().unwrap().is_whitespace() { + count_attempt = match curr_count { + Some(count_val) => (number - count_val).abs() <= 25, + None => false, + }; + number_out = None; + } else { + count_attempt = true; + number_out = Some(number); + } + } else { + number_out = None; + count_attempt = true; + deletable = false; + } + } + + // TODO consider integrating this into logic above + if lines[0].borrow().chars().next().unwrap().is_digit(10) && !count_attempt { + count_attempt = true; + deletable = false; + } + } else { + // no count attempt found + number_out = None; + count_attempt = false; + deletable = false; + } + } + ParsedUpdate { + number: number_out, + command: command, + count_attempt: count_attempt, + deletable: deletable, + } +} + +fn is_special(count: Count) -> bool { + let count_str = format!("{}", count); + + [0, 1, 333, 999].contains(&(count % 1000)) + // palindrome + || (count > 10_000_000 && count_str.chars().rev().collect::() == count_str) + // repeated sequence + || (1..=count_str.len() / 2).any( + |len| count_str.len() % len == 0 + && count_str.as_bytes().chunks(len).collect::>().len() == 1) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_command() { + assert_eq!(parse_command("/U/foo reset", "foo"), Some(Command::Reset)); + assert_eq!(parse_command("current count", ""), Some(Command::Report)); + assert_eq!(parse_command("foo", ""), None); + } + + #[test] + fn test_is_special() { + assert!(is_special(10_333)); + assert!(is_special(10_000_001)); + assert!(is_special(12341234)); + assert!(!is_special(1937592)); + } +}