From c0c1a8744fed10105f362c67c8171e367ffc13ee Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 4 Jul 2024 22:07:21 +0800 Subject: [PATCH 1/7] feat: handle parentheses with unary ops Signed-off-by: Ruihang Xia --- src/common/function/src/scalars/matches.rs | 393 ++++++++++++++++----- 1 file changed, 307 insertions(+), 86 deletions(-) diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 704a0a359550..06262e11ff3f 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -146,6 +146,10 @@ enum PatternAst { op: BinaryOp, children: Vec, }, + Group { + op: UnaryOp, + child: Box, + }, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -185,6 +189,14 @@ impl PatternAst { BinaryOp::Or => exprs.reduce(Expr::or).unwrap(), } } + PatternAst::Group { op, child } => { + let child = child.into_like_expr(column); + match op { + UnaryOp::Must => child, + UnaryOp::Optional => child, + UnaryOp::Negative => logical_expr::not(child), + } + } } } @@ -207,7 +219,11 @@ impl PatternAst { /// Transform this AST with preset rules to make it correct. fn transform_ast(self) -> Result { - self.transform_up(Self::collapse_binary_branch_fn) + self + // .transform_up(Self::replace_or_with_and_fn) + // .context(GeneralDataFusionSnafu) + // .map(|data| data.data)? + .transform_up(Self::collapse_binary_branch_fn) .context(GeneralDataFusionSnafu) .map(|data| data.data)? .transform_up(Self::eliminate_optional_fn) @@ -218,6 +234,47 @@ impl PatternAst { .map(|data| data.data) } + /// This function assumes the AST is in binary-tree form. + fn replace_or_with_and_fn(self) -> DfResult> { + let PatternAst::Binary { + op: parent_op, + children, + } = self + else { + return Ok(Transformed::no(self)); + }; + + if children.len() != 2 { + panic!("expect binary tree AST"); + } + + if parent_op != BinaryOp::Or { + return Ok(Transformed::no(PatternAst::Binary { + op: parent_op, + children, + })); + } + + fn is_must_or_must_not(ast: &PatternAst) -> bool { + match ast { + PatternAst::Literal { op, .. } => *op == UnaryOp::Must || *op == UnaryOp::Negative, + PatternAst::Binary { .. } | PatternAst::Group { .. } => false, + } + } + + if is_must_or_must_not(&children[0]) || is_must_or_must_not(&children[1]) { + Ok(Transformed::yes(PatternAst::Binary { + op: BinaryOp::And, + children, + })) + } else { + Ok(Transformed::no(PatternAst::Binary { + op: parent_op, + children, + })) + } + } + /// Collapse binary branch with the same operator. I.e., this transformer /// changes the binary-tree AST into a multiple branching AST. /// @@ -237,7 +294,7 @@ impl PatternAst { for child in children { match child { - PatternAst::Literal { .. } => { + PatternAst::Literal { .. } | PatternAst::Group { .. } => { collapsed.push(child); } PatternAst::Binary { op, children } => { @@ -289,8 +346,10 @@ impl PatternAst { match child { PatternAst::Literal { op, - pattern: ref _pattern, - } => match op { + // pattern: ref _pattern, + .. + } + | PatternAst::Group { op, .. } => match op { UnaryOp::Must => must_list.push(child), UnaryOp::Optional => optional_list.push(child), UnaryOp::Negative => must_not_list.push(child), @@ -336,7 +395,7 @@ impl PatternAst { })) } - /// Eliminate single child node. If a binary node has only one child, it can be + /// Eliminate single child [`PatternAst::Binary`] node. If a binary node has only one child, it can be /// replaced by its only child. /// /// This function prefers to be applied in a top-down manner. But it's not required. @@ -351,7 +410,7 @@ impl PatternAst { children: grand_children, .. } => !grand_children.is_empty(), - PatternAst::Literal { .. } => true, + PatternAst::Literal { .. } | PatternAst::Group { .. } => true, }); if children.len() == 1 { @@ -375,9 +434,9 @@ impl TreeNode for PatternAst { return Ok(TreeNodeRecursion::Stop); } } - Ok(TreeNodeRecursion::Continue) } + PatternAst::Group { op: _, child } => f(child), } } @@ -396,6 +455,12 @@ impl TreeNode for PatternAst { children: new_children, }) }), + PatternAst::Group { op, child } => f(*child)?.map_data(|new_child| { + Ok(PatternAst::Group { + op, + child: Box::new(new_child), + }) + }), } } } @@ -409,7 +474,10 @@ impl ParserContext { pub fn parse_pattern(mut self, pattern: &str) -> Result { let tokenizer = Tokenizer::default(); let raw_tokens = tokenizer.tokenize(pattern)?; + let raw_tokens = Self::accomplish_optional_unary_op(raw_tokens); + println!("[DEBUG] accomplished tokens: {:?}", raw_tokens); let mut tokens = Self::to_rpn(raw_tokens)?; + println!("[DEBUG] rpn tokens: {:?}", tokens); while !tokens.is_empty() { self.parse_one_impl(&mut tokens)?; @@ -433,6 +501,54 @@ impl ParserContext { } } + /// Add [`Token::Optional`] for all bare [`Token::Phase`] and [`Token::Or`] + /// for all adjacent [`Token::Phase`]s. + fn accomplish_optional_unary_op(raw_tokens: Vec) -> Vec { + println!("[DEBUG] raw tokens before accomplish: {:?}", raw_tokens); + let mut is_prev_unary_op = false; + // The first one doesn't need binary op + let mut is_binary_op_before = true; + let mut new_tokens = Vec::with_capacity(raw_tokens.len()); + for token in raw_tokens { + // fill `Token::Or` + if !is_binary_op_before + && matches!( + token, + Token::Phase(_) + | Token::OpenParen + | Token::Must + | Token::Optional + | Token::Negative + ) + { + is_binary_op_before = true; + new_tokens.push(Token::Or); + } + if matches!( + token, + Token::OpenParen // treat open paren as begin of new group + | Token::And | Token::Or + ) { + is_binary_op_before = true; + } else if matches!(token, Token::Phase(_) | Token::CloseParen) { + // need binary op next time + is_binary_op_before = false; + } + + // fill `Token::Optional` + if !is_prev_unary_op && matches!(token, Token::Phase(_) | Token::OpenParen) { + // is_prev_unary_op = false; + new_tokens.push(Token::Optional); + } else { + is_prev_unary_op = matches!(token, Token::Must | Token::Negative); + } + + new_tokens.push(token); + } + + new_tokens + } + /// Convert infix token stream to RPN fn to_rpn(mut raw_tokens: Vec) -> Result> { let mut operator_stack = vec![]; @@ -442,20 +558,24 @@ impl ParserContext { while let Some(token) = raw_tokens.pop() { match token { Token::Phase(_) => result.push(token), - Token::Must | Token::Negative => { - // unary operator with paren is not handled yet - let phase = raw_tokens.pop().context(InvalidFuncArgsSnafu { - err_msg: "Unexpected end of pattern, expected a phase after unary operator", - })?; - result.push(phase); - result.push(token); + Token::Must | Token::Negative | Token::Optional => { + // TODO: unary operator with paren is not handled yet + // let phase = raw_tokens.pop().context(InvalidFuncArgsSnafu { + // err_msg: "Unexpected end of pattern, expected a phase after unary operator", + // })?; + // result.push(phase); + operator_stack.push(token); } Token::OpenParen => operator_stack.push(token), Token::And | Token::Or => { - // Or has lower priority than And + // - Or has lower priority than And + // - Binary op have lower priority than unary op while let Some(stack_top) = operator_stack.last() - && *stack_top == Token::And - && token == Token::Or + && ((*stack_top == Token::And && token == Token::Or) + || matches!( + *stack_top, + Token::Must | Token::Optional | Token::Negative + )) { result.push(operator_stack.pop().unwrap()); } @@ -489,36 +609,88 @@ impl ParserContext { if let Some(token) = tokens.pop() { match token { Token::Must => { - let phase = tokens.pop().context(InvalidFuncArgsSnafu { - err_msg: "Unexpected end of pattern, expected a phase after \"+\" operator", + if self.stack.is_empty() { + // self.unwind_for_phase(tokens, token)?; + self.parse_one_impl(tokens)?; + } + let phase_or_group = self.stack.pop().context(InvalidFuncArgsSnafu { + err_msg: "Invalid pattern, \"+\" operator should have one operand", })?; - self.stack.push(PatternAst::Literal { - op: UnaryOp::Must, - pattern: Self::unwrap_phase(phase)?, - }); + match phase_or_group { + PatternAst::Literal { op: _, pattern } => { + self.stack.push(PatternAst::Literal { + op: UnaryOp::Must, + pattern, + }); + } + PatternAst::Binary { .. } | PatternAst::Group { .. } => { + self.stack.push(PatternAst::Group { + op: UnaryOp::Must, + child: Box::new(phase_or_group), + }) + } + } return Ok(()); } Token::Negative => { - let phase = tokens.pop().context(InvalidFuncArgsSnafu { - err_msg: "Unexpected end of pattern, expected a phase after \"-\" operator", + if self.stack.is_empty() { + // self.unwind_for_phase(tokens, token)?; + self.parse_one_impl(tokens)?; + } + let phase_or_group = self.stack.pop().context(InvalidFuncArgsSnafu { + err_msg: "Invalid pattern, \"-\" operator should have one operand", })?; - self.stack.push(PatternAst::Literal { - op: UnaryOp::Negative, - pattern: Self::unwrap_phase(phase)?, - }); + match phase_or_group { + PatternAst::Literal { op: _, pattern } => { + self.stack.push(PatternAst::Literal { + op: UnaryOp::Negative, + pattern, + }); + } + PatternAst::Binary { .. } | PatternAst::Group { .. } => { + self.stack.push(PatternAst::Group { + op: UnaryOp::Negative, + child: Box::new(phase_or_group), + }) + } + } + return Ok(()); + } + Token::Optional => { + if self.stack.is_empty() { + // self.unwind_for_phase(tokens, token)?; + self.parse_one_impl(tokens)?; + } + let phase_or_group = self.stack.pop().context(InvalidFuncArgsSnafu { + err_msg: + "Invalid pattern, OPTIONAL(space) operator should have one operand", + })?; + match phase_or_group { + PatternAst::Literal { op: _, pattern } => { + self.stack.push(PatternAst::Literal { + op: UnaryOp::Optional, + pattern, + }); + } + PatternAst::Binary { .. } | PatternAst::Group { .. } => { + self.stack.push(PatternAst::Group { + op: UnaryOp::Optional, + child: Box::new(phase_or_group), + }) + } + } return Ok(()); } - Token::Phase(token) => { - let phase = token.clone(); + Token::Phase(pattern) => { self.stack.push(PatternAst::Literal { + // Op here is a placeholder op: UnaryOp::Optional, - pattern: phase, - }); - return Ok(()); + pattern, + }) } Token::And => { if self.stack.is_empty() { - self.parse_one_impl(tokens)? + self.parse_one_impl(tokens)?; }; let rhs = self.stack.pop().context(InvalidFuncArgsSnafu { err_msg: "Invalid pattern, \"AND\" operator should have two operands", @@ -566,6 +738,18 @@ impl ParserContext { Ok(()) } + /// Try unwind one token if the next one is phase. + fn unwind_for_phase(&mut self, tokens: &mut Vec, unwind: Token) -> Result<()> { + let stack_top = tokens.last().context(InvalidFuncArgsSnafu { + err_msg: "Unexpected EOF, want a phase or group", + })?; + if matches!(*stack_top, Token::Phase(_)) { + tokens.push(unwind); + } + + Ok(()) + } + fn unwrap_phase(token: Token) -> Result { match token { Token::Phase(phase) => Ok(phase), @@ -593,6 +777,12 @@ enum Token { CloseParen, /// Any other phases Phase(String), + + /// This is not a token from user input, but a placeholder for internal use. + /// It's used to accomplish the unary operator class with Must and Negative. + /// In user provided pattern, optional is expressed by a bare phase or group + /// (simply nothing or writespace). + Optional, } #[derive(Default)] @@ -862,19 +1052,19 @@ mod test { children: vec![ PatternAst::Literal { op: UnaryOp::Optional, - pattern: "d".to_string(), + pattern: "a".to_string(), }, PatternAst::Literal { op: UnaryOp::Optional, - pattern: "c".to_string(), + pattern: "b".to_string(), }, PatternAst::Literal { op: UnaryOp::Optional, - pattern: "b".to_string(), + pattern: "c".to_string(), }, PatternAst::Literal { op: UnaryOp::Optional, - pattern: "a".to_string(), + pattern: "d".to_string(), }, ], }, @@ -886,11 +1076,11 @@ mod test { children: vec![ PatternAst::Literal { op: UnaryOp::Optional, - pattern: "b".to_string(), + pattern: "a".to_string(), }, PatternAst::Literal { op: UnaryOp::Optional, - pattern: "a".to_string(), + pattern: "b".to_string(), }, PatternAst::Binary { op: BinaryOp::And, @@ -951,18 +1141,21 @@ mod test { PatternAst::Binary { op: BinaryOp::Or, children: vec![ - PatternAst::Binary { - op: BinaryOp::And, - children: vec![ - PatternAst::Literal { - op: UnaryOp::Optional, - pattern: "a".to_string(), - }, - PatternAst::Literal { - op: UnaryOp::Optional, - pattern: "b".to_string(), - }, - ], + PatternAst::Group { + op: UnaryOp::Optional, + child: Box::new(PatternAst::Binary { + op: BinaryOp::And, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "a".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "b".to_string(), + }, + ], + }), }, PatternAst::Literal { op: UnaryOp::Optional, @@ -980,18 +1173,21 @@ mod test { op: UnaryOp::Optional, pattern: "a".to_string(), }, - PatternAst::Binary { - op: BinaryOp::Or, - children: vec![ - PatternAst::Literal { - op: UnaryOp::Optional, - pattern: "b".to_string(), - }, - PatternAst::Literal { - op: UnaryOp::Optional, - pattern: "c".to_string(), - }, - ], + PatternAst::Group { + op: UnaryOp::Optional, + child: Box::new(PatternAst::Binary { + op: BinaryOp::Or, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "b".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Optional, + pattern: "c".to_string(), + }, + ], + }), }, ], }, @@ -1001,18 +1197,23 @@ mod test { PatternAst::Binary { op: BinaryOp::Or, children: vec![ - PatternAst::Literal { - op: UnaryOp::Negative, - pattern: "c".to_string(), - }, - PatternAst::Literal { - op: UnaryOp::Must, - pattern: "b".to_string(), - }, PatternAst::Literal { op: UnaryOp::Optional, pattern: "a".to_string(), }, + PatternAst::Binary { + op: BinaryOp::Or, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Must, + pattern: "b".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Negative, + pattern: "c".to_string(), + }, + ], + }, ], }, ), @@ -1021,17 +1222,25 @@ mod test { PatternAst::Binary { op: BinaryOp::Or, children: vec![ - PatternAst::Literal { + PatternAst::Group { op: UnaryOp::Optional, - pattern: "c".to_string(), - }, - PatternAst::Literal { - op: UnaryOp::Must, - pattern: "b".to_string(), + child: Box::new(PatternAst::Binary { + op: BinaryOp::Or, + children: vec![ + PatternAst::Literal { + op: UnaryOp::Must, + pattern: "a".to_string(), + }, + PatternAst::Literal { + op: UnaryOp::Must, + pattern: "b".to_string(), + }, + ], + }), }, PatternAst::Literal { - op: UnaryOp::Must, - pattern: "a".to_string(), + op: UnaryOp::Optional, + pattern: "c".to_string(), }, ], }, @@ -1138,11 +1347,23 @@ mod test { "fox AND +jumps AND -over", vec![false, false, false, false, true, false, false], ), - // TODO: still parentheses are not supported - // ( - // "(+fox +jumps) over", - // vec![true, true, true, true, true, true, true], - // ), + // weird parentheses cases + ( + "(+fox +jumps) over", + vec![true, true, true, true, true, true, true], + ), + ( + "+(fox jumps) AND over", + vec![true, true, true, true, false, true, true], + ), + ( + "over -(fox jumps)", + vec![false, false, false, false, false, false, false], + ), + ( + "over -(fox AND jumps)", + vec![false, false, true, true, false, false, false], + ), ]; let f = MatchesFunction; From 6e38d5e83f0d051c792bfd66197ff09c8b9df065 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 4 Jul 2024 22:10:58 +0800 Subject: [PATCH 2/7] clean up Signed-off-by: Ruihang Xia --- src/common/function/src/scalars/matches.rs | 92 ++-------------------- 1 file changed, 6 insertions(+), 86 deletions(-) diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 06262e11ff3f..2796a3d44788 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -219,11 +219,7 @@ impl PatternAst { /// Transform this AST with preset rules to make it correct. fn transform_ast(self) -> Result { - self - // .transform_up(Self::replace_or_with_and_fn) - // .context(GeneralDataFusionSnafu) - // .map(|data| data.data)? - .transform_up(Self::collapse_binary_branch_fn) + self.transform_up(Self::collapse_binary_branch_fn) .context(GeneralDataFusionSnafu) .map(|data| data.data)? .transform_up(Self::eliminate_optional_fn) @@ -234,47 +230,6 @@ impl PatternAst { .map(|data| data.data) } - /// This function assumes the AST is in binary-tree form. - fn replace_or_with_and_fn(self) -> DfResult> { - let PatternAst::Binary { - op: parent_op, - children, - } = self - else { - return Ok(Transformed::no(self)); - }; - - if children.len() != 2 { - panic!("expect binary tree AST"); - } - - if parent_op != BinaryOp::Or { - return Ok(Transformed::no(PatternAst::Binary { - op: parent_op, - children, - })); - } - - fn is_must_or_must_not(ast: &PatternAst) -> bool { - match ast { - PatternAst::Literal { op, .. } => *op == UnaryOp::Must || *op == UnaryOp::Negative, - PatternAst::Binary { .. } | PatternAst::Group { .. } => false, - } - } - - if is_must_or_must_not(&children[0]) || is_must_or_must_not(&children[1]) { - Ok(Transformed::yes(PatternAst::Binary { - op: BinaryOp::And, - children, - })) - } else { - Ok(Transformed::no(PatternAst::Binary { - op: parent_op, - children, - })) - } - } - /// Collapse binary branch with the same operator. I.e., this transformer /// changes the binary-tree AST into a multiple branching AST. /// @@ -344,12 +299,7 @@ impl PatternAst { for child in children { match child { - PatternAst::Literal { - op, - // pattern: ref _pattern, - .. - } - | PatternAst::Group { op, .. } => match op { + PatternAst::Literal { op, .. } | PatternAst::Group { op, .. } => match op { UnaryOp::Must => must_list.push(child), UnaryOp::Optional => optional_list.push(child), UnaryOp::Negative => must_not_list.push(child), @@ -475,9 +425,7 @@ impl ParserContext { let tokenizer = Tokenizer::default(); let raw_tokens = tokenizer.tokenize(pattern)?; let raw_tokens = Self::accomplish_optional_unary_op(raw_tokens); - println!("[DEBUG] accomplished tokens: {:?}", raw_tokens); let mut tokens = Self::to_rpn(raw_tokens)?; - println!("[DEBUG] rpn tokens: {:?}", tokens); while !tokens.is_empty() { self.parse_one_impl(&mut tokens)?; @@ -504,7 +452,6 @@ impl ParserContext { /// Add [`Token::Optional`] for all bare [`Token::Phase`] and [`Token::Or`] /// for all adjacent [`Token::Phase`]s. fn accomplish_optional_unary_op(raw_tokens: Vec) -> Vec { - println!("[DEBUG] raw tokens before accomplish: {:?}", raw_tokens); let mut is_prev_unary_op = false; // The first one doesn't need binary op let mut is_binary_op_before = true; @@ -537,7 +484,6 @@ impl ParserContext { // fill `Token::Optional` if !is_prev_unary_op && matches!(token, Token::Phase(_) | Token::OpenParen) { - // is_prev_unary_op = false; new_tokens.push(Token::Optional); } else { is_prev_unary_op = matches!(token, Token::Must | Token::Negative); @@ -559,11 +505,6 @@ impl ParserContext { match token { Token::Phase(_) => result.push(token), Token::Must | Token::Negative | Token::Optional => { - // TODO: unary operator with paren is not handled yet - // let phase = raw_tokens.pop().context(InvalidFuncArgsSnafu { - // err_msg: "Unexpected end of pattern, expected a phase after unary operator", - // })?; - // result.push(phase); operator_stack.push(token); } Token::OpenParen => operator_stack.push(token), @@ -610,7 +551,6 @@ impl ParserContext { match token { Token::Must => { if self.stack.is_empty() { - // self.unwind_for_phase(tokens, token)?; self.parse_one_impl(tokens)?; } let phase_or_group = self.stack.pop().context(InvalidFuncArgsSnafu { @@ -634,7 +574,6 @@ impl ParserContext { } Token::Negative => { if self.stack.is_empty() { - // self.unwind_for_phase(tokens, token)?; self.parse_one_impl(tokens)?; } let phase_or_group = self.stack.pop().context(InvalidFuncArgsSnafu { @@ -658,7 +597,6 @@ impl ParserContext { } Token::Optional => { if self.stack.is_empty() { - // self.unwind_for_phase(tokens, token)?; self.parse_one_impl(tokens)?; } let phase_or_group = self.stack.pop().context(InvalidFuncArgsSnafu { @@ -737,28 +675,6 @@ impl ParserContext { Ok(()) } - - /// Try unwind one token if the next one is phase. - fn unwind_for_phase(&mut self, tokens: &mut Vec, unwind: Token) -> Result<()> { - let stack_top = tokens.last().context(InvalidFuncArgsSnafu { - err_msg: "Unexpected EOF, want a phase or group", - })?; - if matches!(*stack_top, Token::Phase(_)) { - tokens.push(unwind); - } - - Ok(()) - } - - fn unwrap_phase(token: Token) -> Result { - match token { - Token::Phase(phase) => Ok(phase), - _ => InvalidFuncArgsSnafu { - err_msg: format!("Unexpected token: {:?}, want a phase", token), - } - .fail(), - } - } } #[derive(Clone, Debug, PartialEq, Eq)] @@ -1364,6 +1280,10 @@ mod test { "over -(fox AND jumps)", vec![false, false, true, true, false, false, false], ), + ( + "over AND -(-(fox OR jumps))", + vec![true, true, true, true, false, true, true], + ), ]; let f = MatchesFunction; From 5d94be21e1850c1b9d7243a099421958123b3a15 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 4 Jul 2024 22:17:45 +0800 Subject: [PATCH 3/7] add comment Signed-off-by: Ruihang Xia --- src/common/function/src/scalars/matches.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 2796a3d44788..799a67fe3c39 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -138,18 +138,16 @@ impl MatchesFunction { #[derive(Debug, Clone, PartialEq, Eq)] enum PatternAst { - Literal { - op: UnaryOp, - pattern: String, - }, + // Distinguish this with `Group` for simplicity + /// A leaf node that matches a column with `pattern` + Literal { op: UnaryOp, pattern: String }, + /// Flattened binary chains Binary { op: BinaryOp, children: Vec, }, - Group { - op: UnaryOp, - child: Box, - }, + /// A sub-tree enclosed by parenthesis + Group { op: UnaryOp, child: Box }, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -1225,7 +1223,7 @@ mod test { "-over AND -lazy", vec![false, false, false, false, false, false, false], ), - // test priority between AND & OR + // priority between AND & OR ( "fox AND jumps OR over", vec![true, true, true, true, true, true, true], From de124e973ca30b9b61b5333e46f0f80cad1fb386 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 4 Jul 2024 22:50:07 +0800 Subject: [PATCH 4/7] add sqlness test Signed-off-by: Ruihang Xia --- src/common/function/src/function_registry.rs | 4 +++ src/common/function/src/scalars/matches.rs | 17 ++++++---- .../standalone/common/select/matches.result | 34 +++++++++++++++++++ .../standalone/common/select/matches.sql | 17 ++++++++++ 4 files changed, 65 insertions(+), 7 deletions(-) create mode 100644 tests/cases/standalone/common/select/matches.result create mode 100644 tests/cases/standalone/common/select/matches.sql diff --git a/src/common/function/src/function_registry.rs b/src/common/function/src/function_registry.rs index 04eb673b56ce..31b2b223a67b 100644 --- a/src/common/function/src/function_registry.rs +++ b/src/common/function/src/function_registry.rs @@ -22,6 +22,7 @@ use crate::function::FunctionRef; use crate::scalars::aggregate::{AggregateFunctionMetaRef, AggregateFunctions}; use crate::scalars::date::DateFunction; use crate::scalars::expression::ExpressionFunction; +use crate::scalars::matches::MatchesFunction; use crate::scalars::math::MathFunction; use crate::scalars::numpy::NumpyFunction; use crate::scalars::timestamp::TimestampFunction; @@ -86,6 +87,9 @@ pub static FUNCTION_REGISTRY: Lazy> = Lazy::new(|| { // Aggregate functions AggregateFunctions::register(&function_registry); + // Full text search function + MatchesFunction::register(&function_registry); + // System and administration functions SystemFunction::register(&function_registry); TableFunction::register(&function_registry); diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 799a67fe3c39..e9565629d46f 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -33,10 +33,19 @@ use snafu::{ensure, OptionExt, ResultExt}; use store_api::storage::ConcreteDataType; use crate::function::{Function, FunctionContext}; +use crate::function_registry::FunctionRegistry; /// `matches` for full text search. +/// +/// Usage: matches(``, ``) -> boolean #[derive(Clone, Debug, Default)] -struct MatchesFunction; +pub(crate) struct MatchesFunction; + +impl MatchesFunction { + pub fn register(registry: &FunctionRegistry) { + registry.register(Arc::new(MatchesFunction)); + } +} impl fmt::Display for MatchesFunction { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -74,12 +83,6 @@ impl Function for MatchesFunction { ), } ); - ensure!( - columns[1].len() == 1, - InvalidFuncArgsSnafu { - err_msg: "The second argument should be a string literal", - } - ); let pattern_vector = &columns[1] .cast(&ConcreteDataType::string_datatype()) .context(InvalidInputTypeSnafu { diff --git a/tests/cases/standalone/common/select/matches.result b/tests/cases/standalone/common/select/matches.result new file mode 100644 index 000000000000..94969c025f01 --- /dev/null +++ b/tests/cases/standalone/common/select/matches.result @@ -0,0 +1,34 @@ +create table fox ( + ts timestamp time index, + fox string, +); + +Affected Rows: 0 + +insert into fox values + (1, 'The quick brown fox jumps over the lazy dog'), + (2, 'The fox jumps over the lazy dog'), + (3, 'The quick brown jumps over the lazy dog'), + (4, 'The quick brown fox over the lazy dog'), + (5, 'The quick brown fox jumps the lazy dog'), + (6, 'The quick brown fox jumps over dog'), + (7, 'The quick brown fox jumps over the dog'); + +Affected Rows: 7 + +select fox from fox where matches(fox, '"fox jumps"') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +drop table fox; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/select/matches.sql b/tests/cases/standalone/common/select/matches.sql new file mode 100644 index 000000000000..aab0b876f00a --- /dev/null +++ b/tests/cases/standalone/common/select/matches.sql @@ -0,0 +1,17 @@ +create table fox ( + ts timestamp time index, + fox string, +); + +insert into fox values + (1, 'The quick brown fox jumps over the lazy dog'), + (2, 'The fox jumps over the lazy dog'), + (3, 'The quick brown jumps over the lazy dog'), + (4, 'The quick brown fox over the lazy dog'), + (5, 'The quick brown fox jumps the lazy dog'), + (6, 'The quick brown fox jumps over dog'), + (7, 'The quick brown fox jumps over the dog'); + +select fox from fox where matches(fox, '"fox jumps"') order by ts; + +drop table fox; From 07472f868ecf388d745615a2be13d9e719e3d743 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Mon, 8 Jul 2024 16:49:21 +0800 Subject: [PATCH 5/7] check tokens before convert to RPN Signed-off-by: Ruihang Xia --- src/common/function/src/lib.rs | 1 + src/common/function/src/scalars/matches.rs | 61 +++++++++++++++++++--- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/src/common/function/src/lib.rs b/src/common/function/src/lib.rs index cee3b1e79a94..204333601ceb 100644 --- a/src/common/function/src/lib.rs +++ b/src/common/function/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. #![feature(let_chains)] +#![feature(try_blocks)] mod macros; pub mod scalars; diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index e9565629d46f..6ffa9240bd98 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -425,7 +425,7 @@ impl ParserContext { pub fn parse_pattern(mut self, pattern: &str) -> Result { let tokenizer = Tokenizer::default(); let raw_tokens = tokenizer.tokenize(pattern)?; - let raw_tokens = Self::accomplish_optional_unary_op(raw_tokens); + let raw_tokens = Self::accomplish_optional_unary_op(raw_tokens)?; let mut tokens = Self::to_rpn(raw_tokens)?; while !tokens.is_empty() { @@ -452,10 +452,14 @@ impl ParserContext { /// Add [`Token::Optional`] for all bare [`Token::Phase`] and [`Token::Or`] /// for all adjacent [`Token::Phase`]s. - fn accomplish_optional_unary_op(raw_tokens: Vec) -> Vec { + /// + /// This function also does some checks by the way. Like if two unary ops are + /// adjacent. + fn accomplish_optional_unary_op(raw_tokens: Vec) -> Result> { let mut is_prev_unary_op = false; // The first one doesn't need binary op let mut is_binary_op_before = true; + let mut is_unary_op_before = false; let mut new_tokens = Vec::with_capacity(raw_tokens.len()); for token in raw_tokens { // fill `Token::Or` @@ -490,10 +494,23 @@ impl ParserContext { is_prev_unary_op = matches!(token, Token::Must | Token::Negative); } + // check if unary ops are adjacent by the way + if matches!(token, Token::Must | Token::Optional | Token::Negative) { + if is_unary_op_before { + return InvalidFuncArgsSnafu { + err_msg: "Invalid pattern, unary operators should not be adjacent", + } + .fail(); + } + is_unary_op_before = true; + } else { + is_unary_op_before = false; + } + new_tokens.push(token); } - new_tokens + Ok(new_tokens) } /// Convert infix token stream to RPN @@ -524,24 +541,32 @@ impl ParserContext { operator_stack.push(token); } Token::CloseParen => { + let mut is_open_paren_found = false; while let Some(op) = operator_stack.pop() { if op == Token::OpenParen { + is_open_paren_found = true; break; } result.push(op); } + if !is_open_paren_found { + return InvalidFuncArgsSnafu { + err_msg: "Unmatched close parentheses", + } + .fail(); + } } } } - while let Some(operand) = operator_stack.pop() { - if operand == Token::OpenParen { + while let Some(operator) = operator_stack.pop() { + if operator == Token::OpenParen { return InvalidFuncArgsSnafu { err_msg: "Unmatched parentheses", } .fail(); } - result.push(operand); + result.push(operator); } Ok(result) @@ -1025,6 +1050,28 @@ mod test { } } + #[test] + fn invalid_ast() { + let cases = [ + (r#"a b (c"#, "Unmatched parentheses"), + (r#"a b) c"#, "Unmatched close parentheses"), + (r#"a +-b"#, "unary operators should not be adjacent"), + ]; + + for (query, expected) in cases { + let result: Result<()> = try { + let parser = ParserContext { stack: vec![] }; + let ast = parser.parse_pattern(query)?; + let _ast = ast.transform_ast()?; + () + }; + + assert!(result.is_err(), "{query}"); + let actual_error = result.unwrap_err().to_string(); + assert!(actual_error.contains(expected), "{query}, {actual_error}"); + } + } + #[test] fn valid_matches_parser() { let cases = [ @@ -1188,7 +1235,7 @@ mod test { } #[test] - fn evaluate_matches_without_parenthesis() { + fn evaluate_matches() { let input_data = vec![ "The quick brown fox jumps over the lazy dog", "The fox jumps over the lazy dog", From b0a6c36639fbe7b2104456ec0afd8ecf476870cf Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Mon, 8 Jul 2024 17:01:47 +0800 Subject: [PATCH 6/7] add test cases to sqlness Signed-off-by: Ruihang Xia --- .../standalone/common/select/matches.result | 225 ++++++++++++++++++ .../standalone/common/select/matches.sql | 40 ++++ 2 files changed, 265 insertions(+) diff --git a/tests/cases/standalone/common/select/matches.result b/tests/cases/standalone/common/select/matches.result index 94969c025f01..a3e4cad9a27f 100644 --- a/tests/cases/standalone/common/select/matches.result +++ b/tests/cases/standalone/common/select/matches.result @@ -28,6 +28,231 @@ select fox from fox where matches(fox, '"fox jumps"') order by ts; | The quick brown fox jumps over the dog | +---------------------------------------------+ +select fox from fox where matches(fox, '"quick brown"') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, '"fox jumps"') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'fox OR lazy') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'fox AND lazy') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, '-over -lazy') order by ts; + +++ +++ + +select fox from fox where matches(fox, '-over AND -lazy') order by ts; + +++ +++ + +select fox from fox where matches(fox, 'fox AND jumps OR over') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'fox OR brown AND quick') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, '(fox OR brown) AND quick') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'brown AND quick OR fox') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'brown AND (quick OR fox)') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'brown AND quick AND fox OR jumps AND over AND lazy') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'quick brown fox +jumps') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'fox +jumps -over') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps the lazy dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'fox AND +jumps AND -over') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps the lazy dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, '(+fox +jumps) over') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, '+(fox jumps) AND over') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'over -(fox jumps)') order by ts; + +++ +++ + +select fox from fox where matches(fox, 'over -(fox AND jumps)') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | ++---------------------------------------------+ + +select fox from fox where matches(fox, 'over AND -(-(fox OR jumps))') order by ts; + ++---------------------------------------------+ +| fox | ++---------------------------------------------+ +| The quick brown fox jumps over the lazy dog | +| The fox jumps over the lazy dog | +| The quick brown jumps over the lazy dog | +| The quick brown fox over the lazy dog | +| The quick brown fox jumps over dog | +| The quick brown fox jumps over the dog | ++---------------------------------------------+ + drop table fox; Affected Rows: 0 diff --git a/tests/cases/standalone/common/select/matches.sql b/tests/cases/standalone/common/select/matches.sql index aab0b876f00a..41357a8ff811 100644 --- a/tests/cases/standalone/common/select/matches.sql +++ b/tests/cases/standalone/common/select/matches.sql @@ -14,4 +14,44 @@ insert into fox values select fox from fox where matches(fox, '"fox jumps"') order by ts; +select fox from fox where matches(fox, '"quick brown"') order by ts; + +select fox from fox where matches(fox, '"fox jumps"') order by ts; + +select fox from fox where matches(fox, 'fox OR lazy') order by ts; + +select fox from fox where matches(fox, 'fox AND lazy') order by ts; + +select fox from fox where matches(fox, '-over -lazy') order by ts; + +select fox from fox where matches(fox, '-over AND -lazy') order by ts; + +select fox from fox where matches(fox, 'fox AND jumps OR over') order by ts; + +select fox from fox where matches(fox, 'fox OR brown AND quick') order by ts; + +select fox from fox where matches(fox, '(fox OR brown) AND quick') order by ts; + +select fox from fox where matches(fox, 'brown AND quick OR fox') order by ts; + +select fox from fox where matches(fox, 'brown AND (quick OR fox)') order by ts; + +select fox from fox where matches(fox, 'brown AND quick AND fox OR jumps AND over AND lazy') order by ts; + +select fox from fox where matches(fox, 'quick brown fox +jumps') order by ts; + +select fox from fox where matches(fox, 'fox +jumps -over') order by ts; + +select fox from fox where matches(fox, 'fox AND +jumps AND -over') order by ts; + +select fox from fox where matches(fox, '(+fox +jumps) over') order by ts; + +select fox from fox where matches(fox, '+(fox jumps) AND over') order by ts; + +select fox from fox where matches(fox, 'over -(fox jumps)') order by ts; + +select fox from fox where matches(fox, 'over -(fox AND jumps)') order by ts; + +select fox from fox where matches(fox, 'over AND -(-(fox OR jumps))') order by ts; + drop table fox; From 322079ad69964b70d6f88d2656966f62847bd058 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Mon, 8 Jul 2024 17:14:15 +0800 Subject: [PATCH 7/7] fix clippy Signed-off-by: Ruihang Xia --- src/common/function/src/scalars/matches.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/function/src/scalars/matches.rs b/src/common/function/src/scalars/matches.rs index 6ffa9240bd98..d3276f83a2a0 100644 --- a/src/common/function/src/scalars/matches.rs +++ b/src/common/function/src/scalars/matches.rs @@ -1063,7 +1063,6 @@ mod test { let parser = ParserContext { stack: vec![] }; let ast = parser.parse_pattern(query)?; let _ast = ast.transform_ast()?; - () }; assert!(result.is_err(), "{query}");