From f365029f2273121dd3f59b47bddd81aedec3a91a Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Fri, 5 Dec 2025 00:26:04 +0530 Subject: [PATCH] GenericDialect: support colon operator for JsonAccess - Port JsonAccess colon operator from Snowflake to Generic dialect - This will be used in variant data type support in Datafusion - see discussion in https://github.com/datafusion-contrib/datafusion-variant/issues/2 --- src/dialect/mod.rs | 10 +++ src/dialect/mssql.rs | 9 +++ src/dialect/postgresql.rs | 3 + src/parser/mod.rs | 8 +- tests/sqlparser_common.rs | 145 +++++++++++++++++++++++++++++++++++ tests/sqlparser_snowflake.rs | 115 +-------------------------- 6 files changed, 174 insertions(+), 116 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 873108ee9..d1728566e 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -759,6 +759,13 @@ pub trait Dialect: Debug + Any { Token::DoubleColon | Token::ExclamationMark | Token::LBracket | Token::CaretAt => { Ok(p!(DoubleColon)) } + Token::Colon => match parser.peek_nth_token(1).token { + // When colon is followed by a string or a number, it's usually in MAP syntax. + Token::SingleQuotedString(_) | Token::Number(_, _) => Ok(self.prec_unknown()), + // In other cases, it's used in semi-structured data traversal like in variant or JSON + // string columns. See `JsonAccess`. + _ => Ok(p!(Colon)), + }, Token::Arrow | Token::LongArrow | Token::HashArrow @@ -812,6 +819,7 @@ pub trait Dialect: Debug + Any { Precedence::Ampersand => 23, Precedence::Caret => 22, Precedence::Pipe => 21, + Precedence::Colon => 21, Precedence::Between => 20, Precedence::Eq => 20, Precedence::Like => 19, @@ -1269,6 +1277,8 @@ pub enum Precedence { Caret, /// Bitwise `OR` / pipe operator (`|`). Pipe, + /// `:` operator for json/variant access. + Colon, /// `BETWEEN` operator. Between, /// Equality operator (`=`). 
diff --git a/src/dialect/mssql.rs b/src/dialect/mssql.rs index faf3402c2..a28545250 100644 --- a/src/dialect/mssql.rs +++ b/src/dialect/mssql.rs @@ -160,6 +160,15 @@ impl Dialect for MsSqlDialect { None } } + + fn get_next_precedence(&self, parser: &Parser) -> Option<Result<u8, ParserError>> { + let token = parser.peek_token(); + match token.token { + // lowest prec to prevent it from turning into a binary op + Token::Colon => Some(Ok(self.prec_unknown())), + _ => None, + } + } } impl MsSqlDialect { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 02bab0e06..7c9e7db86 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -136,6 +136,8 @@ impl Dialect for PostgreSqlDialect { | Token::ShiftRight | Token::ShiftLeft | Token::CustomBinaryOperator(_) => Some(Ok(PG_OTHER_PREC)), + // lowest prec to prevent it from turning into a binary op + Token::Colon => Some(Ok(self.prec_unknown())), _ => None, } } @@ -159,6 +161,7 @@ impl Dialect for PostgreSqlDialect { Precedence::Ampersand => PG_OTHER_PREC, Precedence::Caret => CARET_PREC, Precedence::Pipe => PG_OTHER_PREC, + Precedence::Colon => PG_OTHER_PREC, Precedence::Between => BETWEEN_LIKE_PREC, Precedence::Eq => EQ_PREC, Precedence::Like => BETWEEN_LIKE_PREC, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4cee5c33e..52bd0bc51 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3921,7 +3921,7 @@ impl<'a> Parser<'a> { expr: Box::new(expr), }) } else if Token::LBracket == *tok && self.dialect.supports_partiql() - || (dialect_of!(self is SnowflakeDialect | GenericDialect) && Token::Colon == *tok) + || (Token::Colon == *tok) { self.prev_token(); self.parse_json_access(expr) @@ -3957,7 +3957,8 @@ impl<'a> Parser<'a> { let lower_bound = if self.consume_token(&Token::Colon) { None } else { - Some(self.parse_expr()?) + // parse expr until we hit a colon (or any token with lower precedence) + Some(self.parse_subexpr(self.dialect.prec_value(Precedence::Colon))?) 
}; // check for end @@ -3985,7 +3986,8 @@ impl<'a> Parser<'a> { stride: None, }); } else { - Some(self.parse_expr()?) + // parse expr until we hit a colon (or any token with lower precedence) + Some(self.parse_subexpr(self.dialect.prec_value(Precedence::Colon))?) }; // check for end diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index 95ad9a209..bbbf0d835 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -18067,3 +18067,148 @@ fn test_binary_kw_as_cast() { all_dialects_where(|d| d.supports_binary_kw_as_cast()) .one_statement_parses_to("SELECT BINARY 1+1", "SELECT CAST(1 + 1 AS BINARY)"); } + +#[test] +fn parse_semi_structured_data_traversal() { + let dialects = TestedDialects::new(vec![ + Box::new(GenericDialect {}), + Box::new(SnowflakeDialect {}), + Box::new(DatabricksDialect {}), + ]); + + // most basic case + let sql = "SELECT a:b FROM t"; + let select = dialects.verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "b".to_owned(), + quoted: false + }] + }, + }), + select.projection[0] + ); + + // identifier can be quoted + let sql = r#"SELECT a:"my long object key name" FROM t"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "my long object key name".to_owned(), + quoted: true + }] + }, + }), + select.projection[0] + ); + + dialects.verified_stmt("SELECT a:b::INT FROM t"); + + // unquoted keywords are permitted in the object key + let sql = "SELECT a:select, a:from FROM t"; + let select = dialects.verified_only_select(sql); + assert_eq!( + vec![ + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: 
"select".to_owned(), + quoted: false + }] + }, + }), + SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![JsonPathElem::Dot { + key: "from".to_owned(), + quoted: false + }] + }, + }) + ], + select.projection + ); + + // multiple levels can be traversed + // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation + let sql = r#"SELECT a:foo."bar".baz"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + vec![SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![ + JsonPathElem::Dot { + key: "foo".to_owned(), + quoted: false, + }, + JsonPathElem::Dot { + key: "bar".to_owned(), + quoted: true, + }, + JsonPathElem::Dot { + key: "baz".to_owned(), + quoted: false, + } + ] + }, + })], + select.projection + ); + + // dot and bracket notation can be mixed (starting with : case) + // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation + let sql = r#"SELECT a:foo[0].bar"#; + let select = dialects.verified_only_select(sql); + assert_eq!( + vec![SelectItem::UnnamedExpr(Expr::JsonAccess { + value: Box::new(Expr::Identifier(Ident::new("a"))), + path: JsonPath { + path: vec![ + JsonPathElem::Dot { + key: "foo".to_owned(), + quoted: false, + }, + JsonPathElem::Bracket { + key: Expr::value(number("0")), + }, + JsonPathElem::Dot { + key: "bar".to_owned(), + quoted: false, + } + ] + }, + })], + select.projection + ); +} + +#[test] +fn parse_array_subscript() { + let dialects = all_dialects_except(|d| { + d.is::() + || d.is::() + || d.is::() + || d.is::() + }); + + dialects.verified_stmt("SELECT arr[1]"); + dialects.verified_stmt("SELECT arr[:]"); + dialects.verified_stmt("SELECT arr[1:2]"); + dialects.verified_stmt("SELECT arr[1:2:4]"); + dialects.verified_stmt("SELECT arr[1:array_length(arr)]"); + dialects.verified_stmt("SELECT arr[array_length(arr) - 
1:array_length(arr)]"); + dialects + .verified_stmt("SELECT arr[array_length(arr) - 2:array_length(arr) - 1:array_length(arr)]"); + + dialects.verified_stmt("SELECT arr[1][2]"); + dialects.verified_stmt("SELECT arr[:][:]"); +} diff --git a/tests/sqlparser_snowflake.rs b/tests/sqlparser_snowflake.rs index 37e9f8cb4..5889b2bd0 100644 --- a/tests/sqlparser_snowflake.rs +++ b/tests/sqlparser_snowflake.rs @@ -1265,37 +1265,8 @@ fn parse_lateral_flatten() { // https://docs.snowflake.com/en/user-guide/querying-semistructured #[test] fn parse_semi_structured_data_traversal() { - // most basic case - let sql = "SELECT a:b FROM t"; - let select = snowflake().verified_only_select(sql); - assert_eq!( - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "b".to_owned(), - quoted: false - }] - }, - }), - select.projection[0] - ); - - // identifier can be quoted - let sql = r#"SELECT a:"my long object key name" FROM t"#; - let select = snowflake().verified_only_select(sql); - assert_eq!( - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "my long object key name".to_owned(), - quoted: true - }] - }, - }), - select.projection[0] - ); + // see `tests/sqlparser_common.rs` -> `parse_semi_structured_data_traversal` for more test + // cases. This test only has Snowflake-specific syntax like array access. 
// expressions are allowed in bracket notation let sql = r#"SELECT a[2 + 2] FROM t"#; @@ -1316,88 +1287,6 @@ fn parse_semi_structured_data_traversal() { select.projection[0] ); - snowflake().verified_stmt("SELECT a:b::INT FROM t"); - - // unquoted keywords are permitted in the object key - let sql = "SELECT a:select, a:from FROM t"; - let select = snowflake().verified_only_select(sql); - assert_eq!( - vec![ - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "select".to_owned(), - quoted: false - }] - }, - }), - SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![JsonPathElem::Dot { - key: "from".to_owned(), - quoted: false - }] - }, - }) - ], - select.projection - ); - - // multiple levels can be traversed - // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation - let sql = r#"SELECT a:foo."bar".baz"#; - let select = snowflake().verified_only_select(sql); - assert_eq!( - vec![SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![ - JsonPathElem::Dot { - key: "foo".to_owned(), - quoted: false, - }, - JsonPathElem::Dot { - key: "bar".to_owned(), - quoted: true, - }, - JsonPathElem::Dot { - key: "baz".to_owned(), - quoted: false, - } - ] - }, - })], - select.projection - ); - - // dot and bracket notation can be mixed (starting with : case) - // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation - let sql = r#"SELECT a:foo[0].bar"#; - let select = snowflake().verified_only_select(sql); - assert_eq!( - vec![SelectItem::UnnamedExpr(Expr::JsonAccess { - value: Box::new(Expr::Identifier(Ident::new("a"))), - path: JsonPath { - path: vec![ - JsonPathElem::Dot { - key: "foo".to_owned(), - quoted: false, - }, - JsonPathElem::Bracket { - key: Expr::value(number("0")), - 
}, - JsonPathElem::Dot { - key: "bar".to_owned(), - quoted: false, - } - ] - }, - })], - select.projection - ); - // dot and bracket notation can be mixed (starting with bracket case) // https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation let sql = r#"SELECT a[0].foo.bar"#;