Compare commits

...

1 Commits

Author SHA1 Message Date
Beto Dealmeida
a5fb5dcd54 feat: Impala dialect for sqlglot 2025-07-28 16:40:46 -04:00
4 changed files with 582 additions and 3 deletions

View File

@@ -17,5 +17,6 @@
from .dremio import Dremio
from .firebolt import Firebolt, FireboltOld
from .impala import Impala
__all__ = ["Dremio", "Firebolt", "FireboltOld"]
__all__ = ["Dremio", "Firebolt", "FireboltOld", "Impala"]

View File

@@ -0,0 +1,259 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import typing as t
from sqlglot import exp, generator, parser
from sqlglot.dialects.hive import Hive
from sqlglot.helper import seq_get
class Impala(Hive):
"""
A sqlglot dialect for Impala.
Impala is similar to Hive but with some key differences:
- No support for LATERAL VIEW, use JOIN with UNNEST instead
- Different date/time functions
- No support for TRANSFORM
- Limited support for certain Hive-specific features
"""
class Parser(Hive.Parser):
FUNCTIONS = {
**Hive.Parser.FUNCTIONS,
# Impala-specific functions
"MONTHS_ADD": lambda args: exp.DateAdd(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("MONTH"),
),
"MONTHS_SUB": lambda args: exp.DateSub(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("MONTH"),
),
"YEARS_ADD": lambda args: exp.DateAdd(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("YEAR"),
),
"YEARS_SUB": lambda args: exp.DateSub(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("YEAR"),
),
"DAYS_ADD": lambda args: exp.DateAdd(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("DAY"),
),
"DAYS_SUB": lambda args: exp.DateSub(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("DAY"),
),
"WEEKS_ADD": lambda args: exp.DateAdd(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("WEEK"),
),
"WEEKS_SUB": lambda args: exp.DateSub(
this=seq_get(args, 0),
expression=seq_get(args, 1),
unit=exp.Literal.string("WEEK"),
),
# Impala uses different names for some functions
"DATE_PART": lambda args: _parse_date_part(args),
"EXTRACT": lambda args: _parse_extract(args),
# Override Hive functions that Impala doesn't support
"STR_TO_MAP": None, # Not supported in Impala
"XPATH": None, # Not supported in Impala
"XPATH_BOOLEAN": None,
"XPATH_DOUBLE": None,
"XPATH_FLOAT": None,
"XPATH_INT": None,
"XPATH_LONG": None,
"XPATH_SHORT": None,
"XPATH_STRING": None,
}
NO_PAREN_FUNCTION_PARSERS = {
**parser.Parser.NO_PAREN_FUNCTION_PARSERS,
# Remove TRANSFORM as it's not supported in Impala
}
NO_PAREN_FUNCTION_PARSERS.pop("TRANSFORM", None)
def _parse_lateral(self) -> t.Optional[exp.Lateral]:
# Impala doesn't support LATERAL VIEW, it uses different syntax
# This prevents parsing LATERAL VIEW syntax
return None
class Generator(Hive.Generator):
# Impala-specific type mappings
TYPE_MAPPING = {
**Hive.Generator.TYPE_MAPPING,
exp.DataType.Type.VARCHAR: "STRING", # Impala treats VARCHAR as STRING
exp.DataType.Type.NVARCHAR: "STRING",
exp.DataType.Type.CHAR: "STRING", # Impala treats CHAR as STRING
exp.DataType.Type.NCHAR: "STRING",
}
TRANSFORMS = {
**Hive.Generator.TRANSFORMS,
# Date/time functions
exp.DateAdd: lambda self, e: _date_add_sql(self, e),
exp.DateSub: lambda self, e: _date_sub_sql(self, e),
# Impala doesn't support certain Hive features
exp.StrToMap: lambda self, e: self.unsupported(
"STR_TO_MAP is not supported in Impala"
),
exp.Transform: lambda self, e: self.unsupported(
"TRANSFORM is not supported in Impala"
),
exp.QueryTransform: lambda self, e: self.unsupported(
"TRANSFORM is not supported in Impala"
),
# Override LATERAL VIEW handling
exp.Lateral: lambda self, e: _lateral_sql(self, e),
# JSON functions have different names in Impala
exp.JSONExtract: lambda self, e: self.func(
"JSON_QUERY", e.this, e.expression
),
exp.JSONExtractScalar: lambda self, e: self.func(
"JSON_VALUE", e.this, e.expression
),
# Impala uses different syntax for COLLECT_LIST/SET
exp.ArrayAgg: lambda self, e: self.func(
"GROUP_CONCAT",
e.this.this if isinstance(e.this, exp.Order) else e.this,
exp.Literal.string(","),
),
exp.ArrayUniqueAgg: lambda self, e: self.func(
"GROUP_CONCAT",
self.sql(exp.Distinct(expressions=[e.this])),
exp.Literal.string(","),
),
}
def datatype_sql(self, expression: exp.DataType) -> str:
# Impala treats CHAR/VARCHAR as STRING
if expression.is_type("char", "varchar", "nchar", "nvarchar"):
return "STRING"
return super().datatype_sql(expression)
def lateral_sql(self, expression: exp.Lateral) -> str:
# Impala doesn't use LATERAL VIEW syntax
# Instead, it uses regular JOIN with UNNEST
if isinstance(expression.this, exp.Unnest):
return self.sql(expression.this)
return super().lateral_sql(expression)
def _parse_date_part(args: t.List[exp.Expression]) -> exp.Expression:
"""Parse DATE_PART function which extracts date parts."""
part = seq_get(args, 0)
date = seq_get(args, 1)
if isinstance(part, exp.Literal):
part_name = part.name.upper()
if part_name == "YEAR":
return exp.Year(this=date)
elif part_name == "MONTH":
return exp.Month(this=date)
elif part_name == "DAY":
return exp.Day(this=date)
elif part_name == "HOUR":
return exp.Hour(this=date)
elif part_name == "MINUTE":
return exp.Minute(this=date)
elif part_name == "SECOND":
return exp.Second(this=date)
return exp.Extract(this=part, expression=date)
def _parse_extract(args: t.List[exp.Expression]) -> exp.Expression:
"""Parse EXTRACT function."""
return exp.Extract(this=seq_get(args, 0), expression=seq_get(args, 1))
def _date_add_sql(self: generator.Generator, expression: exp.DateAdd) -> str:
"""Generate SQL for date addition in Impala."""
unit = expression.text("unit").upper()
# Map generic units to Impala-specific functions
unit_map = {
"YEAR": "YEARS_ADD",
"MONTH": "MONTHS_ADD",
"WEEK": "WEEKS_ADD",
"DAY": "DAYS_ADD",
}
func_name = unit_map.get(unit, "DATE_ADD")
if func_name != "DATE_ADD":
return self.func(func_name, expression.this, expression.expression)
# For other units, use DATE_ADD with INTERVAL
return self.func(
"DATE_ADD",
expression.this,
self.sql(exp.Interval(this=expression.expression, unit=expression.unit)),
)
def _date_sub_sql(self: generator.Generator, expression: exp.DateSub) -> str:
"""Generate SQL for date subtraction in Impala."""
unit = expression.text("unit").upper()
# Map generic units to Impala-specific functions
unit_map = {
"YEAR": "YEARS_SUB",
"MONTH": "MONTHS_SUB",
"WEEK": "WEEKS_SUB",
"DAY": "DAYS_SUB",
}
func_name = unit_map.get(unit, "DATE_SUB")
if func_name != "DATE_SUB":
return self.func(func_name, expression.this, expression.expression)
# For other units, use DATE_SUB with INTERVAL
return self.func(
"DATE_SUB",
expression.this,
self.sql(exp.Interval(this=expression.expression, unit=expression.unit)),
)
def _lateral_sql(self: generator.Generator, expression: exp.Lateral) -> str:
"""Generate SQL for LATERAL expressions in Impala."""
# Impala doesn't support LATERAL VIEW syntax
# It uses regular JOIN with UNNEST instead
this = expression.this
if isinstance(this, exp.Unnest):
# Just return the UNNEST expression without LATERAL VIEW
return self.sql(this)
# For other cases, try to generate standard SQL
return self.sql(this)

View File

@@ -44,7 +44,7 @@ from sqlglot.optimizer.scope import (
)
from superset.exceptions import QueryClauseValidationException, SupersetParseError
from superset.sql.dialects import Dremio, Firebolt
from superset.sql.dialects import Dremio, Firebolt, Impala
if TYPE_CHECKING:
from superset.models.core import Database
@@ -81,7 +81,7 @@ SQLGLOT_DIALECTS = {
"hana": Dialects.POSTGRES,
"hive": Dialects.HIVE,
# "ibmi": ???
# "impala": ???
"impala": Impala,
# "kustosql": ???
# "kylin": ???
"mariadb": Dialects.MYSQL,

View File

@@ -0,0 +1,319 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from sqlglot import exp, parse_one
from superset.sql.dialects.impala import Impala
def test_impala_date_add_functions() -> None:
"""
Test Impala-specific date addition functions.
"""
# Test MONTHS_ADD
sql = "SELECT MONTHS_ADD(date_col, 3) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT MONTHS_ADD(date_col, 3) FROM table1"
# Test YEARS_ADD
sql = "SELECT YEARS_ADD(date_col, 2) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT YEARS_ADD(date_col, 2) FROM table1"
# Test DAYS_ADD
sql = "SELECT DAYS_ADD(date_col, 7) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT DAYS_ADD(date_col, 7) FROM table1"
# Test WEEKS_ADD
sql = "SELECT WEEKS_ADD(date_col, 4) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT WEEKS_ADD(date_col, 4) FROM table1"
def test_impala_date_sub_functions() -> None:
"""
Test Impala-specific date subtraction functions.
"""
# Test MONTHS_SUB
sql = "SELECT MONTHS_SUB(date_col, 3) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT MONTHS_SUB(date_col, 3) FROM table1"
# Test YEARS_SUB
sql = "SELECT YEARS_SUB(date_col, 2) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT YEARS_SUB(date_col, 2) FROM table1"
# Test DAYS_SUB
sql = "SELECT DAYS_SUB(date_col, 7) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT DAYS_SUB(date_col, 7) FROM table1"
# Test WEEKS_SUB
sql = "SELECT WEEKS_SUB(date_col, 4) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT WEEKS_SUB(date_col, 4) FROM table1"
def test_impala_date_part_function() -> None:
"""
Test DATE_PART function parsing.
"""
# Test DATE_PART with YEAR
sql = "SELECT DATE_PART('YEAR', date_col) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT YEAR(date_col) FROM table1"
# Test DATE_PART with MONTH
sql = "SELECT DATE_PART('MONTH', date_col) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT MONTH(date_col) FROM table1"
# Test DATE_PART with DAY
sql = "SELECT DATE_PART('DAY', date_col) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT DAY(date_col) FROM table1"
def test_impala_data_types() -> None:
"""
Test that Impala treats VARCHAR/CHAR as STRING.
"""
# Test VARCHAR conversion
sql = "CREATE TABLE test (col1 VARCHAR(100))"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "CREATE TABLE test (col1 STRING)"
# Test CHAR conversion
sql = "CREATE TABLE test (col1 CHAR(10))"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "CREATE TABLE test (col1 STRING)"
# Test NVARCHAR conversion
sql = "CREATE TABLE test (col1 NVARCHAR(50))"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "CREATE TABLE test (col1 STRING)"
def test_impala_unsupported_functions() -> None:
"""
Test that unsupported Hive functions are handled properly.
"""
# STR_TO_MAP is not supported
with pytest.raises(Exception) as exc_info:
sql = "SELECT STR_TO_MAP('a:1,b:2', ',', ':') FROM table1"
ast = parse_one(sql, dialect=Impala)
ast.sql(dialect=Impala)
assert "STR_TO_MAP is not supported" in str(exc_info.value)
# TRANSFORM is not supported
with pytest.raises(Exception) as exc_info:
sql = "SELECT TRANSFORM(col) USING 'script.py' FROM table1"
ast = parse_one(sql, dialect=Impala)
ast.sql(dialect=Impala)
assert "TRANSFORM is not supported" in str(exc_info.value)
def test_impala_json_functions() -> None:
"""
Test that JSON functions are mapped to Impala equivalents.
"""
# Test JSON extract scalar to JSON_VALUE
sql = "SELECT GET_JSON_OBJECT(json_col, '$.field') FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT JSON_VALUE(json_col, '$.field') FROM table1"
def test_impala_aggregate_functions() -> None:
"""
Test that COLLECT_LIST/SET are mapped to GROUP_CONCAT.
"""
# Test COLLECT_LIST conversion
sql = "SELECT COLLECT_LIST(col1) FROM table1 GROUP BY col2"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT GROUP_CONCAT(col1, ',') FROM table1 GROUP BY col2"
# Test COLLECT_SET conversion
sql = "SELECT COLLECT_SET(col1) FROM table1 GROUP BY col2"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert (
regenerated
== "SELECT GROUP_CONCAT(DISTINCT col1, ',') FROM table1 GROUP BY col2"
)
def test_standard_date_add_sub() -> None:
"""
Test standard DATE_ADD/DATE_SUB with generic units.
"""
# Generic DATE_ADD (should be parsed and regenerated as-is)
sql = "SELECT DATE_ADD(date_col, 5) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT DAYS_ADD(date_col, 5) FROM table1"
def test_unnest_without_lateral_view() -> None:
"""
Test that UNNEST is handled without LATERAL VIEW syntax.
"""
# Impala uses JOIN with UNNEST instead of LATERAL VIEW
sql = "SELECT * FROM table1, UNNEST(array_col) AS t(elem)"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
# Should not contain LATERAL VIEW
assert "LATERAL VIEW" not in regenerated
assert "UNNEST" in regenerated
def test_extract_function() -> None:
"""
Test EXTRACT function.
"""
sql = "SELECT EXTRACT(YEAR FROM date_col) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT EXTRACT(YEAR FROM date_col) FROM table1"
sql = "SELECT EXTRACT(MONTH FROM date_col) FROM table1"
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
assert regenerated == "SELECT EXTRACT(MONTH FROM date_col) FROM table1"
def test_complex_query() -> None:
"""
Test a more complex query with multiple Impala-specific features.
"""
sql = """
SELECT
customer_id,
GROUP_CONCAT(product_name, ',') as products,
MONTHS_ADD(order_date, 1) as next_month,
JSON_VALUE(order_details, '$.total') as total
FROM orders
WHERE YEARS_SUB(order_date, 1) > '2022-01-01'
GROUP BY customer_id, order_date, order_details
"""
ast = parse_one(sql, dialect=Impala)
regenerated = ast.sql(dialect=Impala)
# Check key components are preserved
assert "GROUP_CONCAT" in regenerated
assert "MONTHS_ADD" in regenerated
assert "JSON_VALUE" in regenerated
assert "YEARS_SUB" in regenerated
@pytest.mark.parametrize(
"func, expected_class, unit",
[
("MONTHS_ADD(x, 1)", exp.DateAdd, "MONTH"),
("YEARS_SUB(x, 2)", exp.DateSub, "YEAR"),
("DAYS_ADD(x, 3)", exp.DateAdd, "DAY"),
("WEEKS_SUB(x, 4)", exp.DateSub, "WEEK"),
],
)
def test_date_functions_parse_correctly(func, expected_class, unit):
parsed = parse_one(func, read=Impala)
assert isinstance(parsed, expected_class)
assert parsed.text("unit").upper() == unit
@pytest.mark.parametrize(
"sql, expected_expr",
[
("DATE_PART('year', d)", exp.Year),
("DATE_PART('second', d)", exp.Second),
("EXTRACT(year FROM d)", exp.Extract),
],
)
def test_date_part_and_extract(sql, expected_expr):
parsed = parse_one(sql, read=Impala)
if expected_expr == exp.Extract:
assert isinstance(parsed, expected_expr)
else:
assert isinstance(parsed, expected_expr)
@pytest.mark.parametrize(
"expr, expected_sql",
[
(
exp.DateAdd(
this=exp.Column(this="x"),
expression=exp.Literal.number(1),
unit=exp.Literal.string("YEAR"),
),
"YEARS_ADD(x, 1)",
),
(
exp.DateSub(
this=exp.Column(this="y"),
expression=exp.Literal.number(2),
unit=exp.Literal.string("MONTH"),
),
"MONTHS_SUB(y, 2)",
),
],
)
def test_sql_generation_for_date_add_sub(expr, expected_sql):
sql = expr.sql(dialect=Impala)
assert sql == expected_sql
def test_string_type_is_mapped():
expr = exp.DataType.build("VARCHAR")
sql = expr.sql(dialect=Impala)
assert sql == "STRING"
def test_unsupported_functions_raise():
gen = Impala.Generator()
with pytest.raises(Exception):
gen.sql(exp.StrToMap(this=exp.Literal.string("x")))
@pytest.mark.parametrize(
"sql",
[
"LATERAL VIEW explode(arr) t",
],
)
def test_lateral_not_supported(sql):
parsed = parse_one(sql, read=Impala)
assert parsed is None or isinstance(parsed, exp.Expression)