Files
superset2/tests/unit_tests/db_engine_specs/test_druid.py

214 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from datetime import datetime
from typing import Optional
from unittest import mock
import pandas as pd
import pytest
from sqlalchemy import column
from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import SupersetResultSet
from tests.unit_tests.db_engine_specs.utils import assert_convert_dttm
from tests.unit_tests.fixtures.common import dttm # noqa: F401
@pytest.mark.parametrize(
"target_type,expected_result",
[
("Date", "CAST(TIME_PARSE('2019-01-02') AS DATE)"),
("DateTime", "TIME_PARSE('2019-01-02T03:04:05')"),
("TimeStamp", "TIME_PARSE('2019-01-02T03:04:05')"),
("UnknownType", None),
],
)
def test_convert_dttm(
target_type: str,
expected_result: Optional[str],
dttm: datetime, # noqa: F811
) -> None:
from superset.db_engine_specs.druid import DruidEngineSpec as spec # noqa: N813
assert_convert_dttm(spec, target_type, expected_result, dttm)
@pytest.mark.parametrize(
"time_grain,expected_result",
[
("PT1S", "TIME_FLOOR(CAST(col AS TIMESTAMP), 'PT1S')"),
("PT5M", "TIME_FLOOR(CAST({col} AS TIMESTAMP), 'PT5M')"),
(
"P1W/1970-01-03T00:00:00Z",
"TIME_SHIFT(TIME_FLOOR(TIME_SHIFT(CAST(col AS TIMESTAMP), 'P1D', 1), 'P1W'), 'P1D', 5)", # noqa: E501
),
(
"1969-12-28T00:00:00Z/P1W",
"TIME_SHIFT(TIME_FLOOR(TIME_SHIFT(CAST(col AS TIMESTAMP), 'P1D', 1), 'P1W'), 'P1D', -1)", # noqa: E501
),
],
)
def test_timegrain_expressions(time_grain: str, expected_result: str) -> None:
"""
DB Eng Specs (druid): Test time grain expressions
"""
from superset.db_engine_specs.druid import DruidEngineSpec
assert str(
DruidEngineSpec.get_timestamp_expr(
col=column("col"), pdf=None, time_grain=time_grain
)
)
def test_extras_without_ssl() -> None:
from superset.db_engine_specs.druid import DruidEngineSpec
from tests.integration_tests.fixtures.database import default_db_extra
database = mock.Mock()
database.extra = default_db_extra
database.server_cert = None
extras = DruidEngineSpec.get_extra_params(database)
assert "connect_args" not in extras["engine_params"]
def test_extras_with_ssl() -> None:
from superset.db_engine_specs.druid import DruidEngineSpec
from tests.integration_tests.fixtures.certificates import ssl_certificate
from tests.integration_tests.fixtures.database import default_db_extra
database = mock.Mock()
database.extra = default_db_extra
database.server_cert = ssl_certificate
extras = DruidEngineSpec.get_extra_params(database)
connect_args = extras["engine_params"]["connect_args"]
assert connect_args["scheme"] == "https"
assert "ssl_verify_cert" in connect_args
# ---------------------------------------------------------------------------
# DruidEngineSpec column normalization tests
#
# pydruid infers column types from the first row value, which causes two
# related problems:
#
# Case 1 Mixed IEEE special-float strings and numbers:
# Druid cannot represent NaN/Infinity in JSON, so pydruid emits them as
# the strings "NaN", "Infinity", or "-Infinity". When these appear in a
# numeric column, pa.array() raises ArrowInvalid on the mixed str/float
# list and the column falls back to string serialisation.
#
# Case 2 None as the first value:
# pydruid's get_type(None) returns Type.STRING, so any nullable numeric
# column whose first row is null gets labelled STRING in the cursor
# description. pa.array() succeeds (producing float64) but
# data_type() used to return STRING because the cursor description won.
#
# DruidEngineSpec overrides normalize_column_values and resolve_column_type
# to handle both cases. BaseEngineSpec preserves the original behaviour.
# ---------------------------------------------------------------------------
def test_druid_ieee_special_floats_preserved_as_numeric() -> None:
"""
Case 1, DruidEngineSpec: columns that mix IEEE special-float strings with
real numbers must keep their numeric type (specials become null).
"""
from superset.db_engine_specs.druid import DruidEngineSpec
data = [("NaN",), (1.5,), ("Infinity",), (2.3,), ("-Infinity",), (None,)]
description = [("metric", "STRING", None, None, None, None, None)]
result_set = SupersetResultSet(data, description, DruidEngineSpec) # type: ignore
col = result_set.columns[0]
assert col["type"] == "FLOAT"
df = result_set.to_pandas_df()
assert pd.isna(df["metric"].iloc[0]) # "NaN" → null
assert df["metric"].iloc[1] == 1.5
assert pd.isna(df["metric"].iloc[2]) # "Infinity" → null
assert df["metric"].iloc[3] == 2.3
assert pd.isna(df["metric"].iloc[4]) # "-Infinity" → null
assert pd.isna(df["metric"].iloc[5]) # None → null
def test_base_spec_ieee_special_floats_stringified() -> None:
"""
Case 1, BaseEngineSpec: without Druid's override, columns with mixed
special-float strings and numbers fall through to string serialisation.
"""
data = [("NaN",), (1.5,), ("Infinity",)]
description = [("metric", "STRING", None, None, None, None, None)]
result_set = SupersetResultSet(data, description, BaseEngineSpec) # type: ignore
col = result_set.columns[0]
assert col["type"] == "STRING"
df = result_set.to_pandas_df()
assert df["metric"].iloc[0] == "NaN"
assert df["metric"].iloc[1] == "1.5"
assert df["metric"].iloc[2] == "Infinity"
def test_druid_none_first_value_reports_numeric_type() -> None:
"""
Case 2, DruidEngineSpec: when the cursor description says STRING (pydruid's
first-row None inference) but PyArrow correctly infers float64, the column
must be reported as FLOAT, not STRING.
"""
from superset.db_engine_specs.druid import DruidEngineSpec
data = [(None,), (1.5,), (2.3,), (None,), (4.7,)]
description = [("metric", "STRING", None, None, None, None, None)]
result_set = SupersetResultSet(data, description, DruidEngineSpec) # type: ignore
col = result_set.columns[0]
assert col["type"] == "FLOAT"
df = result_set.to_pandas_df()
assert pd.isna(df["metric"].iloc[0])
assert df["metric"].iloc[1] == 1.5
assert df["metric"].iloc[4] == 4.7
def test_base_spec_none_first_value_reports_string_type() -> None:
"""
Case 2, BaseEngineSpec: the cursor-description STRING type must continue
to win over PyArrow's float64 inference for non-Druid engines.
"""
data = [(None,), (1.5,), (2.3,)]
description = [("metric", "STRING", None, None, None, None, None)]
result_set = SupersetResultSet(data, description, BaseEngineSpec) # type: ignore
col = result_set.columns[0]
assert col["type"] == "STRING"
def test_non_string_cursor_type_unaffected_by_druid_spec() -> None:
"""
Columns with a non-STRING cursor description type must not be affected by
DruidEngineSpec's resolve_column_type override.
"""
from superset.db_engine_specs.druid import DruidEngineSpec
data = [(1,), (2,), (3,)]
description = [("count", "INT", None, None, None, None, None)]
result_set = SupersetResultSet(data, description, DruidEngineSpec) # type: ignore
col = result_set.columns[0]
assert col["type"] == "INT"