# superset2/tests/unit_tests/result_set_test.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=import-outside-toplevel, unused-argument

from datetime import datetime, timezone

import numpy as np
import pandas as pd
from numpy.core.multiarray import array
from pytest_mock import MockerFixture

from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import (
    stringify_extension_columns,
    stringify_values,
    SupersetResultSet,
)
from superset.superset_typing import DbapiResult
from superset.utils import json as superset_json


def test_column_names_as_bytes() -> None:
    """
    Test that we can handle column names as bytes.

    Every column name in the cursor description below is a ``bytes`` object;
    the rendered DataFrame is expected to show them as plain text headers.
    """
    from superset.db_engine_specs.redshift import RedshiftEngineSpec

    data = (
        [
            "2016-01-26",
            392.002014,
            397.765991,
            390.575012,
            392.153015,
            392.153015,
            58147000,
        ],
        [
            "2016-01-27",
            392.444,
            396.842987,
            391.782013,
            394.971985,
            394.971985,
            47424400,
        ],
    )
    description = [
        (b"date", 1043, None, None, None, None, None),
        (b"open", 701, None, None, None, None, None),
        (b"high", 701, None, None, None, None, None),
        (b"low", 701, None, None, None, None, None),
        (b"close", 701, None, None, None, None, None),
        (b"adj close", 701, None, None, None, None, None),
        (b"volume", 20, None, None, None, None, None),
    ]
    # ``SupersetResultSet`` is the module-level import; re-importing it inside
    # the function would only shadow that name.
    result_set = SupersetResultSet(data, description, RedshiftEngineSpec)  # type: ignore

    # The literal is padded with a leading newline and trailing indentation for
    # readability, hence the ``strip()``.
    expected_markdown = """
|    | date       |    open |    high |     low |   close |   adj close |   volume |
|---:|:-----------|--------:|--------:|--------:|--------:|------------:|---------:|
|  0 | 2016-01-26 | 392.002 | 397.766 | 390.575 | 392.153 |     392.153 | 58147000 |
|  1 | 2016-01-27 | 392.444 | 396.843 | 391.782 | 394.972 |     394.972 | 47424400 |
    """.strip()

    assert result_set.to_pandas_df().to_markdown() == expected_markdown


def test_stringify_with_null_integers() -> None:
    """
    Test that we can safely handle type errors when an integer column has a null value

    Each column of a structured array is passed through ``stringify_values`` on
    its own. The expectation is that strings come back unchanged, ``pd.NA`` and
    ``None`` come back as ``None``, and ``True`` comes back as ``"True"``.
    """
    data = [
        ("foo", "bar", pd.NA, None),
        ("foo", "bar", pd.NA, True),
        ("foo", "bar", pd.NA, None),
    ]
    numpy_dtype = [
        ("id", "object"),
        ("value", "object"),
        ("num", "object"),
        ("bool", "object"),
    ]
    # Field names of the structured dtype, in declaration order.
    column_names = [name for name, _ in numpy_dtype]

    records = np.array(data, dtype=numpy_dtype)
    stringified = np.array(
        [stringify_values(records[column]) for column in column_names]
    )

    # One row per column. Built with the public ``np.array`` constructor rather
    # than the ``array`` alias from the private ``numpy.core.multiarray`` module.
    expected = np.array(
        [
            ["foo", "foo", "foo"],
            ["bar", "bar", "bar"],
            [None, None, None],
            [None, "True", None],
        ],
        dtype=object,
    )

    assert np.array_equal(stringified, expected)


def test_stringify_with_null_timestamps() -> None:
    """
    Test that we can safely handle type errors when a timestamp column has a null value

    Each column of a structured array is passed through ``stringify_values`` on
    its own. The expectation is that strings come back unchanged, ``pd.NaT``
    and ``None`` come back as ``None``, and ``True`` comes back as ``"True"``.
    """
    data = [
        ("foo", "bar", pd.NaT, None),
        ("foo", "bar", pd.NaT, True),
        ("foo", "bar", pd.NaT, None),
    ]
    numpy_dtype = [
        ("id", "object"),
        ("value", "object"),
        ("num", "object"),
        ("bool", "object"),
    ]
    # Field names of the structured dtype, in declaration order.
    column_names = [name for name, _ in numpy_dtype]

    records = np.array(data, dtype=numpy_dtype)
    stringified = np.array(
        [stringify_values(records[column]) for column in column_names]
    )

    # One row per column. Built with the public ``np.array`` constructor rather
    # than the ``array`` alias from the private ``numpy.core.multiarray`` module.
    expected = np.array(
        [
            ["foo", "foo", "foo"],
            ["bar", "bar", "bar"],
            [None, None, None],
            [None, "True", None],
        ],
        dtype=object,
    )

    assert np.array_equal(stringified, expected)


def test_timezone_series(mocker: MockerFixture) -> None:
    """
    Check that a timezone-aware datetime makes it into the DataFrame intact.

    Guards against a regression that appeared when upgrading from
    Pandas 1.5.3 to 2.0.3. The module logger is patched so the test can also
    verify that no exception was logged along the way.
    """
    patched_logger = mocker.patch("superset.result_set.logger")

    aware_midnight = datetime(2023, 1, 1, tzinfo=timezone.utc)
    rows = [[aware_midnight]]
    description = [(b"__time", "datetime", None, None, None, None, False)]
    expected_rows = [[pd.Timestamp("2023-01-01 00:00:00+0000", tz="UTC")]]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore

    frame = result_set.to_pandas_df()
    assert frame.values.tolist() == expected_rows
    patched_logger.exception.assert_not_called()


def test_out_of_bounds_datetime_coerced_to_nat(mocker: MockerFixture) -> None:
    """
    A date past ~2262-04-11 does not fit pandas' int64 nanosecond
    representation.

    SupersetResultSet is expected to turn such a value into NaT instead of
    raising OutOfBoundsDatetime and logging an ERROR (which would show up as
    noise in observability tooling).
    """
    patched_logger = mocker.patch("superset.result_set.logger")

    far_future = datetime(3118, 1, 1, tzinfo=timezone.utc)
    rows = [[far_future]]
    description = [(b"dt", "datetime", None, None, None, None, False)]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore

    first_cell = result_set.to_pandas_df()["dt"].iloc[0]
    assert pd.isna(first_cell)
    patched_logger.exception.assert_not_called()


def test_get_column_description_from_empty_data_using_cursor_description(
    mocker: MockerFixture,
) -> None:
    """
    With no rows at all, the column description must still be derived from
    the cursor description, and nothing may be logged as an exception.
    """
    patched_logger = mocker.patch("superset.result_set.logger")

    no_rows: DbapiResult = []
    description = [(b"__time", "datetime", None, None, None, None, 1, 0, 255)]

    result_set = SupersetResultSet(no_rows, description, BaseEngineSpec)  # type: ignore

    reported_names = [col.get("column_name") for col in result_set.columns]
    assert "__time" in reported_names
    patched_logger.exception.assert_not_called()


def test_empty_column_names_get_synthetic_names() -> None:
    """
    An empty column name must be swapped for a synthetic one.

    SQL Server reports an empty-string column name in cursor.description for
    any un-aliased expression (e.g. ``SELECT COUNT(*) FROM t``), and an empty
    field name is illegal in NumPy structured arrays and PyArrow tables.
    Replacing it lets queries like ``SELECT COUNT(*) FROM t`` succeed on MSSQL.

    Regression test for https://github.com/apache/superset/issues/23848
    """
    synthetic_name = "_col_0"
    rows = [(42,)]
    description = [("", 3, None, None, None, None, None)]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore

    first_column = result_set.columns[0]
    assert first_column["column_name"] == synthetic_name

    frame = result_set.to_pandas_df()
    assert list(frame.columns) == [synthetic_name]
    assert frame[synthetic_name].iloc[0] == 42


def test_multiple_empty_column_names_get_unique_synthetic_names() -> None:
    """
    Several empty column names in one result (e.g. ``SELECT COUNT(*), SUM(x)``
    on MSSQL) must each end up with a different synthetic name.
    """
    unnamed_column = ("", 3, None, None, None, None, None)
    rows = [(10, 20)]
    description = [unnamed_column, unnamed_column]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore

    assigned_names = [column["column_name"] for column in result_set.columns]
    assert len(assigned_names) == 2
    # no two columns may share a name
    assert len(set(assigned_names)) == 2

    first_row = result_set.to_pandas_df().iloc[0]
    assert first_row.tolist() == [10, 20]


def test_empty_column_names_do_not_rename_explicit_synthetic_names() -> None:
    """
    A synthetic name given to an empty column must not clash with an explicit,
    user-selected name that already looks like a Superset fallback.
    """
    expected_names = ["_col_1", "_col_0"]
    rows = [(10, 20)]
    description = [
        ("", 3, None, None, None, None, None),
        ("_col_0", 3, None, None, None, None, None),
    ]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore

    assigned_names = [column["column_name"] for column in result_set.columns]
    assert assigned_names == expected_names

    frame = result_set.to_pandas_df()
    assert list(frame.columns) == expected_names
    assert frame.iloc[0].tolist() == [10, 20]


def test_json_data_type_preserved_as_objects() -> None:
    """
    JSON/JSONB values must reach the DataFrame as Python objects (dicts/lists)
    rather than being turned into strings.

    Handlebars templates and other features rely on reading JSON data as
    objects, not strings.

    See: https://github.com/apache/superset/issues/25125
    """
    # Rows shaped like a PostgreSQL JSONB result - psycopg2 hands back dicts
    rows = [
        (1, {"key": "value1", "nested": {"a": 1}}, "text1"),
        (2, {"key": "value2", "items": [1, 2, 3]}, "text2"),
        (3, None, "text3"),
        (4, {"mixed": "string"}, "text4"),
    ]
    description = [
        ("id", 23, None, None, None, None, None),  # INT
        ("json_col", 3802, None, None, None, None, None),  # JSONB
        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
    ]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore
    frame = result_set.to_pandas_df()
    json_cells = frame["json_col"]
    text_cells = frame["text_col"]

    # The JSON column holds Python objects, not their string form
    assert json_cells.iloc[0] == {"key": "value1", "nested": {"a": 1}}
    assert isinstance(json_cells.iloc[0], dict)
    assert json_cells.iloc[1] == {"key": "value2", "items": [1, 2, 3]}
    assert json_cells.iloc[2] is None
    assert json_cells.iloc[3] == {"mixed": "string"}

    # A plain TEXT/VARCHAR column sitting next to a JSON column stays a string
    assert text_cells.iloc[0] == "text1"
    assert text_cells.iloc[3] == "text4"

    # Round-trip through JSON, as would happen for an API response
    serialized = superset_json.dumps(frame.to_dict(orient="records"))
    round_tripped = superset_json.loads(serialized)
    first_json = round_tripped[0]["json_col"]
    assert first_json["key"] == "value1"
    assert first_json["nested"]["a"] == 1
    assert round_tripped[1]["json_col"]["items"] == [1, 2, 3]


def test_json_formatted_string_in_text_column_stays_string() -> None:
    """
    Strings in a plain TEXT/VARCHAR column that merely look like JSON must come
    through unchanged, still as strings.

    No content-sniffing takes place: only columns the driver returns as real
    nested Python objects (dicts/lists) are kept as objects.

    See: https://github.com/apache/superset/issues/25125
    """
    rows = [
        (1, '{"key": "val"}'),
        (2, "[1, 2, 3]"),
        (3, "not json at all"),
    ]
    description = [
        ("id", 23, None, None, None, None, None),  # INT
        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
    ]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore
    text_cells = result_set.to_pandas_df()["text_col"]

    # The JSON-looking values are still raw strings, never a dict or list.
    json_lookalikes = ['{"key": "val"}', "[1, 2, 3]"]
    for position, raw_text in enumerate(json_lookalikes):
        assert text_cells.iloc[position] == raw_text
        assert isinstance(text_cells.iloc[position], str)

    assert text_cells.iloc[2] == "not json at all"


def test_json_data_with_homogeneous_structure() -> None:
    """
    JSON values that all share one structure are kept as objects as well.
    """
    # Every row carries a dict with the same two keys
    rows = [
        (1, {"name": "Alice", "age": 30}),
        (2, {"name": "Bob", "age": 25}),
        (3, {"name": "Charlie", "age": 35}),
    ]
    description = [
        ("id", 23, None, None, None, None, None),
        ("data", 3802, None, None, None, None, None),
    ]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore
    json_cells = result_set.to_pandas_df()["data"]

    # The cells are still dicts
    assert isinstance(json_cells.iloc[0], dict)
    assert json_cells.iloc[0]["name"] == "Alice"
    assert json_cells.iloc[1]["age"] == 25


def test_array_data_type_preserved() -> None:
    """
    Array values must come through as Python lists, too.
    """
    rows = [
        (1, [1, 2, 3]),
        (2, [4, 5, 6]),
        (3, None),
    ]
    description = [
        ("id", 23, None, None, None, None, None),
        ("arr", 1007, None, None, None, None, None),  # INT ARRAY
    ]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore
    array_cells = result_set.to_pandas_df()["arr"]

    # The cells are still lists, and the null cell is still None
    assert array_cells.iloc[0] == [1, 2, 3]
    assert isinstance(array_cells.iloc[0], list)
    assert array_cells.iloc[2] is None


def test_uuid_column_is_stringified() -> None:
    """
    A UUID column has to come out as readable strings rather than raw bytes.

    From PyArrow 21 on, Python ``uuid.UUID`` values are inferred as the
    canonical ``uuid`` extension type (16-byte binary) instead of raising while
    the array is built. That skips the stringification fallback, so without
    explicit handling the results grid shows garbled bytes / ``[bytes]``.
    ``SupersetResultSet`` must therefore stringify any Arrow extension type.

    Regression test for the pyarrow 20 -> 24 upgrade.
    """
    import uuid

    identifiers = [
        uuid.UUID("f4787a4f-2541-4f8a-9b5e-1e2d3c4b5a6f"),
        uuid.UUID("00000000-0000-0000-0000-000000000000"),
    ]
    rows = [(identifier,) for identifier in identifiers]
    description = [("uuid", "uuid", None, None, None, None, True)]

    result_set = SupersetResultSet(rows, description, BaseEngineSpec)  # type: ignore
    frame = result_set.to_pandas_df()

    expected_strings = [str(identifier) for identifier in identifiers]
    assert frame["uuid"].tolist() == expected_strings
    # every cell is a readable UUID string (or None), never raw bytes
    for cell in frame["uuid"].tolist():
        assert cell is None or isinstance(cell, str)


def test_stringify_extension_columns() -> None:
    """
    ``stringify_extension_columns`` must turn Arrow extension columns (such as
    the canonical ``uuid`` type) into readable strings, and must leave plain
    binary and other columns as they are. It is the helper shared by
    ``SupersetResultSet`` and the semantic-layers mapper.
    """
    import uuid

    import pyarrow as pa

    sample_uuid = uuid.UUID("f4787a4f-2541-4f8a-9b5e-1e2d3c4b5a6f")
    # 16-byte binary storage wrapped in the uuid extension type; second row is null
    uuid_storage = pa.array([sample_uuid.bytes, None], pa.binary(16))
    source_columns = {
        "id": pa.ExtensionArray.from_storage(pa.uuid(), uuid_storage),
        "blob": pa.array([b"\x89PNG", None], pa.binary()),
        "n": pa.array([1, 2]),
    }

    converted = stringify_extension_columns(pa.table(source_columns))
    converted_schema = converted.schema

    # the uuid extension column is now a readable string column
    assert pa.types.is_string(converted_schema.field("id").type)
    assert converted.column("id").to_pylist() == [str(sample_uuid), None]
    # the plain binary BLOB and the integer column keep their types
    assert pa.types.is_binary(converted_schema.field("blob").type)
    assert pa.types.is_integer(converted_schema.field("n").type)