superset2/tests/unit_tests/utils/csv_tests.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import pandas as pd
import pyarrow as pa
import pytest  # noqa: F401
from pandas.api.types import is_datetime64_any_dtype

from superset.utils import csv, json
from superset.utils.core import GenericDataType
from superset.utils.csv import (
    df_to_escaped_csv,
    get_chart_dataframe,
)


def test_escape_value():
    """
    Check ``csv.escape_value`` on a table of raw inputs and expected outputs.

    Plain text and the numeric-looking ``"-10"`` are expected to come back
    unchanged, whereas the remaining inputs are expected to come back
    prefixed with a single quote (and with any ``|`` backslash-escaped).
    """
    # (raw input, expected escaped output), checked in this order.
    cases = [
        ("value", "value"),
        ("-10", "-10"),
        ("@value", "'@value"),
        ("+value", "'+value"),
        ("-value", "'-value"),
        ("=value", "'=value"),
        ("|value", r"'\|value"),
        ("%value", "'%value"),
        ("=cmd|' /C calc'!A0", r"'=cmd\|' /C calc'!A0"),
        ('""=10+2', '\'""=10+2'),
        (" =10+2", "' =10+2"),
    ]

    for raw, expected in cases:
        result = csv.escape_value(raw)
        assert result == expected


def fake_get_chart_csv_data_none(chart_url, auth_cookies=None):
    """Stub for ``csv.get_chart_csv_data`` that ignores its arguments and yields None."""
    return None


def fake_get_chart_csv_data_empty(chart_url, auth_cookies=None):
    """
    Stub for ``csv.get_chart_csv_data`` returning a UTF-8 encoded JSON payload
    whose single result entry has no data, column types, column names or
    index names, so the DataFrame built from it is empty.
    """
    empty_entry = {"data": {}, "coltypes": [], "colnames": [], "indexnames": []}
    payload = {"result": [empty_entry]}
    return json.dumps(payload).encode("utf-8")


def fake_get_chart_csv_data_valid(chart_url, auth_cookies=None):
    """
    Stub for ``csv.get_chart_csv_data`` returning a UTF-8 encoded JSON payload
    with non-temporal data (one numeric and one string column) and two
    index names, so that the index names are used when building the frame.
    """
    entry = {
        "data": {"col1": [1, 2], "col2": ["a", "b"]},
        "coltypes": [GenericDataType.NUMERIC, GenericDataType.STRING],
        "colnames": ["col1", "col2"],
        # Two index names, one per data row.
        "indexnames": ["idx1", "idx2"],
    }
    return json.dumps({"result": [entry]}).encode("utf-8")


def fake_get_chart_csv_data_temporal(chart_url, auth_cookies=None):
    """
    Stub for ``csv.get_chart_csv_data`` returning a UTF-8 encoded JSON payload
    that contains a temporal column (integer timestamps) alongside a numeric
    column, plus two index names.
    """
    entry = {
        "data": {"date": [1609459200000, 1612137600000], "val": [10, 20]},
        "coltypes": [GenericDataType.TEMPORAL, GenericDataType.NUMERIC],
        "colnames": ["date", "val"],
        # Two index names, one per data row.
        "indexnames": [0, 1],
    }
    return json.dumps({"result": [entry]}).encode("utf-8")


def fake_get_chart_csv_data_hierarchical(chart_url, auth_cookies=None):
    """
    Stub for ``csv.get_chart_csv_data`` returning a UTF-8 encoded JSON payload
    whose column name is given as a list (a hierarchical column) and whose
    index names are given as one single-element list per row.
    """
    entry = {
        "data": {"a": [1, 2]},
        "coltypes": [GenericDataType.NUMERIC],
        "colnames": [["level1", "a"]],
        # One index entry for each of the two rows.
        "indexnames": [["idx"], ["idx"]],
    }
    return json.dumps({"result": [entry]}).encode("utf-8")


def fake_get_chart_csv_data_with_na_values(chart_url, auth_cookies=None):
    """
    Stub for ``csv.get_chart_csv_data`` returning a UTF-8 encoded JSON payload
    in which one ``last_name`` cell is the literal string "NA" -- a token
    that could be mistaken for a missing value when the data is loaded.
    """
    entry = {
        "data": {"first_name": ["Jeff", "Alice"], "last_name": ["Smith", "NA"]},
        "coltypes": [GenericDataType.STRING, GenericDataType.STRING],
        "colnames": ["first_name", "last_name"],
        "indexnames": ["idx1", "idx2"],
    }
    return json.dumps({"result": [entry]}).encode("utf-8")


def test_df_to_escaped_csv():
    """
    Check ``df_to_escaped_csv`` on a single-column frame of string values and
    on a frame holding an integer column with a null.
    """
    raw_values = [
        "a",
        "col_a",
        "=func()",
        "-10",
        "=cmd|' /C calc'!A0",
        '""=b',
        " =a",
        "\x00",
    ]
    frame = pd.DataFrame({"value": raw_values})

    csv_text = df_to_escaped_csv(
        frame,
        encoding="utf8",
        index=False,
        header=False,
    )

    # One list of comma-separated cells per output line.
    parsed_rows = []
    for line in csv_text.strip().split("\n"):
        parsed_rows.append(line.split(","))

    expected_rows = [
        ["a"],
        ["col_a"],
        ["'=func()"],
        ["-10"],
        [r"'=cmd\\|' /C calc'!A0"],
        ['"\'""""=b"'],
        ["' =a"],
        ["\x00"],
    ]
    assert parsed_rows == expected_rows

    # Integer column containing a null, kept as objects when leaving Arrow.
    nullable_series = pa.array([1, None]).to_pandas(integer_object_nulls=True)
    nullable_frame = nullable_series.to_frame()
    nullable_csv = df_to_escaped_csv(nullable_frame, encoding="utf8", index=False)
    assert nullable_csv == '0\n1\n""\n'


def test_get_chart_dataframe_returns_none_when_no_content(
    monkeypatch: pytest.MonkeyPatch,
):
    """``get_chart_dataframe`` yields None when the CSV data fetch yields None."""
    # Swap the fetcher on the csv module for the stub that returns None.
    monkeypatch.setattr(csv, "get_chart_csv_data", fake_get_chart_csv_data_none)
    assert get_chart_dataframe("http://dummy-url") is None


def test_get_chart_dataframe_returns_none_for_empty_data(
    monkeypatch: pytest.MonkeyPatch,
):
    """``get_chart_dataframe`` yields None when the fetched payload has empty data."""
    # Swap the fetcher on the csv module for the stub with an empty result.
    monkeypatch.setattr(csv, "get_chart_csv_data", fake_get_chart_csv_data_empty)
    # Empty data is expected to produce None rather than an empty frame.
    assert get_chart_dataframe("http://dummy-url") is None


def test_get_chart_dataframe_valid_non_temporal(monkeypatch: pytest.MonkeyPatch):
    """
    Check the frame built from the non-temporal stub payload: column index,
    row index, per-column values and the markdown rendering.
    """
    monkeypatch.setattr(csv, "get_chart_csv_data", fake_get_chart_csv_data_valid)
    df = get_chart_dataframe("http://dummy-url")
    assert df is not None

    # Columns and rows are both expected as single-level MultiIndexes.
    pd.testing.assert_index_equal(
        df.columns, pd.MultiIndex.from_tuples([("col1",), ("col2",)])
    )
    pd.testing.assert_index_equal(
        df.index, pd.MultiIndex.from_tuples([("idx1",), ("idx2",)])
    )

    # Column key -> values expected in that column.
    expected_values = {("col1",): [1, 2], ("col2",): ["a", "b"]}
    for column, values in expected_values.items():
        pd.testing.assert_series_equal(
            df[column], pd.Series(values, name=column, index=df.index)
        )

    expected_markdown_str = """
|           |   ('col1',) | ('col2',)   |
|:----------|------------:|:------------|
| ('idx1',) |           1 | a           |
| ('idx2',) |           2 | b           |
"""
    assert df.to_markdown().strip() == expected_markdown_str.strip()


def test_get_chart_dataframe_valid_temporal(monkeypatch: pytest.MonkeyPatch):
    """
    Check the frame built from the temporal stub payload: the ``date`` column
    must have a datetime dtype matching the millisecond timestamps, and the
    indexes, numeric column and markdown rendering must match.
    """
    monkeypatch.setattr(csv, "get_chart_csv_data", fake_get_chart_csv_data_temporal)
    df = get_chart_dataframe("http://dummy-url")
    assert df is not None

    pd.testing.assert_index_equal(
        df.columns, pd.MultiIndex.from_tuples([("date",), ("val",)])
    )
    pd.testing.assert_index_equal(df.index, pd.MultiIndex.from_tuples([(0,), (1,)]))

    date_column = df[("date",)]
    assert is_datetime64_any_dtype(date_column)

    # The stub's timestamps, interpreted as milliseconds.
    epoch_millis = [1609459200000, 1612137600000]
    expected_dates = pd.to_datetime(epoch_millis, unit="ms").astype("datetime64[ms]")
    expected_date_series = pd.Series(expected_dates, name=("date",))
    # Drop the MultiIndex so the comparison is against a default-indexed series.
    pd.testing.assert_series_equal(
        date_column.reset_index(drop=True), expected_date_series, check_names=False
    )

    expected_val_series = pd.Series([10, 20], name=("val",), index=df.index)
    pd.testing.assert_series_equal(df[("val",)], expected_val_series)

    expected_markdown_str = """
|      | ('date',)           |   ('val',) |
|:-----|:--------------------|-----------:|
| (0,) | 2021-01-01 00:00:00 |         10 |
| (1,) | 2021-02-01 00:00:00 |         20 |
"""
    assert df.to_markdown().strip() == expected_markdown_str.strip()


def test_get_chart_dataframe_with_hierarchical_columns(monkeypatch: pytest.MonkeyPatch):
    """
    Check the frame built from the hierarchical stub payload: a two-level
    column key, a repeated single-level row key, the column values and the
    markdown rendering.
    """
    monkeypatch.setattr(csv, "get_chart_csv_data", fake_get_chart_csv_data_hierarchical)
    df = get_chart_dataframe("http://dummy-url")
    assert df is not None

    column_key = ("level1", "a")
    pd.testing.assert_index_equal(df.columns, pd.MultiIndex.from_tuples([column_key]))

    # Every row carries the same ("idx",) label.
    row_keys = [("idx",)] * len(df)
    pd.testing.assert_index_equal(df.index, pd.MultiIndex.from_tuples(row_keys))

    expected_series = pd.Series([1, 2], name=column_key, index=df.index)
    pd.testing.assert_series_equal(df[column_key], expected_series)

    expected_markdown_str = """
|          |   ('level1', 'a') |
|:---------|------------------:|
| ('idx',) |                 1 |
| ('idx',) |                 2 |
"""
    assert df.to_markdown().strip() == expected_markdown_str.strip()


def test_get_chart_dataframe_preserves_na_string_values(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Verify that get_chart_dataframe currently keeps rows whose cells hold the
    literal string "NA", and keeps that string as-is.
    This pins down the existing behavior ahead of any custom NA handling.
    """
    monkeypatch.setattr(
        csv, "get_chart_csv_data", fake_get_chart_csv_data_with_na_values
    )
    df = get_chart_dataframe("http://dummy-url")
    assert df is not None

    # Shape of the frame: column keys and row keys.
    pd.testing.assert_index_equal(
        df.columns, pd.MultiIndex.from_tuples([("first_name",), ("last_name",)])
    )
    pd.testing.assert_index_equal(
        df.index, pd.MultiIndex.from_tuples([("idx1",), ("idx2",)])
    )

    # Both rows must be present.
    assert len(df) == 2

    # Column contents, including the "NA" string (not converted to NaN).
    expected_values = {
        ("first_name",): ["Jeff", "Alice"],
        ("last_name",): ["Smith", "NA"],
    }
    for column, values in expected_values.items():
        pd.testing.assert_series_equal(
            df[column],
            pd.Series(values, name=column, index=df.index),
        )

    # Direct check on the raw values of the last_name column.
    raw_last_names = df[("last_name",)].values
    assert raw_last_names[0] == "Smith"
    assert raw_last_names[1] == "NA"