Replace pandas.DataFrame with PyArrow.Table for nullable int typing (#8733)

* Use PyArrow Table for query result serialization
* Cleanup dev comments
* Additional cleanup
* WIP: tests
* Remove explicit dtype logic from db_engine_specs
* Remove obsolete column property
* SupersetTable column types
* Port SupersetDataFrame methods to SupersetTable
* Add test for nullable boolean columns
* Support datetime values with timezone offsets
* Black formatting
* Pylint
* More linting/formatting
* Resolve issue with timezones not appearing in results
* Types
* Enable running of tests in tests/db_engine_specs
* Resolve application context errors
* Refactor and add tests for pyodbc.Row conversion
* Appease isort, regardless of isort:skip
* Re-enable RESULTS_BACKEND_USE_MSGPACK default based on benchmarks
* Dataframe typing and nits
* Renames to reduce ambiguity
2026-04-19 16:14:52 +00:00 · 2020-01-03 16:55:39 +00:00
parent 4f8bf2b04d
commit 6537d5ed8c
16 changed files with 438 additions and 513 deletions
--- a/tests/dataframe_test.py
+++ b/tests/dataframe_test.py
@@ -17,143 +17,35 @@
 import numpy as np
 import pandas as pd

-from superset.dataframe import dedup, SupersetDataFrame
+from superset.dataframe import df_to_records
 from superset.db_engine_specs import BaseEngineSpec
-from superset.db_engine_specs.presto import PrestoEngineSpec
+from superset.result_set import SupersetResultSet

 from .base_tests import SupersetTestCase


 class SupersetDataFrameTestCase(SupersetTestCase):
-    def test_dedup(self):
-        self.assertEqual(dedup(["foo", "bar"]), ["foo", "bar"])
-        self.assertEqual(
-            dedup(["foo", "bar", "foo", "bar", "Foo"]),
-            ["foo", "bar", "foo__1", "bar__1", "Foo"],
-        )
-        self.assertEqual(
-            dedup(["foo", "bar", "bar", "bar", "Bar"]),
-            ["foo", "bar", "bar__1", "bar__2", "Bar"],
-        )
-        self.assertEqual(
-            dedup(["foo", "bar", "bar", "bar", "Bar"], case_sensitive=False),
-            ["foo", "bar", "bar__1", "bar__2", "Bar__3"],
-        )
-
-    def test_get_columns_basic(self):
+    def test_df_to_records(self):
        data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
        cursor_descr = (("a", "string"), ("b", "string"), ("c", "string"))
-        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
+        results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+        df = results.to_pandas_df()
+
        self.assertEqual(
-            cdf.columns,
+            df_to_records(df),
+            [{"a": "a1", "b": "b1", "c": "c1"}, {"a": "a2", "b": "b2", "c": "c2"}],
+        )
+
+    def test_js_max_int(self):
+        data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
+        cursor_descr = (("a", "int"), ("b", "int"), ("c", "string"))
+        results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+        df = results.to_pandas_df()
+
+        self.assertEqual(
+            df_to_records(df),
            [
-                {"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
-                {"is_date": False, "type": "STRING", "name": "b", "is_dim": True},
-                {"is_date": False, "type": "STRING", "name": "c", "is_dim": True},
+                {"a": 1, "b": "1239162456494753670", "c": "c1"},
+                {"a": 2, "b": 100, "c": "c2"},
            ],
        )
-
-    def test_get_columns_with_int(self):
-        data = [("a1", 1), ("a2", 2)]
-        cursor_descr = (("a", "string"), ("b", "int"))
-        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
-        self.assertEqual(
-            cdf.columns,
-            [
-                {"is_date": False, "type": "STRING", "name": "a", "is_dim": True},
-                {
-                    "is_date": False,
-                    "type": "INT",
-                    "name": "b",
-                    "is_dim": False,
-                    "agg": "sum",
-                },
-            ],
-        )
-
-    def test_get_columns_type_inference(self):
-        data = [(1.2, 1), (3.14, 2)]
-        cursor_descr = (("a", None), ("b", None))
-        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
-        self.assertEqual(
-            cdf.columns,
-            [
-                {
-                    "is_date": False,
-                    "type": "FLOAT",
-                    "name": "a",
-                    "is_dim": False,
-                    "agg": "sum",
-                },
-                {
-                    "is_date": False,
-                    "type": "INT",
-                    "name": "b",
-                    "is_dim": False,
-                    "agg": "sum",
-                },
-            ],
-        )
-
-    def test_is_date(self):
-        f = SupersetDataFrame.is_date
-        self.assertEqual(f(np.dtype("M"), ""), True)
-        self.assertEqual(f(np.dtype("f"), "DATETIME"), True)
-        self.assertEqual(f(np.dtype("i"), "TIMESTAMP"), True)
-        self.assertEqual(f(None, "DATETIME"), True)
-        self.assertEqual(f(None, "TIMESTAMP"), True)
-
-        self.assertEqual(f(None, ""), False)
-        self.assertEqual(f(np.dtype(np.int32), ""), False)
-
-    def test_dedup_with_data(self):
-        data = [("a", 1), ("a", 2)]
-        cursor_descr = (("a", "string"), ("a", "string"))
-        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
-        self.assertListEqual(cdf.column_names, ["a", "a__1"])
-
-    def test_int64_with_missing_data(self):
-        data = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
-        cursor_descr = [("user_id", "bigint", None, None, None, None, True)]
-
-        # the base engine spec does not provide a dtype based on the cursor
-        # description, so the column is inferred as float64 because of the
-        # missing data
-        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
-        np.testing.assert_array_equal(
-            cdf.raw_df.values.tolist(),
-            [[np.nan], [1.2391624564947538e18], [np.nan], [np.nan], [np.nan], [np.nan]],
-        )
-
-        # currently only Presto provides a dtype based on the cursor description
-        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
-        np.testing.assert_array_equal(
-            cdf.raw_df.values.tolist(),
-            [[np.nan], [1239162456494753670], [np.nan], [np.nan], [np.nan], [np.nan]],
-        )
-
-    def test_pandas_datetime64(self):
-        data = [(None,)]
-        cursor_descr = [("ds", "timestamp", None, None, None, None, True)]
-        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
-        self.assertEqual(cdf.raw_df.dtypes[0], np.dtype("<M8[ns]"))
-
-    def test_no_type_coercion(self):
-        data = [("a", 1), ("b", 2)]
-        cursor_descr = [
-            ("one", "varchar", None, None, None, None, True),
-            ("two", "integer", None, None, None, None, True),
-        ]
-        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
-        self.assertEqual(cdf.raw_df.dtypes[0], np.dtype("O"))
-        self.assertEqual(cdf.raw_df.dtypes[1], pd.Int64Dtype())
-
-    def test_empty_data(self):
-        data = []
-        cursor_descr = [
-            ("one", "varchar", None, None, None, None, True),
-            ("two", "integer", None, None, None, None, True),
-        ]
-        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
-        self.assertEqual(cdf.raw_df.dtypes[0], np.dtype("O"))
-        self.assertEqual(cdf.raw_df.dtypes[1], pd.Int64Dtype())