mirror of
https://github.com/apache/superset.git
synced 2026-04-19 08:04:53 +00:00
[SQL Lab] Async query results serialization with MessagePack and PyArrow (#8069)
* Add support for msgpack results_backend serialization * Serialize DataFrame with PyArrow rather than JSON * Adjust dependencies, de-lint * Add tests for (de)serialization methods * Add MessagePack config info to Installation docs * Enable msgpack/arrow serialization by default * [Fix] Prevent msgpack serialization on synchronous queries * Add type annotations
This commit is contained in:
committed by
Maxime Beauchemin
parent
56566c2645
commit
7595d9e5fd
@@ -15,12 +15,16 @@
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Unit tests for Superset Celery worker"""
|
||||
import datetime
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
import unittest
|
||||
import unittest.mock as mock
|
||||
|
||||
from superset import app, db
|
||||
from superset import app, db, sql_lab
|
||||
from superset.dataframe import SupersetDataFrame
|
||||
from superset.db_engine_specs.base import BaseEngineSpec
|
||||
from superset.models.helpers import QueryStatus
|
||||
from superset.models.sql_lab import Query
|
||||
from superset.sql_parse import ParsedQuery
|
||||
@@ -242,6 +246,114 @@ class CeleryTestCase(SupersetTestCase):
|
||||
self.assertEqual(True, query.select_as_cta)
|
||||
self.assertEqual(True, query.select_as_cta_used)
|
||||
|
||||
def test_default_data_serialization(self):
    """Default (JSON) serialization expands data and returns a plain list."""
    db_engine_spec = BaseEngineSpec()
    rows = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    cdf = SupersetDataFrame(rows, description, db_engine_spec)

    # Wrap expand_data so we can verify it runs on the legacy (JSON) path.
    with mock.patch.object(
        db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
    ) as expand_data:
        data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
            cdf, db_engine_spec, False
        )
        expand_data.assert_called_once()

    self.assertIsInstance(data, list)
|
||||
|
||||
def test_new_data_serialization(self):
    """Arrow/msgpack serialization defers expansion and returns raw bytes."""
    db_engine_spec = BaseEngineSpec()
    rows = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    cdf = SupersetDataFrame(rows, description, db_engine_spec)

    # On the new serialization path expand_data must NOT run at serialize
    # time — expansion is deferred until the payload is deserialized.
    with mock.patch.object(
        db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
    ) as expand_data:
        data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
            cdf, db_engine_spec, True
        )
        expand_data.assert_not_called()

    self.assertIsInstance(data, bytes)
|
||||
|
||||
def test_default_payload_serialization(self):
    """The default payload serializer yields a JSON string (not bytes)."""
    use_new_deserialization = False
    db_engine_spec = BaseEngineSpec()
    rows = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    cdf = SupersetDataFrame(rows, description, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
        cdf, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }

    serialized = sql_lab._serialize_payload(payload, use_new_deserialization)
    self.assertIsInstance(serialized, str)
|
||||
|
||||
def test_msgpack_payload_serialization(self):
    """The msgpack payload serializer yields bytes (not a JSON string)."""
    use_new_deserialization = True
    db_engine_spec = BaseEngineSpec()
    rows = [("a", 4, 4.0, datetime.datetime(2019, 8, 18, 16, 39, 16, 660000))]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    cdf = SupersetDataFrame(rows, description, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
        cdf, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }

    serialized = sql_lab._serialize_payload(payload, use_new_deserialization)
    self.assertIsInstance(serialized, bytes)
|
||||
|
||||
@staticmethod
|
||||
def de_unicode_dict(d):
|
||||
def str_if_basestring(o):
|
||||
|
||||
@@ -39,6 +39,7 @@ from superset.db_engine_specs.mssql import MssqlEngineSpec
|
||||
from superset.models import core as models
|
||||
from superset.models.sql_lab import Query
|
||||
from superset.utils import core as utils
|
||||
from superset.views import core as views
|
||||
from superset.views.database.views import DatabaseView
|
||||
from .base_tests import SupersetTestCase
|
||||
from .fixtures.pyodbcRow import Row
|
||||
@@ -776,6 +777,98 @@ class CoreTests(SupersetTestCase):
|
||||
resp = self.get_resp(f"/superset/select_star/{examples_db.id}/birth_names")
|
||||
self.assertIn("gender", resp)
|
||||
|
||||
def test_results_default_deserialization(self):
    """A JSON-serialized payload round-trips without touching the query object."""
    use_new_deserialization = False
    db_engine_spec = BaseEngineSpec()
    rows = [("a", 4, 4.0, "2019-08-18T16:39:16.660000")]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    cdf = dataframe.SupersetDataFrame(rows, description, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": utils.QueryStatus.PENDING,
    }
    serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
        cdf, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": utils.QueryStatus.SUCCESS,
        "state": utils.QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }

    serialized_payload = sql_lab._serialize_payload(
        payload, use_new_deserialization
    )
    self.assertIsInstance(serialized_payload, str)

    # On the legacy path deserialization is pure JSON decoding: the query
    # object must never be consulted.
    query_mock = mock.Mock()
    deserialized_payload = views._deserialize_results_payload(
        serialized_payload, query_mock, use_new_deserialization
    )

    self.assertDictEqual(deserialized_payload, payload)
    query_mock.assert_not_called()
|
||||
|
||||
def test_results_msgpack_deserialization(self):
    """A msgpack payload round-trips and expands data via the query's engine spec."""
    use_new_deserialization = True
    db_engine_spec = BaseEngineSpec()
    rows = [("a", 4, 4.0, "2019-08-18T16:39:16.660000")]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    cdf = dataframe.SupersetDataFrame(rows, description, db_engine_spec)
    query = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": utils.QueryStatus.PENDING,
    }
    serialized_data, selected_columns, all_columns, expanded_columns = sql_lab._serialize_and_expand_data(
        cdf, db_engine_spec, use_new_deserialization
    )
    payload = {
        "query_id": 1,
        "status": utils.QueryStatus.SUCCESS,
        "state": utils.QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query,
    }

    serialized_payload = sql_lab._serialize_payload(
        payload, use_new_deserialization
    )
    self.assertIsInstance(serialized_payload, bytes)

    # Deferred expansion happens at deserialize time, through the engine
    # spec hanging off the query object — wrap it so we can assert the call.
    with mock.patch.object(
        db_engine_spec, "expand_data", wraps=db_engine_spec.expand_data
    ) as expand_data:
        query_mock = mock.Mock()
        query_mock.database.db_engine_spec.expand_data = expand_data

        deserialized_payload = views._deserialize_results_payload(
            serialized_payload, query_mock, use_new_deserialization
        )
        # The round-tripped "data" is the formatted frame, not the Arrow bytes.
        payload["data"] = dataframe.SupersetDataFrame.format_data(cdf.raw_df)

        self.assertDictEqual(deserialized_payload, payload)
        expand_data.assert_called_once()
|
||||
|
||||
|
||||
# Allow running this test module directly: `python <module>.py`.
if __name__ == "__main__":
    unittest.main()
|
||||
|
||||
@@ -47,7 +47,7 @@ from superset.utils.core import (
|
||||
setup_cache,
|
||||
validate_json,
|
||||
zlib_compress,
|
||||
zlib_decompress_to_string,
|
||||
zlib_decompress,
|
||||
)
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ class UtilsTestCase(unittest.TestCase):
|
||||
def test_zlib_compression(self):
    """A zlib compress/decompress round-trip reproduces the original string.

    Fixes: the span contained both the pre- and post-diff decompress calls
    (residue of the rename to ``zlib_decompress``); only the new call is
    kept. Also replaces the deprecated ``assertEquals`` alias (removed in
    Python 3.12) with ``assertEqual``.
    """
    json_str = '{"test": 1}'
    blob = zlib_compress(json_str)
    got_str = zlib_decompress(blob)
    self.assertEqual(json_str, got_str)
|
||||
|
||||
@patch("superset.utils.core.to_adhoc", mock_to_adhoc)
|
||||
|
||||
Reference in New Issue
Block a user