[SQL Lab] Async query results serialization with MessagePack and PyArrow (#8069)

* Add support for msgpack results_backend serialization

* Serialize DataFrame with PyArrow rather than JSON

* Adjust dependencies, de-lint

* Add tests for (de)serialization methods

* Add MessagePack config info to Installation docs

* Enable msgpack/arrow serialization by default

* [Fix] Prevent msgpack serialization on synchronous queries

* Add type annotations
This commit is contained in:
Rob DiCiuccio
2019-08-27 14:23:40 -07:00
committed by Maxime Beauchemin
parent 56566c2645
commit 7595d9e5fd
13 changed files with 362 additions and 28 deletions

View File

@@ -15,12 +15,16 @@
# specific language governing permissions and limitations
# under the License.
"""Unit tests for Superset Celery worker"""
import datetime
import json
import subprocess
import time
import unittest
import unittest.mock as mock
from superset import app, db
from superset import app, db, sql_lab
from superset.dataframe import SupersetDataFrame
from superset.db_engine_specs.base import BaseEngineSpec
from superset.models.helpers import QueryStatus
from superset.models.sql_lab import Query
from superset.sql_parse import ParsedQuery
@@ -242,6 +246,114 @@ class CeleryTestCase(SupersetTestCase):
self.assertEqual(True, query.select_as_cta)
self.assertEqual(True, query.select_as_cta_used)
def test_default_data_serialization(self):
    """Exercise ``sql_lab._serialize_and_expand_data`` with the flag off.

    Passing ``False`` as the third argument is expected to call the engine
    spec's ``expand_data`` exactly once and to hand the data back as a
    ``list``.
    """
    timestamp = datetime.datetime(2019, 8, 18, 16, 39, 16, 660000)
    rows = [("a", 4, 4.0, timestamp)]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    engine_spec = BaseEngineSpec()
    frame = SupersetDataFrame(rows, description, engine_spec)

    # ``wraps`` keeps the real ``expand_data`` running while recording calls.
    spy = mock.patch.object(
        engine_spec, "expand_data", wraps=engine_spec.expand_data
    )
    with spy as expand_data:
        (
            serialized,
            _selected_columns,
            _all_columns,
            _expanded_columns,
        ) = sql_lab._serialize_and_expand_data(frame, engine_spec, False)
        expand_data.assert_called_once()

    self.assertIsInstance(serialized, list)
def test_new_data_serialization(self):
    """Exercise ``sql_lab._serialize_and_expand_data`` with the flag on.

    Passing ``True`` as the third argument is expected to skip the engine
    spec's ``expand_data`` entirely and to hand the data back as ``bytes``.
    """
    timestamp = datetime.datetime(2019, 8, 18, 16, 39, 16, 660000)
    rows = [("a", 4, 4.0, timestamp)]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    engine_spec = BaseEngineSpec()
    frame = SupersetDataFrame(rows, description, engine_spec)

    # ``wraps`` keeps the real ``expand_data`` available while recording calls.
    spy = mock.patch.object(
        engine_spec, "expand_data", wraps=engine_spec.expand_data
    )
    with spy as expand_data:
        (
            serialized,
            _selected_columns,
            _all_columns,
            _expanded_columns,
        ) = sql_lab._serialize_and_expand_data(frame, engine_spec, True)
        expand_data.assert_not_called()

    self.assertIsInstance(serialized, bytes)
def test_default_payload_serialization(self):
    """Serialize a results payload with the flag off and expect a ``str``.

    The payload is assembled from the output of
    ``sql_lab._serialize_and_expand_data`` and then passed through
    ``sql_lab._serialize_payload`` with the same ``False`` flag.
    """
    new_format = False
    timestamp = datetime.datetime(2019, 8, 18, 16, 39, 16, 660000)
    rows = [("a", 4, 4.0, timestamp)]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    engine_spec = BaseEngineSpec()
    frame = SupersetDataFrame(rows, description, engine_spec)

    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(frame, engine_spec, new_format)

    query_info = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query_info,
    }

    result = sql_lab._serialize_payload(payload, new_format)
    self.assertIsInstance(result, str)
def test_msgpack_payload_serialization(self):
    """Serialize a results payload with the flag on and expect ``bytes``.

    The payload is assembled from the output of
    ``sql_lab._serialize_and_expand_data`` and then passed through
    ``sql_lab._serialize_payload`` with the same ``True`` flag.
    """
    new_format = True
    timestamp = datetime.datetime(2019, 8, 18, 16, 39, 16, 660000)
    rows = [("a", 4, 4.0, timestamp)]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    engine_spec = BaseEngineSpec()
    frame = SupersetDataFrame(rows, description, engine_spec)

    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(frame, engine_spec, new_format)

    query_info = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": QueryStatus.PENDING,
    }
    payload = {
        "query_id": 1,
        "status": QueryStatus.SUCCESS,
        "state": QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query_info,
    }

    result = sql_lab._serialize_payload(payload, new_format)
    self.assertIsInstance(result, bytes)
@staticmethod
def de_unicode_dict(d):
def str_if_basestring(o):

View File

@@ -39,6 +39,7 @@ from superset.db_engine_specs.mssql import MssqlEngineSpec
from superset.models import core as models
from superset.models.sql_lab import Query
from superset.utils import core as utils
from superset.views import core as views
from superset.views.database.views import DatabaseView
from .base_tests import SupersetTestCase
from .fixtures.pyodbcRow import Row
@@ -776,6 +777,98 @@ class CoreTests(SupersetTestCase):
resp = self.get_resp(f"/superset/select_star/{examples_db.id}/birth_names")
self.assertIn("gender", resp)
def test_results_default_deserialization(self):
    """Round-trip a results payload with the flag off.

    The payload is serialized by ``sql_lab._serialize_payload`` (expected to
    yield a ``str``) and read back by ``views._deserialize_results_payload``;
    the result must equal the original payload and the query mock must not
    have been called.
    """
    new_format = False
    rows = [("a", 4, 4.0, "2019-08-18T16:39:16.660000")]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    engine_spec = BaseEngineSpec()
    frame = dataframe.SupersetDataFrame(rows, description, engine_spec)

    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(frame, engine_spec, new_format)

    query_info = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": utils.QueryStatus.PENDING,
    }
    payload = {
        "query_id": 1,
        "status": utils.QueryStatus.SUCCESS,
        "state": utils.QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query_info,
    }

    wire_payload = sql_lab._serialize_payload(payload, new_format)
    self.assertIsInstance(wire_payload, str)

    query_mock = mock.Mock()
    round_tripped = views._deserialize_results_payload(
        wire_payload, query_mock, new_format
    )

    self.assertDictEqual(round_tripped, payload)
    query_mock.assert_not_called()
def test_results_msgpack_deserialization(self):
    """Round-trip a results payload with the flag on.

    The payload is serialized by ``sql_lab._serialize_payload`` (expected to
    yield ``bytes``) and read back by ``views._deserialize_results_payload``.
    The expected ``data`` entry is replaced with
    ``SupersetDataFrame.format_data(cdf.raw_df)`` before comparing, and the
    engine spec's ``expand_data`` must have been called exactly once.
    """
    new_format = True
    rows = [("a", 4, 4.0, "2019-08-18T16:39:16.660000")]
    description = (
        ("a", "string"),
        ("b", "int"),
        ("c", "float"),
        ("d", "datetime"),
    )
    engine_spec = BaseEngineSpec()
    frame = dataframe.SupersetDataFrame(rows, description, engine_spec)

    (
        serialized_data,
        selected_columns,
        all_columns,
        expanded_columns,
    ) = sql_lab._serialize_and_expand_data(frame, engine_spec, new_format)

    query_info = {
        "database_id": 1,
        "sql": "SELECT * FROM birth_names LIMIT 100",
        "status": utils.QueryStatus.PENDING,
    }
    payload = {
        "query_id": 1,
        "status": utils.QueryStatus.SUCCESS,
        "state": utils.QueryStatus.SUCCESS,
        "data": serialized_data,
        "columns": all_columns,
        "selected_columns": selected_columns,
        "expanded_columns": expanded_columns,
        "query": query_info,
    }

    wire_payload = sql_lab._serialize_payload(payload, new_format)
    self.assertIsInstance(wire_payload, bytes)

    # ``wraps`` keeps the real ``expand_data`` running while recording calls.
    spy = mock.patch.object(
        engine_spec, "expand_data", wraps=engine_spec.expand_data
    )
    with spy as expand_data:
        query_mock = mock.Mock()
        # Route the mock query's engine spec through the recording wrapper.
        query_mock.database.db_engine_spec.expand_data = expand_data

        round_tripped = views._deserialize_results_payload(
            wire_payload, query_mock, new_format
        )
        payload["data"] = dataframe.SupersetDataFrame.format_data(frame.raw_df)

        self.assertDictEqual(round_tripped, payload)
        expand_data.assert_called_once()
# Run the unittest runner when this module is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@@ -47,7 +47,7 @@ from superset.utils.core import (
setup_cache,
validate_json,
zlib_compress,
zlib_decompress_to_string,
zlib_decompress,
)
@@ -140,7 +140,7 @@ class UtilsTestCase(unittest.TestCase):
def test_zlib_compression(self):
    """Round-trip a JSON string through ``zlib_compress`` and back."""
    json_str = '{"test": 1}'
    blob = zlib_compress(json_str)
    # NOTE(review): this result is overwritten on the next line; the two
    # calls look like the before/after sides of a rename -- confirm which
    # helper still exists before dropping either one.
    got_str = zlib_decompress_to_string(blob)
    got_str = zlib_decompress(blob)
    # ``assertEqual`` replaces the deprecated ``assertEquals`` alias.
    self.assertEqual(json_str, got_str)
@patch("superset.utils.core.to_adhoc", mock_to_adhoc)