Improve database type inference (#4724)

* Improve database type inference. Python's DBAPI isn't super clear or homogeneous on the cursor.description specification, and this PR attempts to improve the inference of the datatypes returned in the cursor. This work started around Presto's TIMESTAMP type being mishandled as a string, because the database driver (pyhive) returns it as a string. The work here fixes this bug and does a better job at inferring MySQL and Presto types. It also creates a new method in db_engine_specs, allowing other database engines to implement it and become more precise on type inference as needed. * Fixing tests * Addressing comments * Using infer_objects * Removing faulty line * Addressing PrestoSpec redundant method comment * Fix rebase issue * Fix tests
2026-04-20 00:24:38 +00:00 · 2018-06-27 21:35:12 -07:00
parent 04fc1d1089
commit 777d876a52
8 changed files with 224 additions and 117 deletions
--- a/tests/celery_tests.py
+++ b/tests/celery_tests.py
@@ -14,7 +14,7 @@ import unittest
 import pandas as pd
 from past.builtins import basestring

-from superset import app, cli, dataframe, db, security_manager
+from superset import app, cli, db, security_manager
 from superset.models.helpers import QueryStatus
 from superset.models.sql_lab import Query
 from superset.sql_parse import SupersetQuery
@@ -245,55 +245,6 @@ class CeleryTestCase(SupersetTestCase):
    def dictify_list_of_dicts(cls, l, k):
        return {str(o[k]): cls.de_unicode_dict(o) for o in l}

-    def test_get_columns(self):
-        main_db = self.get_main_database(db.session)
-        df = main_db.get_df('SELECT * FROM multiformat_time_series', None)
-        cdf = dataframe.SupersetDataFrame(df)
-
-        # Making ordering non-deterministic
-        cols = self.dictify_list_of_dicts(cdf.columns, 'name')
-
-        if main_db.sqlalchemy_uri.startswith('sqlite'):
-            self.assertEqual(self.dictify_list_of_dicts([
-                {'is_date': True, 'type': 'STRING', 'name': 'ds',
-                    'is_dim': False},
-                {'is_date': True, 'type': 'STRING', 'name': 'ds2',
-                    'is_dim': False},
-                {'agg': 'sum', 'is_date': False, 'type': 'INT',
-                    'name': 'epoch_ms', 'is_dim': False},
-                {'agg': 'sum', 'is_date': False, 'type': 'INT',
-                    'name': 'epoch_s', 'is_dim': False},
-                {'is_date': True, 'type': 'STRING', 'name': 'string0',
-                    'is_dim': False},
-                {'is_date': False, 'type': 'STRING',
-                    'name': 'string1', 'is_dim': True},
-                {'is_date': True, 'type': 'STRING', 'name': 'string2',
-                    'is_dim': False},
-                {'is_date': False, 'type': 'STRING',
-                    'name': 'string3', 'is_dim': True}], 'name'),
-                cols,
-            )
-        else:
-            self.assertEqual(self.dictify_list_of_dicts([
-                {'is_date': True, 'type': 'DATETIME', 'name': 'ds',
-                    'is_dim': False},
-                {'is_date': True, 'type': 'DATETIME',
-                    'name': 'ds2', 'is_dim': False},
-                {'agg': 'sum', 'is_date': False, 'type': 'INT',
-                    'name': 'epoch_ms', 'is_dim': False},
-                {'agg': 'sum', 'is_date': False, 'type': 'INT',
-                    'name': 'epoch_s', 'is_dim': False},
-                {'is_date': True, 'type': 'STRING', 'name': 'string0',
-                    'is_dim': False},
-                {'is_date': False, 'type': 'STRING',
-                    'name': 'string1', 'is_dim': True},
-                {'is_date': True, 'type': 'STRING', 'name': 'string2',
-                    'is_dim': False},
-                {'is_date': False, 'type': 'STRING',
-                    'name': 'string3', 'is_dim': True}], 'name'),
-                cols,
-            )
-

 if __name__ == '__main__':
    unittest.main()