Files
superset2/superset/dataframe.py
Maxime Beauchemin 1f8e48b374 [sqllab] assign types for visualize flow (#2458)
* [sqllab] assign types for visualize flow

Somehow when using the visualize flow, the types were not
assigned at all, creating some bugs downstream. This PR attempts to get
the information required based on what pandas is knows and the types in
the data itself.

* Fixing tests

* Fixing tests

* Fixing more tests

* Fixing the last py3 tests
2017-03-24 09:23:51 -07:00

144 lines
4.5 KiB
Python

""" Superset wrapper around pandas.DataFrame.
TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
TODO(bkyryliuk): recognize integer encoded enums.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from datetime import datetime, date
from past.builtins import basestring
import pandas as pd
import numpy as np
INFER_COL_TYPES_THRESHOLD = 95
INFER_COL_TYPES_SAMPLE_SIZE = 100
class SupersetDataFrame(object):
# Mapping numpy dtype.char to generic database types
type_map = {
'b': 'BOOL', # boolean
'i': 'INT', # (signed) integer
'u': 'INT', # unsigned integer
'l': 'INT', # 64bit integer
'f': 'FLOAT', # floating-point
'c': 'FLOAT', # complex-floating point
'm': None, # timedelta
'M': 'DATETIME', # datetime
'O': 'OBJECT', # (Python) objects
'S': 'BYTE', # (byte-)string
'U': 'STRING', # Unicode
'V': None, # raw data (void)
}
def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)
@property
def size(self):
return len(self.__df.index)
@property
def data(self):
return self.__df.to_dict(orient='records')
@classmethod
def db_type(cls, dtype):
"""Given a numpy dtype, Returns a generic database type"""
return cls.type_map.get(dtype.char)
@classmethod
def datetime_conversion_rate(cls, data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total
@classmethod
def is_date(cls, dtype):
if dtype.name:
return dtype.name.startswith('datetime')
@classmethod
def is_dimension(cls, dtype, column_name):
if cls.is_id(column_name):
return False
return dtype.name in ('object', 'bool')
@classmethod
def is_id(cls, column_name):
return column_name.startswith('id') or column_name.endswith('id')
@classmethod
def agg_func(cls, dtype, column_name):
# consider checking for key substring too.
if cls.is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'
@property
def columns(self):
"""Provides metadata about columns for data visualization.
:return: dict, with the fields name, type, is_date, is_dim and agg.
"""
if self.__df.empty:
return None
columns = []
sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
sample = self.__df
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
col_db_type = self.db_type(self.__df.dtypes[col])
column = {
'name': col,
'agg': self.agg_func(self.__df.dtypes[col], col),
'type': col_db_type,
'is_date': self.is_date(self.__df.dtypes[col]),
'is_dim': self.is_dimension(self.__df.dtypes[col], col),
}
if column['type'] in ('OBJECT', None):
v = sample[col].iloc[0] if not sample[col].empty else None
if isinstance(v, basestring):
column['type'] = 'STRING'
elif isinstance(v, int):
column['type'] = 'INT'
elif isinstance(v, float):
column['type'] = 'FLOAT'
elif isinstance(v, (datetime, date)):
column['type'] = 'DATETIME'
column['is_date'] = True
column['is_dim'] = False
# check if encoded datetime
if (
column['type'] == 'STRING' and
self.datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'is_date': True,
'is_dim': False,
'agg': None
})
# 'agg' is optional attribute
if not column['agg']:
column.pop('agg', None)
columns.append(column)
return columns