# -*- coding: utf-8 -*- # pylint: disable=C,R,W """ Superset wrapper around pandas.DataFrame. TODO(bkyryliuk): add support for the conventions like: *_dim or dim_* dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc. TODO(bkyryliuk): recognize integer encoded enums. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals from datetime import date, datetime import numpy as np import pandas as pd from pandas.core.common import _maybe_box_datetimelike from pandas.core.dtypes.dtypes import ExtensionDtype from past.builtins import basestring from superset.utils import JS_MAX_INTEGER INFER_COL_TYPES_THRESHOLD = 95 INFER_COL_TYPES_SAMPLE_SIZE = 100 class SupersetDataFrame(object): # Mapping numpy dtype.char to generic database types type_map = { 'b': 'BOOL', # boolean 'i': 'INT', # (signed) integer 'u': 'INT', # unsigned integer 'l': 'INT', # 64bit integer 'f': 'FLOAT', # floating-point 'c': 'FLOAT', # complex-floating point 'm': None, # timedelta 'M': 'DATETIME', # datetime 'O': 'OBJECT', # (Python) objects 'S': 'BYTE', # (byte-)string 'U': 'STRING', # Unicode 'V': None, # raw data (void) } def __init__(self, df): self.__df = df.where((pd.notnull(df)), None) @property def size(self): return len(self.__df.index) @property def data(self): # work around for https://github.com/pandas-dev/pandas/issues/18372 data = [dict((k, _maybe_box_datetimelike(v)) for k, v in zip(self.__df.columns, np.atleast_1d(row))) for row in self.__df.values] for d in data: for k, v in list(d.items()): # if an int is too big for Java Script to handle # convert it to a string if isinstance(v, int): if abs(v) > JS_MAX_INTEGER: d[k] = str(v) return data @classmethod def db_type(cls, dtype): """Given a numpy dtype, Returns a generic database type""" if isinstance(dtype, ExtensionDtype): return cls.type_map.get(dtype.kind) return cls.type_map.get(dtype.char) @classmethod def datetime_conversion_rate(cls, data_series): success = 0 total = 0 for value in data_series: total += 1 try: pd.to_datetime(value) success += 1 except Exception: continue return 100 * success / total @classmethod def is_date(cls, dtype): if dtype.name: return dtype.name.startswith('datetime') @classmethod def is_dimension(cls, dtype, column_name): if cls.is_id(column_name): return False return dtype.name in ('object', 'bool') @classmethod def is_id(cls, column_name): return column_name.startswith('id') or column_name.endswith('id') @classmethod def agg_func(cls, dtype, column_name): # consider checking for key substring too. if cls.is_id(column_name): return 'count_distinct' if (issubclass(dtype.type, np.generic) and np.issubdtype(dtype, np.number)): return 'sum' return None @property def columns(self): """Provides metadata about columns for data visualization. :return: dict, with the fields name, type, is_date, is_dim and agg. """ if self.__df.empty: return None columns = [] sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index)) sample = self.__df if sample_size: sample = self.__df.sample(sample_size) for col in self.__df.dtypes.keys(): col_db_type = self.db_type(self.__df.dtypes[col]) column = { 'name': col, 'agg': self.agg_func(self.__df.dtypes[col], col), 'type': col_db_type, 'is_date': self.is_date(self.__df.dtypes[col]), 'is_dim': self.is_dimension(self.__df.dtypes[col], col), } if column['type'] in ('OBJECT', None): v = sample[col].iloc[0] if not sample[col].empty else None if isinstance(v, basestring): column['type'] = 'STRING' elif isinstance(v, int): column['type'] = 'INT' elif isinstance(v, float): column['type'] = 'FLOAT' elif isinstance(v, (datetime, date)): column['type'] = 'DATETIME' column['is_date'] = True column['is_dim'] = False # check if encoded datetime if ( column['type'] == 'STRING' and self.datetime_conversion_rate(sample[col]) > INFER_COL_TYPES_THRESHOLD): column.update({ 'is_date': True, 'is_dim': False, 'agg': None, }) # 'agg' is optional attribute if not column['agg']: column.pop('agg', None) columns.append(column) return columns