mirror of
https://github.com/apache/superset.git
synced 2026-04-17 07:05:04 +00:00
164 lines
5.3 KiB
Python
164 lines
5.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
# pylint: disable=C,R,W
|
|
""" Superset wrapper around pandas.DataFrame.
|
|
|
|
TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
|
|
dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
|
|
TODO(bkyryliuk): recognize integer encoded enums.
|
|
|
|
"""
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from datetime import date, datetime
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pandas.core.common import _maybe_box_datetimelike
|
|
from pandas.core.dtypes.dtypes import ExtensionDtype
|
|
from past.builtins import basestring
|
|
|
|
from superset.utils import JS_MAX_INTEGER
|
|
|
|
# Percentage (0-100) that SupersetDataFrame.datetime_conversion_rate must
# strictly exceed before a string column is flagged as an encoded datetime.
INFER_COL_TYPES_THRESHOLD = 95
|
|
# Upper bound on the number of rows randomly sampled by
# SupersetDataFrame.columns when inferring column types.
INFER_COL_TYPES_SAMPLE_SIZE = 100
|
|
|
|
|
|
class SupersetDataFrame(object):
    """Wrapper around a pandas DataFrame exposing rows and column metadata.

    The wrapped frame is stored with its null cells replaced by ``None``.
    ``data`` returns the rows as dictionaries, ``columns`` returns inferred
    per-column metadata (generic type, date/dimension flags, aggregate).
    """

    # Mapping numpy dtype.kind codes to generic database types.
    # 'l' is not a kind code (it is the dtype.char of a C long); the entry is
    # kept so that code reading type_map directly keeps working.
    type_map = {
        'b': 'BOOL',  # boolean
        'i': 'INT',  # (signed) integer
        'u': 'INT',  # unsigned integer
        'l': 'INT',  # 64bit integer
        'f': 'FLOAT',  # floating-point
        'c': 'FLOAT',  # complex-floating point
        'm': None,  # timedelta
        'M': 'DATETIME',  # datetime
        'O': 'OBJECT',  # (Python) objects
        'S': 'BYTE',  # (byte-)string
        'U': 'STRING',  # Unicode
        'V': None,  # raw data (void)
    }

    def __init__(self, df):
        """Store ``df`` with every cell pandas considers null set to None.

        :param df: the pandas.DataFrame to wrap.
        """
        self.__df = df.where((pd.notnull(df)), None)

    @property
    def size(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.__df.index)

    @property
    def data(self):
        """Return the rows as a list of ``{column name: value}`` dicts.

        Datetime-like values are boxed with ``_maybe_box_datetimelike`` and
        integers whose magnitude exceeds ``JS_MAX_INTEGER`` are converted to
        strings.
        """
        column_names = self.__df.columns
        records = []
        # work around for https://github.com/pandas-dev/pandas/issues/18372
        for row in self.__df.values:
            record = {}
            for key, value in zip(column_names, np.atleast_1d(row)):
                value = _maybe_box_datetimelike(value)
                # if an int is too big for Java Script to handle
                # convert it to a string. numpy integer scalars are not
                # ``int`` instances on Python 3, so they are checked
                # explicitly; int() avoids fixed-width overflow in abs().
                if (isinstance(value, (int, np.integer)) and
                        abs(int(value)) > JS_MAX_INTEGER):
                    value = str(value)
                record[key] = value
            records.append(record)
        return records

    @classmethod
    def db_type(cls, dtype):
        """Given a numpy dtype, Returns a generic database type

        The lookup uses ``dtype.kind`` because the keys of ``type_map`` are
        kind codes; ``dtype.char`` differs from the kind for many dtypes
        (e.g. float64 is 'd', bool is '?', int8 is 'b').

        :param dtype: a numpy dtype or pandas ExtensionDtype.
        :return: the generic type name, or None when there is no mapping.
        """
        return cls.type_map.get(dtype.kind)

    @classmethod
    def datetime_conversion_rate(cls, data_series):
        """Return the percentage of values that pd.to_datetime accepts.

        :param data_series: iterable of values to probe.
        :return: a number between 0 and 100; 0 when the iterable is empty.
        """
        success = 0
        total = 0
        for value in data_series:
            total += 1
            try:
                pd.to_datetime(value)
                success += 1
            except Exception:
                # Any failure to parse simply counts as "not a datetime".
                continue
        if not total:
            # Nothing to probe: avoid dividing by zero.
            return 0
        return 100 * success / total

    @classmethod
    def is_date(cls, dtype):
        """Return True when the dtype's name starts with 'datetime'."""
        return bool(dtype.name) and dtype.name.startswith('datetime')

    @classmethod
    def is_dimension(cls, dtype, column_name):
        """Return True for object/bool columns that do not look like ids."""
        if cls.is_id(column_name):
            return False
        return dtype.name in ('object', 'bool')

    @classmethod
    def is_id(cls, column_name):
        """Return True when the column name starts or ends with 'id'."""
        return column_name.startswith('id') or column_name.endswith('id')

    @classmethod
    def agg_func(cls, dtype, column_name):
        """Suggest an aggregate for the column.

        :return: 'count_distinct' for id-like names, 'sum' for numeric
            numpy dtypes, otherwise None.
        """
        # consider checking for key substring too.
        if cls.is_id(column_name):
            return 'count_distinct'
        if (issubclass(dtype.type, np.generic) and
                np.issubdtype(dtype, np.number)):
            return 'sum'
        return None

    @property
    def columns(self):
        """Provides metadata about columns for data visualization.

        Types that cannot be resolved from the dtype are inferred from a
        random sample of at most INFER_COL_TYPES_SAMPLE_SIZE rows.

        :return: None when the frame is empty, otherwise a list of dicts
            with the fields name, type, is_date, is_dim and, when one is
            suggested, agg.
        """
        if self.__df.empty:
            return None

        columns = []
        dtypes = self.__df.dtypes
        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
        sample = self.__df
        if sample_size:
            sample = self.__df.sample(sample_size)
        for col in dtypes.keys():
            dtype = dtypes[col]
            column = {
                'name': col,
                'agg': self.agg_func(dtype, col),
                'type': self.db_type(dtype),
                'is_date': self.is_date(dtype),
                'is_dim': self.is_dimension(dtype, col),
            }

            if column['type'] in ('OBJECT', None):
                # Look at the first non-null sampled value so that a null
                # in the first sampled row does not block the inference.
                non_null = sample[col].dropna()
                v = non_null.iloc[0] if not non_null.empty else None
                if isinstance(v, basestring):
                    column['type'] = 'STRING'
                elif isinstance(v, int):
                    column['type'] = 'INT'
                elif isinstance(v, float):
                    column['type'] = 'FLOAT'
                elif isinstance(v, (datetime, date)):
                    column['type'] = 'DATETIME'
                    column['is_date'] = True
                    column['is_dim'] = False
                # check if encoded datetime
                if (
                        column['type'] == 'STRING' and
                        self.datetime_conversion_rate(sample[col]) >
                        INFER_COL_TYPES_THRESHOLD):
                    column.update({
                        'is_date': True,
                        'is_dim': False,
                        'agg': None,
                    })
            # 'agg' is optional attribute
            if not column['agg']:
                column.pop('agg', None)
            columns.append(column)
        return columns
|