# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=C,R,W
"""Superset wrapper around pandas.DataFrame.

TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
                 dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
TODO(bkyryliuk): recognize integer encoded enums.
"""
import logging
from datetime import date, datetime

import numpy as np
import pandas as pd
from pandas.core.common import maybe_box_datetimelike
from pandas.core.dtypes.dtypes import ExtensionDtype

from superset.utils.core import JS_MAX_INTEGER

# A column whose sample parses as datetime at/above this rate (percent)
# is treated as an encoded datetime column.
INFER_COL_TYPES_THRESHOLD = 95
# Number of rows sampled when inferring column types.
INFER_COL_TYPES_SAMPLE_SIZE = 100


def dedup(l, suffix="__", case_sensitive=True):
    """De-duplicates a list of string by suffixing a counter

    Always returns the same number of entries as provided, and always returns
    unique values. Case sensitive comparison by default.

    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
    foo,bar,bar__1,bar__2,Bar
    >>> print(
        ','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False))
    )
    foo,bar,bar__1,bar__2,Bar__3
    """
    new_l = []
    seen = {}
    for s in l:
        s_fixed_case = s if case_sensitive else s.lower()
        if s_fixed_case in seen:
            seen[s_fixed_case] += 1
            s += suffix + str(seen[s_fixed_case])
        else:
            seen[s_fixed_case] = 0
        new_l.append(s)
    return new_l


def is_numeric(dtype):
    """Return True if *dtype* represents a numeric column.

    Extension dtypes expose a private ``_is_numeric`` flag; plain numpy
    dtypes are checked against ``np.number``.
    """
    if hasattr(dtype, "_is_numeric"):
        return dtype._is_numeric
    return np.issubdtype(dtype, np.number)


class SupersetDataFrame(object):
    """Wraps query results in a pandas DataFrame and infers column metadata.

    Built from raw cursor data plus the DB-API ``cursor.description`` and a
    ``db_engine_spec`` that maps driver types to pandas/database types.
    """

    # Mapping numpy dtype.char to generic database types
    type_map = {
        "b": "BOOL",  # boolean
        "i": "INT",  # (signed) integer
        "u": "INT",  # unsigned integer
        "l": "INT",  # 64bit integer
        "f": "FLOAT",  # floating-point
        "c": "FLOAT",  # complex-floating point
        "m": None,  # timedelta
        "M": "DATETIME",  # datetime
        "O": "OBJECT",  # (Python) objects
        "S": "BYTE",  # (byte-)string
        "U": "STRING",  # Unicode
        "V": None,  # raw data (void)
    }

    def __init__(self, data, cursor_description, db_engine_spec):
        """Build the wrapped DataFrame.

        :param data: iterable of result rows (may be None/empty)
        :param cursor_description: DB-API cursor.description sequence, or None
        :param db_engine_spec: engine spec providing ``get_pandas_dtype`` and
            ``get_datatype``
        """
        data = data or []
        column_names = []
        dtype = None
        if cursor_description:
            # get deduped list of column names
            column_names = dedup([col[0] for col in cursor_description])

            # fix cursor descriptor with the deduped names
            cursor_description = [
                tuple([column_name, *list(description)[1:]])
                for column_name, description in zip(column_names, cursor_description)
            ]

            # get type for better type casting, if possible
            dtype = db_engine_spec.get_pandas_dtype(cursor_description)

        self.column_names = column_names

        if dtype:
            # put data in a 2D array so we can efficiently access each column;
            # the reshape ensures the shape is 2D in case data is empty
            array = np.array(data, dtype="object").reshape(-1, len(column_names))

            # convert each column in data into a Series of the proper dtype; we
            # need to do this because we can not specify a mixed dtype when
            # instantiating the DataFrame, and this allows us to have different
            # dtypes for each column.
            data = {
                column: pd.Series(array[:, i], dtype=dtype[column])
                for i, column in enumerate(column_names)
            }
            self.df = pd.DataFrame(data, columns=column_names)
        else:
            self.df = pd.DataFrame(list(data), columns=column_names).infer_objects()

        self._type_dict = {}
        try:
            # The driver may not be passing a cursor.description
            self._type_dict = {
                col: db_engine_spec.get_datatype(cursor_description[i][1])
                for i, col in enumerate(column_names)
                if cursor_description
            }
        except Exception as e:
            logging.exception(e)

    @property
    def raw_df(self):
        """The underlying pandas DataFrame, unformatted."""
        return self.df

    @property
    def size(self):
        """Number of rows in the DataFrame."""
        return len(self.df.index)

    @property
    def data(self):
        """Rows as a list of dicts, with JS-safe integer handling."""
        return self.format_data(self.df)

    @classmethod
    def format_data(cls, df):
        """Convert *df* rows to a list of dicts keyed by column name.

        Integers too large for JavaScript are stringified so the frontend
        does not lose precision.
        """
        # work around for https://github.com/pandas-dev/pandas/issues/18372
        data = [
            dict(
                (k, maybe_box_datetimelike(v))
                for k, v in zip(df.columns, np.atleast_1d(row))
            )
            for row in df.values
        ]
        for d in data:
            for k, v in list(d.items()):
                # if an int is too big for Java Script to handle
                # convert it to a string
                if isinstance(v, int):
                    if abs(v) > JS_MAX_INTEGER:
                        d[k] = str(v)
        return data

    @classmethod
    def db_type(cls, dtype):
        """Given a numpy dtype, Returns a generic database type"""
        if isinstance(dtype, ExtensionDtype):
            return cls.type_map.get(dtype.kind)
        elif hasattr(dtype, "char"):
            return cls.type_map.get(dtype.char)

    @classmethod
    def datetime_conversion_rate(cls, data_series):
        """Percentage (0-100) of values in *data_series* parseable as datetimes."""
        success = 0
        total = 0
        for value in data_series:
            total += 1
            try:
                pd.to_datetime(value)
                success += 1
            except Exception:
                continue
        # Guard against an empty series to avoid ZeroDivisionError.
        if not total:
            return 0
        return 100 * success / total

    @staticmethod
    def is_date(np_dtype, db_type_str):
        """True if either the numpy dtype name or the db type string looks
        like a date/time type."""

        def looks_daty(s):
            if isinstance(s, str):
                return any([s.lower().startswith(ss) for ss in ("time", "date")])
            return False

        if looks_daty(db_type_str):
            return True
        if np_dtype and np_dtype.name and looks_daty(np_dtype.name):
            return True
        return False

    @classmethod
    def is_dimension(cls, dtype, column_name):
        """True for object/bool columns that are not identifier columns."""
        if cls.is_id(column_name):
            return False
        return dtype.name in ("object", "bool")

    @classmethod
    def is_id(cls, column_name):
        """Heuristic: column names starting or ending with ``id`` are ids."""
        return column_name.startswith("id") or column_name.endswith("id")

    @classmethod
    def agg_func(cls, dtype, column_name):
        """Suggest a default aggregation function for the column, or None."""
        # consider checking for key substring too.
        if cls.is_id(column_name):
            return "count_distinct"
        if (
            hasattr(dtype, "type")
            and issubclass(dtype.type, np.generic)
            and is_numeric(dtype)
        ):
            return "sum"
        return None

    @property
    def columns(self):
        """Provides metadata about columns for data visualization.

        :return: dict, with the fields name, type, is_date, is_dim and agg.
        """
        if self.df.empty:
            return None

        columns = []
        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.df.index))
        sample = self.df
        if sample_size:
            sample = self.df.sample(sample_size)
        for col in self.df.dtypes.keys():
            db_type_str = self._type_dict.get(col) or self.db_type(self.df.dtypes[col])
            column = {
                "name": col,
                "agg": self.agg_func(self.df.dtypes[col], col),
                "type": db_type_str,
                "is_date": self.is_date(self.df.dtypes[col], db_type_str),
                "is_dim": self.is_dimension(self.df.dtypes[col], col),
            }

            if not db_type_str or db_type_str.upper() == "OBJECT":
                # Fall back to inspecting a sample value when the driver did
                # not report a usable type.
                v = sample[col].iloc[0] if not sample[col].empty else None
                if isinstance(v, str):
                    column["type"] = "STRING"
                elif isinstance(v, int):
                    column["type"] = "INT"
                elif isinstance(v, float):
                    column["type"] = "FLOAT"
                elif isinstance(v, (datetime, date)):
                    column["type"] = "DATETIME"
                    column["is_date"] = True
                    column["is_dim"] = False
                # check if encoded datetime
                if (
                    column["type"] == "STRING"
                    and self.datetime_conversion_rate(sample[col])
                    > INFER_COL_TYPES_THRESHOLD
                ):
                    column.update({"is_date": True, "is_dim": False, "agg": None})
            # 'agg' is optional attribute
            if not column["agg"]:
                column.pop("agg", None)
            columns.append(column)
        return columns