Files
superset2/superset/utils/pandas_postprocessing/histogram.py
Janani Gurram c955a5dc08 fix(histogram): add NULL handling for histogram (#35693)
Co-authored-by: Rachel Pan <r.pan@mail.utoronto.ca>
Co-authored-by: Rachel Pan <panrrachel@gmail.com>
Co-authored-by: Janani Gurram <68124448+JG-ctrl@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-11-14 11:54:13 -08:00

96 lines
3.4 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import numpy as np
from pandas import DataFrame, Series, to_numeric
# pylint: disable=too-many-arguments
def histogram(
df: DataFrame,
column: str,
groupby: list[str] | None,
bins: int = 5,
cumulative: bool = False,
normalize: bool = False,
) -> DataFrame:
"""
Generate a histogram DataFrame from a given DataFrame.
Parameters:
df (DataFrame): The input DataFrame.
column (str): The column of the DataFrame to calculate the histogram on.
groupby (list[str]): The columns to group by. If empty, no grouping is performed.
bins (int): The number of bins to use for the histogram. Default is 5.
cumulative (bool): Whether to calculate a cumulative histogram. Default is False.
normalize (bool): Whether to normalize the histogram. Default is False.
Returns:
DataFrame: A DataFrame where each row corresponds to a group (or the entire DataFrame if no grouping is performed),
and each column corresponds to a histogram bin. The values are the counts in each bin.
""" # noqa: E501
if groupby is None:
groupby = []
# drop empty values from the target column
df = df.dropna(subset=[column])
if df.empty:
return df
# convert to numeric, coercing errors to NaN
df[column] = to_numeric(df[column], errors="coerce")
# check if the column contains non-numeric values
if df[column].isna().any():
raise ValueError(f"Column '{column}' contains non-numeric values")
# calculate the histogram bin edges
bin_edges = np.histogram_bin_edges(df[column], bins=bins)
# convert the bin edges to strings
bin_edges_str = [
f"{bin_edges[i]} - {bin_edges[i + 1]}" for i in range(len(bin_edges) - 1)
]
def hist_values(series: Series) -> np.ndarray:
# we might have NaN values as the result of grouping so we need to drop them
result = np.histogram(series.dropna(), bins=bin_edges)[0]
return result if not cumulative else np.cumsum(result)
if len(groupby) == 0:
# without grouping
hist_dict = dict(zip(bin_edges_str, hist_values(df[column]), strict=False))
histogram_df = DataFrame(hist_dict, index=[0])
else:
# with grouping
histogram_df = (
df.groupby(groupby)[column]
.apply(lambda x: Series(hist_values(x)))
.unstack(fill_value=0)
)
histogram_df.columns = bin_edges_str
if normalize:
histogram_df = histogram_df / histogram_df.values.sum()
# reorder the columns to have the groupby columns first
histogram_df = histogram_df.reset_index().loc[:, groupby + bin_edges_str]
return histogram_df