mirror of
https://github.com/apache/superset.git
synced 2026-04-10 11:55:24 +00:00
Co-authored-by: Rachel Pan <r.pan@mail.utoronto.ca> Co-authored-by: Rachel Pan <panrrachel@gmail.com> Co-authored-by: Janani Gurram <68124448+JG-ctrl@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
96 lines
3.4 KiB
Python
96 lines
3.4 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
from __future__ import annotations
|
|
|
|
import numpy as np
|
|
from pandas import DataFrame, Series, to_numeric
|
|
|
|
|
|
# pylint: disable=too-many-arguments
|
|
def histogram(
|
|
df: DataFrame,
|
|
column: str,
|
|
groupby: list[str] | None,
|
|
bins: int = 5,
|
|
cumulative: bool = False,
|
|
normalize: bool = False,
|
|
) -> DataFrame:
|
|
"""
|
|
Generate a histogram DataFrame from a given DataFrame.
|
|
|
|
Parameters:
|
|
df (DataFrame): The input DataFrame.
|
|
column (str): The column of the DataFrame to calculate the histogram on.
|
|
groupby (list[str]): The columns to group by. If empty, no grouping is performed.
|
|
bins (int): The number of bins to use for the histogram. Default is 5.
|
|
cumulative (bool): Whether to calculate a cumulative histogram. Default is False.
|
|
normalize (bool): Whether to normalize the histogram. Default is False.
|
|
|
|
Returns:
|
|
DataFrame: A DataFrame where each row corresponds to a group (or the entire DataFrame if no grouping is performed),
|
|
and each column corresponds to a histogram bin. The values are the counts in each bin.
|
|
""" # noqa: E501
|
|
|
|
if groupby is None:
|
|
groupby = []
|
|
|
|
# drop empty values from the target column
|
|
df = df.dropna(subset=[column])
|
|
if df.empty:
|
|
return df
|
|
|
|
# convert to numeric, coercing errors to NaN
|
|
df[column] = to_numeric(df[column], errors="coerce")
|
|
|
|
# check if the column contains non-numeric values
|
|
if df[column].isna().any():
|
|
raise ValueError(f"Column '{column}' contains non-numeric values")
|
|
|
|
# calculate the histogram bin edges
|
|
bin_edges = np.histogram_bin_edges(df[column], bins=bins)
|
|
|
|
# convert the bin edges to strings
|
|
bin_edges_str = [
|
|
f"{bin_edges[i]} - {bin_edges[i + 1]}" for i in range(len(bin_edges) - 1)
|
|
]
|
|
|
|
def hist_values(series: Series) -> np.ndarray:
|
|
# we might have NaN values as the result of grouping so we need to drop them
|
|
result = np.histogram(series.dropna(), bins=bin_edges)[0]
|
|
return result if not cumulative else np.cumsum(result)
|
|
|
|
if len(groupby) == 0:
|
|
# without grouping
|
|
hist_dict = dict(zip(bin_edges_str, hist_values(df[column]), strict=False))
|
|
histogram_df = DataFrame(hist_dict, index=[0])
|
|
else:
|
|
# with grouping
|
|
histogram_df = (
|
|
df.groupby(groupby)[column]
|
|
.apply(lambda x: Series(hist_values(x)))
|
|
.unstack(fill_value=0)
|
|
)
|
|
histogram_df.columns = bin_edges_str
|
|
|
|
if normalize:
|
|
histogram_df = histogram_df / histogram_df.values.sum()
|
|
|
|
# reorder the columns to have the groupby columns first
|
|
histogram_df = histogram_df.reset_index().loc[:, groupby + bin_edges_str]
|
|
|
|
return histogram_df
|