diff --git a/UPDATING.md b/UPDATING.md index 87abd23d204..6c15fde3eb0 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -124,6 +124,7 @@ See `superset/mcp_service/PRODUCTION.md` for deployment guides. --- +- [35621](https://github.com/apache/superset/pull/35621): The default hash algorithm has changed from MD5 to SHA-256 for improved security and FedRAMP compliance. This affects query and chart cache keys, thumbnail cache keys, dashboard digests, chart digests, filter option names, the hashed column labels generated by database engine specs, SSL certificate file names, and the deterministic UUIDs used for key-value entries (for example dashboard permalinks and shared values). Existing cached data will be invalidated upon upgrade. Lookups of existing key-value entries fall back to the algorithms listed in `HASH_ALGORITHM_FALLBACKS` (default `["md5"]`). To opt out of this change and maintain backward compatibility, set `HASH_ALGORITHM = "md5"` in your `superset_config.py`. - [33055](https://github.com/apache/superset/pull/33055): Upgrades Flask-AppBuilder to 5.0.0. The AUTH_OID authentication type has been deprecated and is no longer available as an option in Flask-AppBuilder. OpenID (OID) is considered a deprecated authentication protocol - if you are using AUTH_OID, you will need to migrate to an alternative authentication method such as OAuth, LDAP, or database authentication before upgrading. - [35062](https://github.com/apache/superset/pull/35062): Changed the function signature of `setupExtensions` to `setupCodeOverrides` with options as arguments. - [34871](https://github.com/apache/superset/pull/34871): Fixed Jest test hanging issue from Ant Design v5 upgrade. MessageChannel is now mocked in test environment to prevent rc-overflow from causing Jest to hang. Test environment only - no production impact. 
diff --git a/superset/commands/dashboard/permalink/create.py b/superset/commands/dashboard/permalink/create.py index 20bc5118f57..055cd7a9972 100644 --- a/superset/commands/dashboard/permalink/create.py +++ b/superset/commands/dashboard/permalink/create.py @@ -29,7 +29,12 @@ from superset.key_value.exceptions import ( KeyValueCodecEncodeException, KeyValueUpsertFailedError, ) -from superset.key_value.utils import encode_permalink_key, get_deterministic_uuid +from superset.key_value.utils import ( + encode_permalink_key, + get_deterministic_uuid, + get_deterministic_uuid_with_algorithm, + get_fallback_algorithms, +) from superset.utils.core import get_user_id from superset.utils.decorators import on_error, transaction @@ -71,9 +76,31 @@ class CreateDashboardPermalinkCommand(BaseDashboardPermalinkCommand): "state": self.state, } user_id = get_user_id() - entry = KeyValueDAO.upsert_entry( + payload = (user_id, value) + + # Try to find existing entry with current algorithm + uuid_key = get_deterministic_uuid(self.salt, payload) + entry = KeyValueDAO.get_entry(self.resource, uuid_key) + + # Fallback: check configured fallback algorithms for backward compatibility + if not entry: + for fallback_algo in get_fallback_algorithms(): + uuid_fallback = get_deterministic_uuid_with_algorithm( + self.salt, payload, fallback_algo + ) + entry = KeyValueDAO.get_entry(self.resource, uuid_fallback) + if entry: + break + + if entry: + # Return existing entry + assert entry.id # for type checks + return encode_permalink_key(key=entry.id, salt=self.salt) + + # Create new entry with current algorithm + entry = KeyValueDAO.create_entry( resource=self.resource, - key=get_deterministic_uuid(self.salt, (user_id, value)), + key=uuid_key, value=value, codec=self.codec, ) diff --git a/superset/common/query_object.py b/superset/common/query_object.py index c691ad30e53..3f0d796ebc4 100644 --- a/superset/common/query_object.py +++ b/superset/common/query_object.py @@ -46,7 +46,7 @@ from 
superset.utils.core import ( is_adhoc_metric, QueryObjectFilterClause, ) -from superset.utils.hashing import md5_sha_from_dict +from superset.utils.hashing import hash_from_dict from superset.utils.json import json_int_dttm_ser if TYPE_CHECKING: @@ -499,7 +499,7 @@ class QueryObject: # pylint: disable=too-many-instance-attributes # datasource or database do not exist pass - cache_key = md5_sha_from_dict( + cache_key = hash_from_dict( cache_dict, default=json_int_dttm_ser, ignore_nan=True ) # Log QueryObject cache key generation for debugging diff --git a/superset/config.py b/superset/config.py index 70a557db5f8..a33294ed655 100644 --- a/superset/config.py +++ b/superset/config.py @@ -199,6 +199,32 @@ SUPERSET_DASHBOARD_PERIODICAL_REFRESH_WARNING_MESSAGE = None SUPERSET_DASHBOARD_POSITION_DATA_LIMIT = 65535 CUSTOM_SECURITY_MANAGER = None SQLALCHEMY_TRACK_MODIFICATIONS = False + +# --------------------------------------------------------- +# FedRAMP Cryptographic Compliance +# --------------------------------------------------------- + +# Hash algorithm used for non-cryptographic purposes (cache keys, thumbnails, etc.) +# Options: 'md5' (legacy), 'sha256' +# +# IMPORTANT: Changing this value will invalidate all existing cached content. +# Cache will re-warm naturally within 24-48 hours. +# +# For FedRAMP compliance, use 'sha256' (the default) +# For backward compatibility with existing deployments, set to 'md5' +HASH_ALGORITHM: Literal["md5", "sha256"] = "sha256" + +# Fallback hash algorithms for UUID lookup (backward compatibility) +# When looking up entries by UUID, try these algorithms after the primary one fails. +# This enables gradual migration from MD5 to SHA-256 without breaking existing entries. +# +# Example: When HASH_ALGORITHM='sha256', lookups will try: +# 1. SHA-256 UUID (primary) +# 2. 
MD5 UUID (fallback for legacy entries) +# +# Set to empty list to disable fallback (strict mode - only use HASH_ALGORITHM) +HASH_ALGORITHM_FALLBACKS: list[Literal["md5", "sha256"]] = ["md5"] + # --------------------------------------------------------- # Your App secret key. Make sure you override it on superset_config.py diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 74ba20ee9cb..f52b6250548 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -80,7 +80,7 @@ from superset.superset_typing import ( ) from superset.utils import core as utils, json from superset.utils.core import ColumnSpec, GenericDataType, QuerySource -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str from superset.utils.json import redact_sensitive, reveal_sensitive from superset.utils.network import is_hostname_valid, is_port_open from superset.utils.oauth2 import encode_oauth2_state @@ -1975,7 +1975,7 @@ class BaseEngineSpec: # pylint: disable=too-many-public-methods :param label: Expected expression label :return: Truncated label """ - label = md5_sha_from_str(label) + label = hash_from_str(label) # truncate hash if it exceeds max length if cls.max_column_name_length and len(label) > cls.max_column_name_length: label = label[: cls.max_column_name_length] diff --git a/superset/db_engine_specs/bigquery.py b/superset/db_engine_specs/bigquery.py index 9ff78deaa56..b69361c2d11 100644 --- a/superset/db_engine_specs/bigquery.py +++ b/superset/db_engine_specs/bigquery.py @@ -47,7 +47,7 @@ from superset.exceptions import SupersetException from superset.sql.parse import SQLScript, Table from superset.superset_typing import ResultSetColumnType from superset.utils import core as utils, json -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str if TYPE_CHECKING: from sqlalchemy.sql.expression import Select @@ -268,7 +268,7 @@ class 
BigQueryEngineSpec(BaseEngineSpec): # pylint: disable=too-many-public-met :param label: Expected expression label :return: Conditionally mutated label """ - label_hashed = "_" + md5_sha_from_str(label) + label_hashed = "_" + hash_from_str(label) # if label starts with number, add underscore as first character label_mutated = "_" + label if re.match(r"^\d", label) else label @@ -290,7 +290,7 @@ class BigQueryEngineSpec(BaseEngineSpec): # pylint: disable=too-many-public-met :param label: expected expression label :return: truncated label """ - return "_" + md5_sha_from_str(label) + return "_" + hash_from_str(label) @classmethod def where_latest_partition( diff --git a/superset/db_engine_specs/clickhouse.py b/superset/db_engine_specs/clickhouse.py index d743e09be37..3f88ffc59e8 100644 --- a/superset/db_engine_specs/clickhouse.py +++ b/superset/db_engine_specs/clickhouse.py @@ -41,7 +41,7 @@ from superset.db_engine_specs.exceptions import SupersetDBAPIDatabaseError from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.extensions import cache_manager from superset.utils.core import GenericDataType -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str from superset.utils.network import is_hostname_valid, is_port_open if TYPE_CHECKING: @@ -417,7 +417,7 @@ class ClickHouseConnectEngineSpec(BasicParametersMixin, ClickHouseEngineSpec): :param label: Expected expression label :return: Conditionally mutated label """ - return f"{label}_{md5_sha_from_str(label)[:6]}" + return f"{label}_{hash_from_str(label)[:6]}" @classmethod def adjust_engine_params( diff --git a/superset/db_engine_specs/databend.py b/superset/db_engine_specs/databend.py index 9789512450b..17cca5280cc 100644 --- a/superset/db_engine_specs/databend.py +++ b/superset/db_engine_specs/databend.py @@ -39,7 +39,7 @@ from superset.db_engine_specs.base import ( from superset.db_engine_specs.exceptions import SupersetDBAPIDatabaseError 
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.utils.core import GenericDataType -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str from superset.utils.network import is_hostname_valid, is_port_open if TYPE_CHECKING: @@ -363,4 +363,4 @@ class DatabendConnectEngineSpec(BasicParametersMixin, DatabendEngineSpec): :param label: Expected expression label :return: Conditionally mutated label """ - return f"{label}_{md5_sha_from_str(label)[:6]}" + return f"{label}_{hash_from_str(label)[:6]}" diff --git a/superset/db_engine_specs/dremio.py b/superset/db_engine_specs/dremio.py index 39fe48be635..84dd85198a1 100644 --- a/superset/db_engine_specs/dremio.py +++ b/superset/db_engine_specs/dremio.py @@ -25,7 +25,7 @@ from sqlalchemy import types from superset.constants import TimeGrain from superset.db_engine_specs.base import BaseEngineSpec -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str if TYPE_CHECKING: from superset.models.core import Database @@ -104,4 +104,4 @@ class DremioEngineSpec(BaseEngineSpec): :param label: Expected expression label :return: Conditionally mutated label """ - return f"{label}_{md5_sha_from_str(label)[:6]}" + return f"{label}_{hash_from_str(label)[:6]}" diff --git a/superset/db_engine_specs/drill.py b/superset/db_engine_specs/drill.py index 5a49f5993ae..c4dadfc3b40 100644 --- a/superset/db_engine_specs/drill.py +++ b/superset/db_engine_specs/drill.py @@ -27,7 +27,7 @@ from sqlalchemy.engine.url import URL from superset.constants import TimeGrain from superset.db_engine_specs.base import BaseEngineSpec from superset.db_engine_specs.exceptions import SupersetDBAPIProgrammingError -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str if TYPE_CHECKING: from superset.models.core import Database @@ -154,4 +154,4 @@ class DrillEngineSpec(BaseEngineSpec): :param 
label: Expected expression label :return: Conditionally mutated label """ - return f"{label}_{md5_sha_from_str(label)[:6]}" + return f"{label}_{hash_from_str(label)[:6]}" diff --git a/superset/extensions/metastore_cache.py b/superset/extensions/metastore_cache.py index 197dac9386d..0c36f8f2d0c 100644 --- a/superset/extensions/metastore_cache.py +++ b/superset/extensions/metastore_cache.py @@ -54,7 +54,7 @@ class SupersetMetastoreCache(BaseCache): cls, app: Flask, config: dict[str, Any], args: list[Any], kwargs: dict[str, Any] ) -> BaseCache: seed = config.get("CACHE_KEY_PREFIX", "") - kwargs["namespace"] = get_uuid_namespace(seed) + kwargs["namespace"] = get_uuid_namespace(seed, app) codec = config.get("CODEC") or PickleKeyValueCodec() if ( has_app_context() diff --git a/superset/key_value/shared_entries.py b/superset/key_value/shared_entries.py index 21a41b560cf..9ee755e4274 100644 --- a/superset/key_value/shared_entries.py +++ b/superset/key_value/shared_entries.py @@ -15,27 +15,74 @@ # specific language governing permissions and limitations # under the License. +import logging from typing import Any, Optional from uuid import uuid3 from superset.daos.key_value import KeyValueDAO from superset.key_value.types import JsonKeyValueCodec, KeyValueResource, SharedKey -from superset.key_value.utils import get_uuid_namespace, random_key +from superset.key_value.utils import ( + get_fallback_algorithms, + get_uuid_namespace, + get_uuid_namespace_with_algorithm, + random_key, +) from superset.utils.decorators import transaction +logger = logging.getLogger(__name__) + RESOURCE = KeyValueResource.APP -NAMESPACE = get_uuid_namespace("") CODEC = JsonKeyValueCodec() def get_shared_value(key: SharedKey) -> Optional[Any]: - uuid_key = uuid3(NAMESPACE, key) - return KeyValueDAO.get_value(RESOURCE, uuid_key, CODEC) + """ + Get a shared value by key, with configurable fallback for backward compatibility. 
+ + If found via fallback algorithm, automatically migrates the entry to current + algorithm's UUID. + """ + # Try with current algorithm + namespace = get_uuid_namespace("") + uuid_key = uuid3(namespace, key) + value = KeyValueDAO.get_value(RESOURCE, uuid_key, CODEC) + + if value is not None: + return value + + # Fallback: try configured fallback algorithms for legacy entries + for fallback_algo in get_fallback_algorithms(): + namespace_fallback = get_uuid_namespace_with_algorithm("", fallback_algo) + uuid_key_fallback = uuid3(namespace_fallback, key) + value = KeyValueDAO.get_value(RESOURCE, uuid_key_fallback, CODEC) + + # If found via fallback, migrate to current algorithm + if value is not None: + try: + # Create new entry with current algorithm UUID + KeyValueDAO.create_entry(RESOURCE, value, CODEC, uuid_key) + # Note: We keep the old entry for safety + # It can be cleaned up later with a manual migration script + except Exception as ex: + # If creation fails (e.g., duplicate), that's fine + # The entry might already exist + logger.debug("Failed to migrate entry to current algorithm: %s", ex) + + return value + + return None @transaction() def set_shared_value(key: SharedKey, value: Any) -> None: - uuid_key = uuid3(NAMESPACE, key) + """ + Set a shared value by key, using current hash algorithm. + + Note: This creates a new entry. To update existing entries, + use KeyValueDAO.upsert_entry directly. + """ + namespace = get_uuid_namespace("") + uuid_key = uuid3(namespace, key) KeyValueDAO.create_entry(RESOURCE, value, CODEC, uuid_key) diff --git a/superset/key_value/utils.py b/superset/key_value/utils.py index e7be4731161..6bc7f0afb44 100644 --- a/superset/key_value/utils.py +++ b/superset/key_value/utils.py @@ -16,12 +16,14 @@ # under the License. 
from __future__ import annotations +import hashlib from hashlib import md5 from secrets import token_urlsafe from typing import Any from uuid import UUID, uuid3 import hashids +from flask import current_app from flask_babel import gettext as _ from superset.key_value.exceptions import KeyValueParseKeyError @@ -66,13 +68,94 @@ def decode_permalink_id(key: str, salt: str) -> int: raise KeyValueParseKeyError(_("Invalid permalink key")) -def get_uuid_namespace(seed: str) -> UUID: +def _uuid_namespace_from_md5(seed: str) -> UUID: + """Generate UUID namespace from MD5 hash (legacy compatibility).""" md5_obj = md5() # noqa: S324 md5_obj.update(seed.encode("utf-8")) return UUID(md5_obj.hexdigest()) +def _uuid_namespace_from_sha256(seed: str) -> UUID: + """Generate UUID namespace from SHA-256 hash (first 16 bytes).""" + sha256_obj = hashlib.sha256() + sha256_obj.update(seed.encode("utf-8")) + # Use first 16 bytes of SHA-256 digest for UUID + return UUID(bytes=sha256_obj.digest()[:16]) + + +# UUID namespace generator dispatch table +_UUID_NAMESPACE_GENERATORS = { + "md5": _uuid_namespace_from_md5, + "sha256": _uuid_namespace_from_sha256, +} + + +def get_uuid_namespace_with_algorithm(seed: str, algorithm: str) -> UUID: + """ + Generate a UUID namespace from a seed string using specified hash algorithm. + + Args: + seed: Seed string for namespace generation + algorithm: Hash algorithm to use ('sha256' or 'md5') + + Returns: + UUID namespace + """ + generator = _UUID_NAMESPACE_GENERATORS.get(algorithm) + if generator is None: + raise ValueError(f"Unsupported hash algorithm: {algorithm}") + return generator(seed) + + +def get_uuid_namespace(seed: str, app: Any = None) -> UUID: + """ + Generate a UUID namespace from a seed string using configured hash algorithm. 
+ + Args: + seed: Seed string for namespace generation + app: Flask app instance (optional, uses current_app if not provided) + + Returns: + UUID namespace + """ + app = app or current_app + algorithm = app.config["HASH_ALGORITHM"] + return get_uuid_namespace_with_algorithm(seed, algorithm) + + +def get_deterministic_uuid_with_algorithm( + namespace: str, payload: Any, algorithm: str +) -> UUID: + """ + Get a deterministic UUID (uuid3) using specified hash algorithm. + + Args: + namespace: Namespace string for UUID generation + payload: JSON-serializable payload + algorithm: Hash algorithm to use ('sha256' or 'md5') + + Returns: + Deterministic UUID + """ + payload_str = json_dumps_w_dates(payload, sort_keys=True) + return uuid3(get_uuid_namespace_with_algorithm(namespace, algorithm), payload_str) + + def get_deterministic_uuid(namespace: str, payload: Any) -> UUID: """Get a deterministic UUID (uuid3) from a salt and a JSON-serializable payload.""" payload_str = json_dumps_w_dates(payload, sort_keys=True) return uuid3(get_uuid_namespace(namespace), payload_str) + + +def get_fallback_algorithms(app: Any = None) -> list[str]: + """ + Get the list of fallback hash algorithms from config. 
+ + Args: + app: Flask app instance (optional, uses current_app if not provided) + + Returns: + List of fallback algorithm names (empty list if none configured) + """ + app = app or current_app + return app.config.get("HASH_ALGORITHM_FALLBACKS", []) diff --git a/superset/thumbnails/digest.py b/superset/thumbnails/digest.py index 1cdf8d3a642..db4a0249f0f 100644 --- a/superset/thumbnails/digest.py +++ b/superset/thumbnails/digest.py @@ -27,7 +27,7 @@ from superset.tasks.exceptions import ExecutorNotFoundError from superset.tasks.types import ExecutorType from superset.tasks.utils import get_current_user, get_executor from superset.utils.core import override_user -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str if TYPE_CHECKING: from superset.connectors.sqla.models import BaseDatasource, SqlaTable @@ -110,7 +110,7 @@ def get_dashboard_digest(dashboard: Dashboard) -> str | None: unique_string, dashboard.datasources, executor ) - return md5_sha_from_str(unique_string) + return hash_from_str(unique_string) def get_chart_digest(chart: Slice) -> str | None: @@ -130,4 +130,4 @@ def get_chart_digest(chart: Slice) -> str | None: unique_string = _adjust_string_for_executor(unique_string, executor_type, executor) unique_string = _adjust_string_with_rls(unique_string, [chart.datasource], executor) - return md5_sha_from_str(unique_string) + return hash_from_str(unique_string) diff --git a/superset/utils/cache.py b/superset/utils/cache.py index 03e0b57dbd0..76294696e43 100644 --- a/superset/utils/cache.py +++ b/superset/utils/cache.py @@ -31,14 +31,14 @@ from superset import db from superset.constants import CACHE_DISABLED_TIMEOUT from superset.extensions import cache_manager from superset.models.cache import CacheKey -from superset.utils.hashing import md5_sha_from_dict +from superset.utils.hashing import hash_from_dict from superset.utils.json import json_int_dttm_ser logger = logging.getLogger(__name__) def 
generate_cache_key(values_dict: dict[str, Any], key_prefix: str = "") -> str: - hash_str = md5_sha_from_dict(values_dict, default=json_int_dttm_ser) + hash_str = hash_from_dict(values_dict, default=json_int_dttm_ser) cache_key = f"{key_prefix}{hash_str}" if logger.isEnabledFor(logging.DEBUG): diff --git a/superset/utils/core.py b/superset/utils/core.py index 9879b7d8740..f089aadf70e 100644 --- a/superset/utils/core.py +++ b/superset/utils/core.py @@ -111,7 +111,7 @@ from superset.superset_typing import ( from superset.utils.backports import StrEnum from superset.utils.database import get_example_database from superset.utils.date_parser import parse_human_timedelta -from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str +from superset.utils.hashing import hash_from_dict, hash_from_str from superset.utils.pandas import detect_datetime_format if TYPE_CHECKING: @@ -992,7 +992,7 @@ def simple_filter_to_adhoc( } if filter_clause.get("isExtra"): result["isExtra"] = True - result["filterOptionName"] = md5_sha_from_dict(cast(dict[Any, Any], result)) + result["filterOptionName"] = hash_from_dict(cast(dict[Any, Any], result)) return result @@ -1005,7 +1005,7 @@ def form_data_to_adhoc(form_data: dict[str, Any], clause: str) -> AdhocFilterCla "expressionType": "SQL", "sqlExpression": form_data.get(clause), } - result["filterOptionName"] = md5_sha_from_dict(cast(dict[Any, Any], result)) + result["filterOptionName"] = hash_from_dict(cast(dict[Any, Any], result)) return result @@ -1471,7 +1471,7 @@ def create_ssl_cert_file(certificate: str) -> str: :return: The path to the certificate file :raises CertificateException: If certificate is not valid/unparseable """ - filename = f"{md5_sha_from_str(certificate)}.crt" + filename = f"{hash_from_str(certificate)}.crt" # pylint: disable=import-outside-toplevel cert_dir = app.config["SSL_CERT_PATH"] diff --git a/superset/utils/hashing.py b/superset/utils/hashing.py index d5c6de68e24..e1ff1a08927 100644 --- 
a/superset/utils/hashing.py +++ b/superset/utils/hashing.py @@ -14,23 +14,84 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import hashlib -from typing import Any, Callable, Optional +import logging +from typing import Any, Callable, Literal, Optional + +from flask import current_app from superset.utils import json +logger = logging.getLogger(__name__) -def md5_sha_from_str(val: str) -> str: - return hashlib.md5(val.encode("utf-8")).hexdigest() # noqa: S324 +HashAlgorithm = Literal["md5", "sha256"] + +# Hash function lookup table for efficient dispatch +_HASH_FUNCTIONS: dict[str, Callable[[bytes], str]] = { + "sha256": lambda data: hashlib.sha256(data).hexdigest(), + "md5": lambda data: hashlib.md5(data).hexdigest(), # noqa: S324 +} -def md5_sha_from_dict( +def get_hash_algorithm() -> HashAlgorithm: + """ + Get the configured hash algorithm for non-cryptographic purposes. + + Returns: + Hash algorithm name ('md5' or 'sha256') + """ + return current_app.config["HASH_ALGORITHM"] + + +def hash_from_str(val: str, algorithm: Optional[HashAlgorithm] = None) -> str: + """ + Generate a hash from a string using the configured or specified algorithm. 
+ + Args: + val: String to hash + algorithm: Hash algorithm to use (defaults to configured algorithm) + + Returns: + Hexadecimal hash digest string + + Examples: + >>> hash_from_str("test") # Uses configured algorithm + '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08' + >>> hash_from_str("test", algorithm="md5") # Force MD5 + '098f6bcd4621d373cade4e832627b4f6' + """ + if algorithm is None: + algorithm = get_hash_algorithm() + + hash_func = _HASH_FUNCTIONS.get(algorithm) + if hash_func is None: + raise ValueError(f"Unsupported hash algorithm: {algorithm}") + + return hash_func(val.encode("utf-8")) + + +def hash_from_dict( obj: dict[Any, Any], ignore_nan: bool = False, default: Optional[Callable[[Any], Any]] = None, + algorithm: Optional[HashAlgorithm] = None, ) -> str: + """ + Generate a hash from a dictionary using the configured or specified algorithm. + + Args: + obj: Dictionary to hash + ignore_nan: Whether to ignore NaN values in JSON serialization + default: Default function for JSON serialization + algorithm: Hash algorithm to use (defaults to configured algorithm) + + Returns: + Hexadecimal hash digest string + """ json_data = json.dumps( obj, sort_keys=True, ignore_nan=ignore_nan, default=default, allow_nan=True ) - return md5_sha_from_str(json_data) + return hash_from_str(json_data, algorithm=algorithm) diff --git a/superset/utils/screenshots.py b/superset/utils/screenshots.py index b737a2d4950..25c302d35b5 100644 --- a/superset/utils/screenshots.py +++ b/superset/utils/screenshots.py @@ -28,7 +28,7 @@ from flask import current_app as app from superset import feature_flag_manager, thumbnail_cache from superset.exceptions import ScreenshotImageNotAvailableException from superset.extensions import event_logger -from superset.utils.hashing import md5_sha_from_dict +from superset.utils.hashing import hash_from_dict from superset.utils.urls import modify_url_query from superset.utils.webdriver import ( ChartStandaloneMode, @@ -227,7 +227,7 @@ 
class BaseScreenshot: "window_size": window_size, "thumb_size": thumb_size, } - return md5_sha_from_dict(args) + return hash_from_dict(args) def get_from_cache( self, @@ -396,4 +396,4 @@ class DashboardScreenshot(BaseScreenshot): "thumb_size": thumb_size, "permalink_key": permalink_key, } - return md5_sha_from_dict(args) + return hash_from_dict(args) diff --git a/superset/viz.py b/superset/viz.py index 2e6086ab51b..b641009602f 100644 --- a/superset/viz.py +++ b/superset/viz.py @@ -78,7 +78,7 @@ from superset.utils.core import ( simple_filter_to_adhoc, ) from superset.utils.date_parser import get_since_until, parse_past_timedelta -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str if TYPE_CHECKING: from superset.connectors.sqla.models import BaseDatasource @@ -473,7 +473,7 @@ class BaseViz: # pylint: disable=too-many-public-methods cache_dict["rls"] = security_manager.get_rls_cache_key(self.datasource) cache_dict["changed_on"] = self.datasource.changed_on json_data = self.json_dumps(cache_dict, sort_keys=True) - return md5_sha_from_str(json_data) + return hash_from_str(json_data) @deprecated(deprecated_in="3.0") def get_payload(self, query_obj: QueryObjectDict | None = None) -> VizPayload: diff --git a/tests/integration_tests/db_engine_specs/bigquery_tests.py b/tests/integration_tests/db_engine_specs/bigquery_tests.py index aa584a26ee2..73bd2cbaf5a 100644 --- a/tests/integration_tests/db_engine_specs/bigquery_tests.py +++ b/tests/integration_tests/db_engine_specs/bigquery_tests.py @@ -47,11 +47,12 @@ class TestBigQueryDbEngineSpec(SupersetTestCase): """ DB Eng Specs (bigquery): Test column label """ + # Expected labels with SHA-256 hash suffix (first 5 chars prefixed with _) test_cases = { "Col": "Col", - "SUM(x)": "SUM_x__5f110", - "SUM[x]": "SUM_x__7ebe1", - "12345_col": "_12345_col_8d390", + "SUM(x)": "SUM_x__b681e", + "SUM[x]": "SUM_x__ceaf6", + "12345_col": "_12345_col_b1415", } for original, expected in 
test_cases.items(): actual = BigQueryEngineSpec.make_label_compatible(column(original).name) diff --git a/tests/integration_tests/sqla_models_tests.py b/tests/integration_tests/sqla_models_tests.py index f770b727319..4f10ede411e 100644 --- a/tests/integration_tests/sqla_models_tests.py +++ b/tests/integration_tests/sqla_models_tests.py @@ -520,7 +520,8 @@ class TestDatabaseModel(SupersetTestCase): sqlaq = table.get_sqla_query(**query_obj) assert sqlaq.labels_expected == ["user", "COUNT_DISTINCT(user)"] sql = table.database.compile_sqla_query(sqlaq.sqla_query) - assert "COUNT_DISTINCT_user__00db1" in sql + # SHA-256 hash of "COUNT_DISTINCT(user)" starts with "01c94" + assert "COUNT_DISTINCT_user__01c94" in sql db.session.delete(table) db.session.delete(database) db.session.commit() diff --git a/tests/integration_tests/thumbnails_tests.py b/tests/integration_tests/thumbnails_tests.py index 8949474250f..e35cd242601 100644 --- a/tests/integration_tests/thumbnails_tests.py +++ b/tests/integration_tests/thumbnails_tests.py @@ -191,7 +191,8 @@ class TestWebDriverSelenium(SupersetTestCase): class TestThumbnails(SupersetTestCase): mock_image = b"bytes mock image" digest_return_value = "foo_bar" - digest_hash = "5c7d96a3dd7a87850a2ef34087565a6e" + # SHA-256 hash of "foo_bar" (default HASH_ALGORITHM is sha256) + digest_hash = "4928cae8b37b3d1113f5e01e60c967df6c2b9e826dc7d91488d23a62fec715ba" def _get_id_and_thumbnail_url(self, url: str) -> tuple[int, str]: rv = self.client.get(url) diff --git a/tests/integration_tests/utils/core_tests.py b/tests/integration_tests/utils/core_tests.py index f04f32dbdc0..0163a34ac30 100644 --- a/tests/integration_tests/utils/core_tests.py +++ b/tests/integration_tests/utils/core_tests.py @@ -19,7 +19,7 @@ import pytest from superset.utils.core import form_data_to_adhoc, simple_filter_to_adhoc -def test_simple_filter_to_adhoc_generates_deterministic_values(): +def test_simple_filter_to_adhoc_generates_deterministic_values(app_context): input_1 = 
{ "op": "IS NOT NULL", "col": "LATITUDE", @@ -30,13 +30,16 @@ def test_simple_filter_to_adhoc_generates_deterministic_values(): # The result is the same when given the same input assert simple_filter_to_adhoc(input_1) == simple_filter_to_adhoc(input_1) + # SHA-256 filterOptionName hash with default HASH_ALGORITHM assert simple_filter_to_adhoc(input_1) == { "clause": "WHERE", "expressionType": "SIMPLE", "comparator": "", "operator": "IS NOT NULL", "subject": "LATITUDE", - "filterOptionName": "6ac89d498115da22396f80a765cffc70", + "filterOptionName": ( + "84ffe4dba1764c30568e19d4dbbf64717fbc514fad1a8a995debfc72b344aa76" + ), } # The result is different when given different input @@ -47,22 +50,27 @@ def test_simple_filter_to_adhoc_generates_deterministic_values(): "comparator": "", "operator": "IS NOT NULL", "subject": "LONGITUDE", - "filterOptionName": "9c984bd3714883ca859948354ce26ab9", + "filterOptionName": ( + "c5a54054b987350b5594ee73772fbe71e9651a475bfcb7ae740e0799f12c8ff7" + ), } -def test_form_data_to_adhoc_generates_deterministic_values(): +def test_form_data_to_adhoc_generates_deterministic_values(app_context): form_data = {"where": "1 = 1", "having": "count(*) > 1"} # The result is the same when given the same input assert form_data_to_adhoc(form_data, "where") == form_data_to_adhoc( form_data, "where" ) + # SHA-256 filterOptionName hash with default HASH_ALGORITHM assert form_data_to_adhoc(form_data, "where") == { "clause": "WHERE", "expressionType": "SQL", "sqlExpression": "1 = 1", - "filterOptionName": "99fe79985afbddea4492626dc6a87b74", + "filterOptionName": ( + "11f7ef40818a0d614cc9a989d5d75ee969b5b3724e973dbf0194e3a339aa0544" + ), } # The result is different when given different input @@ -73,11 +81,13 @@ def test_form_data_to_adhoc_generates_deterministic_values(): "clause": "HAVING", "expressionType": "SQL", "sqlExpression": "count(*) > 1", - "filterOptionName": "1da11f6b709c3190daeabb84f77fc8c2", + "filterOptionName": ( + 
"8768cb92fa8a8629695dfe3a4010daefc5d7586934d1aa775f22fb03b46b5dcb" + ), } -def test_form_data_to_adhoc_incorrect_clause_type(): +def test_form_data_to_adhoc_incorrect_clause_type(app_context): form_data = {"where": "1 = 1", "having": "count(*) > 1"} with pytest.raises(ValueError): # noqa: PT011 diff --git a/tests/integration_tests/utils/hashing_tests.py b/tests/integration_tests/utils/hashing_tests.py index cfdbfbc5f6a..c6a16cd4eab 100644 --- a/tests/integration_tests/utils/hashing_tests.py +++ b/tests/integration_tests/utils/hashing_tests.py @@ -17,80 +17,184 @@ import datetime import math from typing import Any +from unittest.mock import patch import pytest # noqa: F401 -from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str +from superset.utils.hashing import hash_from_dict, hash_from_str def test_basic_md5_sha(): - obj = { - "product": "Coffee", - "company": "Gobias Industries", - "price_in_cents": 4000, - } + """Test basic hashing with MD5 (legacy mode).""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="md5"): + obj = { + "product": "Coffee", + "company": "Gobias Industries", + "price_in_cents": 4000, + } - serialized_obj = ( - '{"company": "Gobias Industries", "price_in_cents": 4000, "product": "Coffee"}' - ) + serialized_obj = '{"company": "Gobias Industries", "price_in_cents": 4000, "product": "Coffee"}' # noqa: E501 - assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(obj) - assert md5_sha_from_str(serialized_obj) == "35f22273cd6a6798b04f8ddef51135e3" + assert hash_from_str(serialized_obj) == hash_from_dict(obj) + assert hash_from_str(serialized_obj) == "35f22273cd6a6798b04f8ddef51135e3" + + +def test_basic_sha256(): + """Test basic hashing with SHA-256 (FedRAMP compliant mode).""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="sha256"): + obj = { + "product": "Coffee", + "company": "Gobias Industries", + "price_in_cents": 4000, + } + + serialized_obj = '{"company": "Gobias 
Industries", "price_in_cents": 4000, "product": "Coffee"}' # noqa: E501 + + assert hash_from_str(serialized_obj) == hash_from_dict(obj) + # SHA-256 hash of the serialized object + assert ( + hash_from_str(serialized_obj) + == "77bc5927f828903888572ab91c4f3114b36609ca5fb92039bef380d622cef596" + ) def test_sort_order_md5_sha(): - obj_1 = { - "product": "Coffee", - "price_in_cents": 4000, - "company": "Gobias Industries", - } + """Test dictionary key order independence with MD5.""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="md5"): + obj_1 = { + "product": "Coffee", + "price_in_cents": 4000, + "company": "Gobias Industries", + } - obj_2 = { - "product": "Coffee", - "company": "Gobias Industries", - "price_in_cents": 4000, - } + obj_2 = { + "product": "Coffee", + "company": "Gobias Industries", + "price_in_cents": 4000, + } - assert md5_sha_from_dict(obj_1) == md5_sha_from_dict(obj_2) - assert md5_sha_from_dict(obj_1) == "35f22273cd6a6798b04f8ddef51135e3" + assert hash_from_dict(obj_1) == hash_from_dict(obj_2) + assert hash_from_dict(obj_1) == "35f22273cd6a6798b04f8ddef51135e3" + + +def test_sort_order_sha256(): + """Test dictionary key order independence with SHA-256.""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="sha256"): + obj_1 = { + "product": "Coffee", + "price_in_cents": 4000, + "company": "Gobias Industries", + } + + obj_2 = { + "product": "Coffee", + "company": "Gobias Industries", + "price_in_cents": 4000, + } + + assert hash_from_dict(obj_1) == hash_from_dict(obj_2) + assert ( + hash_from_dict(obj_1) + == "77bc5927f828903888572ab91c4f3114b36609ca5fb92039bef380d622cef596" + ) def test_custom_default_md5_sha(): - def custom_datetime_serializer(obj: Any): - if isinstance(obj, datetime.datetime): - return "" + """Test custom serializer with MD5.""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="md5"): - obj = { - "product": "Coffee", - "company": "Gobias Industries", - "datetime": 
datetime.datetime.now(), - } + def custom_datetime_serializer(obj: Any): + if isinstance(obj, datetime.datetime): + return "" - serialized_obj = '{"company": "Gobias Industries", "datetime": "", "product": "Coffee"}' # noqa: E501 + obj = { + "product": "Coffee", + "company": "Gobias Industries", + "datetime": datetime.datetime.now(), + } - assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict( - obj, default=custom_datetime_serializer - ) - assert md5_sha_from_str(serialized_obj) == "dc280121213aabcaeb8087aef268fd0d" + serialized_obj = '{"company": "Gobias Industries", "datetime": "", "product": "Coffee"}' # noqa: E501 + + assert hash_from_str(serialized_obj) == hash_from_dict( + obj, default=custom_datetime_serializer + ) + assert hash_from_str(serialized_obj) == "dc280121213aabcaeb8087aef268fd0d" + + +def test_custom_default_sha256(): + """Test custom serializer with SHA-256.""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="sha256"): + + def custom_datetime_serializer(obj: Any): + if isinstance(obj, datetime.datetime): + return "" + + obj = { + "product": "Coffee", + "company": "Gobias Industries", + "datetime": datetime.datetime.now(), + } + + serialized_obj = '{"company": "Gobias Industries", "datetime": "", "product": "Coffee"}' # noqa: E501 + + assert hash_from_str(serialized_obj) == hash_from_dict( + obj, default=custom_datetime_serializer + ) + assert ( + hash_from_str(serialized_obj) + == "417b57b6f3979bdd0937286f2dc872089fcd5fdb7daad1d3dbcaae1e34cc564e" + ) def test_ignore_nan_md5_sha(): - obj = { - "product": "Coffee", - "company": "Gobias Industries", - "price": math.nan, - } + """Test NaN handling with MD5.""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="md5"): + obj = { + "product": "Coffee", + "company": "Gobias Industries", + "price": math.nan, + } - serialized_obj = ( - '{"company": "Gobias Industries", "price": NaN, "product": "Coffee"}' - ) + serialized_obj = ( + '{"company": "Gobias 
Industries", "price": NaN, "product": "Coffee"}' + ) - assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(obj) - assert md5_sha_from_str(serialized_obj) == "5d129d1dffebc0bacc734366476d586d" + assert hash_from_str(serialized_obj) == hash_from_dict(obj) + assert hash_from_str(serialized_obj) == "5d129d1dffebc0bacc734366476d586d" - serialized_obj = ( - '{"company": "Gobias Industries", "price": null, "product": "Coffee"}' - ) + serialized_obj = ( + '{"company": "Gobias Industries", "price": null, "product": "Coffee"}' + ) - assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(obj, ignore_nan=True) - assert md5_sha_from_str(serialized_obj) == "40e87d61f6add03816bccdeac5713b9f" + assert hash_from_str(serialized_obj) == hash_from_dict(obj, ignore_nan=True) + assert hash_from_str(serialized_obj) == "40e87d61f6add03816bccdeac5713b9f" + + +def test_ignore_nan_sha256(): + """Test NaN handling with SHA-256.""" + with patch("superset.utils.hashing.get_hash_algorithm", return_value="sha256"): + obj = { + "product": "Coffee", + "company": "Gobias Industries", + "price": math.nan, + } + + serialized_obj = ( + '{"company": "Gobias Industries", "price": NaN, "product": "Coffee"}' + ) + + assert hash_from_str(serialized_obj) == hash_from_dict(obj) + assert ( + hash_from_str(serialized_obj) + == "efff87146d137b2d0392eff94b74e7644c3a6b135b91563400029995b9236820" + ) + + serialized_obj = ( + '{"company": "Gobias Industries", "price": null, "product": "Coffee"}' + ) + + assert hash_from_str(serialized_obj) == hash_from_dict(obj, ignore_nan=True) + assert ( + hash_from_str(serialized_obj) + == "9b66e0af1cb74aa58c3ab08654c086ebfdada14b1e6312b4002edc854d99d24d" + ) diff --git a/tests/integration_tests/utils_tests.py b/tests/integration_tests/utils_tests.py index 01d82adafbf..1b28cc0ee48 100644 --- a/tests/integration_tests/utils_tests.py +++ b/tests/integration_tests/utils_tests.py @@ -60,7 +60,7 @@ from superset.utils.core import ( from superset.utils import json from 
superset.utils.database import get_or_create_db from superset.utils import schema -from superset.utils.hashing import md5_sha_from_str +from superset.utils.hashing import hash_from_str from superset.views.utils import build_extra_filters, get_form_data # noqa: F401 from tests.integration_tests.base_tests import SupersetTestCase from tests.integration_tests.constants import ADMIN_USERNAME @@ -80,7 +80,10 @@ class TestUtils(SupersetTestCase): { "clause": "WHERE", "expressionType": "SQL", - "filterOptionName": "46fb6d7891e23596e42ae38da94a57e0", + # SHA-256 hash with default HASH_ALGORITHM + "filterOptionName": ( + "efcc050e11722b0bc338c0abc71a4270ce71df7a10294fcf8e8f03f5cb8978f3" + ), "sqlExpression": "a = 1", } ] @@ -96,7 +99,10 @@ class TestUtils(SupersetTestCase): "clause": "WHERE", "comparator": "someval", "expressionType": "SIMPLE", - "filterOptionName": "135c7ee246666b840a3d7a9c3a30cf38", + # SHA-256 hash with default HASH_ALGORITHM + "filterOptionName": ( + "d72b098cd87dc5040410c322373562ca65d1a736e1e53e9cae39254394b42a44" + ), "operator": "in", "subject": "a", } @@ -112,7 +118,10 @@ class TestUtils(SupersetTestCase): { "clause": "WHERE", "expressionType": "SQL", - "filterOptionName": "46fb6d7891e23596e42ae38da94a57e0", + # SHA-256 hash with default HASH_ALGORITHM + "filterOptionName": ( + "efcc050e11722b0bc338c0abc71a4270ce71df7a10294fcf8e8f03f5cb8978f3" + ), "sqlExpression": "a = 1", } ] @@ -127,7 +136,10 @@ class TestUtils(SupersetTestCase): { "clause": "HAVING", "expressionType": "SQL", - "filterOptionName": "683f1c26466ab912f75a00842e0f2f7b", + # SHA-256 hash with default HASH_ALGORITHM + "filterOptionName": ( + "63a84e72e4dac2bb08de866699d9c4f8ccc3640f6c3c0b734c75b937fac54bd6" + ), "sqlExpression": "COUNT(1) = 1", } ] @@ -266,7 +278,7 @@ class TestUtils(SupersetTestCase): def test_ssl_certificate_file_creation(self): path = create_ssl_cert_file(ssl_certificate) - expected_filename = md5_sha_from_str(ssl_certificate) + expected_filename = 
hash_from_str(ssl_certificate) assert expected_filename in path assert os.path.exists(path) diff --git a/tests/integration_tests/viz_tests.py b/tests/integration_tests/viz_tests.py index 23aab391475..7ebcf1197fc 100644 --- a/tests/integration_tests/viz_tests.py +++ b/tests/integration_tests/viz_tests.py @@ -1145,12 +1145,15 @@ class TestBaseDeckGLViz(SupersetTestCase): } datasource = self.get_datasource_mock() + # SHA-256 filterOptionName hashes with default HASH_ALGORITHM expected_results = { "latlong_key": [ { "clause": "WHERE", "expressionType": "SIMPLE", - "filterOptionName": "c7f171cf3204bcbf456acfeac5cd9afd", + "filterOptionName": ( + "980dd3068274177120307d9182ea8e8ee1b7824d34fbc21c529441f5d3279f7f" + ), "comparator": "", "operator": "IS NOT NULL", "subject": "lat", @@ -1158,7 +1161,9 @@ class TestBaseDeckGLViz(SupersetTestCase): { "clause": "WHERE", "expressionType": "SIMPLE", - "filterOptionName": "52634073fbb8ae0a3aa59ad48abac55e", + "filterOptionName": ( + "e368c259da27e5ec6a854772d9bff2c2af8dd5762352cef4ff6afc5bd8b6b9ea" + ), "comparator": "", "operator": "IS NOT NULL", "subject": "lon", @@ -1168,7 +1173,9 @@ class TestBaseDeckGLViz(SupersetTestCase): { "clause": "WHERE", "expressionType": "SIMPLE", - "filterOptionName": "cae5c925c140593743da08499e6fb207", + "filterOptionName": ( + "6ea33b70ab781033af421240019d3e3ad782928a3ad2999538f1f4b2a52305e2" + ), "comparator": "", "operator": "IS NOT NULL", "subject": "lonlat", @@ -1178,7 +1185,9 @@ class TestBaseDeckGLViz(SupersetTestCase): { "clause": "WHERE", "expressionType": "SIMPLE", - "filterOptionName": "d84f55222d8e414e888fa5f990b341d2", + "filterOptionName": ( + "48bbd94cd6afb1885d8550e2928bc01a2d3bc7d1f4f1d0929b10d6f4021b7f14" + ), "comparator": "", "operator": "IS NOT NULL", "subject": "geo", diff --git a/tests/unit_tests/db_engine_specs/test_clickhouse.py b/tests/unit_tests/db_engine_specs/test_clickhouse.py index 5e532e6ffdc..0c42bc19492 100644 --- 
a/tests/unit_tests/db_engine_specs/test_clickhouse.py +++ b/tests/unit_tests/db_engine_specs/test_clickhouse.py @@ -215,8 +215,9 @@ def test_connect_get_column_spec( @pytest.mark.parametrize( "column_name,expected_result", [ - ("time", "time_07cc69"), - ("count", "count_e2942a"), + # SHA-256 hash suffix (first 6 chars) with default HASH_ALGORITHM + ("time", "time_336074"), + ("count", "count_6c3549"), ], ) def test_connect_make_label_compatible(column_name: str, expected_result: str) -> None: diff --git a/tests/unit_tests/db_engine_specs/test_databend.py b/tests/unit_tests/db_engine_specs/test_databend.py index b1c25bbbe9a..ccced6627cb 100644 --- a/tests/unit_tests/db_engine_specs/test_databend.py +++ b/tests/unit_tests/db_engine_specs/test_databend.py @@ -127,8 +127,9 @@ def test_get_column_spec( @pytest.mark.parametrize( "column_name,expected_result", [ - ("time", "time_07cc69"), - ("count", "count_e2942a"), + # SHA-256 hash suffix (first 6 chars) with default HASH_ALGORITHM + ("time", "time_336074"), + ("count", "count_6c3549"), ], ) def test_make_label_compatible(column_name: str, expected_result: str) -> None: diff --git a/tests/unit_tests/db_engine_specs/test_drill.py b/tests/unit_tests/db_engine_specs/test_drill.py index 60a455e8afc..011baa3bb2a 100644 --- a/tests/unit_tests/db_engine_specs/test_drill.py +++ b/tests/unit_tests/db_engine_specs/test_drill.py @@ -162,8 +162,9 @@ def test_get_schema_from_engine_params() -> None: @pytest.mark.parametrize( "column_name,expected_result", [ - ("time", "time_07cc69"), - ("count", "count_e2942a"), + # SHA-256 hash suffix (first 6 chars) with default HASH_ALGORITHM + ("time", "time_336074"), + ("count", "count_6c3549"), ], ) def test_connect_make_label_compatible(column_name: str, expected_result: str) -> None: diff --git a/tests/unit_tests/db_engine_specs/test_oracle.py b/tests/unit_tests/db_engine_specs/test_oracle.py index 01999b5e274..991eacf8b18 100644 --- a/tests/unit_tests/db_engine_specs/test_oracle.py +++ 
b/tests/unit_tests/db_engine_specs/test_oracle.py @@ -31,7 +31,8 @@ from tests.unit_tests.fixtures.common import dttm # noqa: F401 @pytest.mark.parametrize( "column_name,expected_result", [ - ("a" * 129, "b325dc1c6f5e7a2b7cf465b9feab7948"), + # SHA-256 hash of 129 'a' characters with default HASH_ALGORITHM + ("a" * 129, "c12cb024a2e5551cca0e08fce8f1c5e314555cc3fef6329ee994a3db752166ae"), ("snake_label", "snake_label"), ("camelLabel", "camelLabel"), ], diff --git a/tests/unit_tests/key_value/test_shared_entries_migration.py b/tests/unit_tests/key_value/test_shared_entries_migration.py new file mode 100644 index 00000000000..1c85020fac6 --- /dev/null +++ b/tests/unit_tests/key_value/test_shared_entries_migration.py @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test hash algorithm migration for shared entries.""" + +from unittest.mock import MagicMock, patch +from uuid import uuid3 + + +def test_get_shared_value_fallback_to_md5() -> None: + """Test that get_shared_value falls back to MD5 when SHA-256 doesn't find entry.""" + from superset.key_value.shared_entries import get_shared_value + from superset.key_value.types import SharedKey + from superset.key_value.utils import get_uuid_namespace_with_algorithm + + key = SharedKey.DASHBOARD_PERMALINK_SALT + expected_value = "test_salt_value_12345" + + # Calculate what the MD5 UUID would be + namespace_md5 = get_uuid_namespace_with_algorithm("", "md5") + uuid_md5 = uuid3(namespace_md5, key) + + # Mock KeyValueDAO to simulate MD5 entry exists, SHA-256 doesn't + mock_dao = MagicMock() + + def mock_get_value(resource, uuid_key, codec): + # Only return value if UUID matches MD5 version + if uuid_key == uuid_md5: + return expected_value + return None + + mock_dao.get_value.side_effect = mock_get_value + + # Mock current_app to use SHA-256 with MD5 fallback + mock_app = MagicMock() + mock_app.config = { + "HASH_ALGORITHM": "sha256", + "HASH_ALGORITHM_FALLBACKS": ["md5"], + } + + with patch("superset.key_value.shared_entries.KeyValueDAO", mock_dao): + with patch("superset.key_value.utils.current_app", mock_app): + result = get_shared_value(key) + + # Should have found the MD5 entry + assert result == expected_value + + # Should have called get_value twice (SHA-256 first, then MD5) + assert mock_dao.get_value.call_count == 2 + + +def test_get_shared_value_no_fallback_when_md5() -> None: + """Test get_shared_value with MD5 primary and MD5 in fallbacks.""" + from superset.key_value.shared_entries import get_shared_value + from superset.key_value.types import SharedKey + + key = SharedKey.DASHBOARD_PERMALINK_SALT + + # Mock KeyValueDAO to return None (entry not found) + mock_dao = MagicMock() + mock_dao.get_value.return_value = None + + # Mock current_app to use MD5 with MD5 fallback 
(same algorithm) + # This would cause 2 lookups if fallback included same algorithm + mock_app = MagicMock() + mock_app.config = { + "HASH_ALGORITHM": "md5", + "HASH_ALGORITHM_FALLBACKS": ["md5"], # Fallback is same as primary + } + + with patch("superset.key_value.shared_entries.KeyValueDAO", mock_dao): + with patch("superset.key_value.utils.current_app", mock_app): + result = get_shared_value(key) + + # Should return None (not found) + assert result is None + + # Should have called get_value twice (primary + fallback, even though same algo) + # This is expected behavior with current implementation + assert mock_dao.get_value.call_count == 2 + + +def test_get_shared_value_finds_sha256_first() -> None: + """Test that get_shared_value finds SHA-256 entry first without fallback.""" + from superset.key_value.shared_entries import get_shared_value + from superset.key_value.types import SharedKey + from superset.key_value.utils import get_uuid_namespace_with_algorithm + + key = SharedKey.DASHBOARD_PERMALINK_SALT + expected_value = "new_sha256_salt" + + # Calculate what the SHA-256 UUID would be + namespace_sha256 = get_uuid_namespace_with_algorithm("", "sha256") + uuid_sha256 = uuid3(namespace_sha256, key) + + # Mock KeyValueDAO to return value for SHA-256 + mock_dao = MagicMock() + + def mock_get_value(resource, uuid_key, codec): + # Return value if UUID matches SHA-256 version + if uuid_key == uuid_sha256: + return expected_value + return None + + mock_dao.get_value.side_effect = mock_get_value + + # Mock current_app to use SHA-256 with MD5 fallback + mock_app = MagicMock() + mock_app.config = { + "HASH_ALGORITHM": "sha256", + "HASH_ALGORITHM_FALLBACKS": ["md5"], + } + + with patch("superset.key_value.shared_entries.KeyValueDAO", mock_dao): + with patch("superset.key_value.utils.current_app", mock_app): + result = get_shared_value(key) + + # Should have found the SHA-256 entry + assert result == expected_value + + # Should have called get_value only once (found 
immediately) + assert mock_dao.get_value.call_count == 1 diff --git a/tests/unit_tests/key_value/utils_test.py b/tests/unit_tests/key_value/utils_test.py index 5d78f6361c0..8a0ab16e50d 100644 --- a/tests/unit_tests/key_value/utils_test.py +++ b/tests/unit_tests/key_value/utils_test.py @@ -16,6 +16,7 @@ # under the License. from __future__ import annotations +from unittest.mock import MagicMock from uuid import UUID import pytest @@ -28,25 +29,23 @@ UUID_KEY = UUID("3e7a2ab8-bcaf-49b0-a5df-dfb432f291cc") ID_KEY = 123 -def test_get_filter_uuid() -> None: +@pytest.mark.parametrize( + "key,expected_filter", + [ + (UUID_KEY, {"resource": RESOURCE, "uuid": UUID_KEY}), + (ID_KEY, {"resource": RESOURCE, "id": ID_KEY}), + ], + ids=["uuid_key", "id_key"], +) +def test_get_filter(key, expected_filter) -> None: + """Test get_filter with different key types.""" from superset.key_value.utils import get_filter - assert get_filter(resource=RESOURCE, key=UUID_KEY) == { - "resource": RESOURCE, - "uuid": UUID_KEY, - } - - -def test_get_filter_id() -> None: - from superset.key_value.utils import get_filter - - assert get_filter(resource=RESOURCE, key=ID_KEY) == { - "resource": RESOURCE, - "id": ID_KEY, - } + assert get_filter(resource=RESOURCE, key=key) == expected_filter def test_encode_permalink_id_valid() -> None: + """Test encoding permalink ID with valid input.""" from superset.key_value.utils import encode_permalink_key salt = "abc" @@ -54,7 +53,135 @@ def test_encode_permalink_id_valid() -> None: def test_decode_permalink_id_invalid() -> None: + """Test decoding permalink ID with invalid input.""" from superset.key_value.utils import decode_permalink_id with pytest.raises(KeyValueParseKeyError): decode_permalink_id("foo", "bar") + + +@pytest.mark.parametrize( + "algorithm,seed,expected_uuid", + [ + ("md5", "test_seed", UUID("d81a8c4d-6522-9513-525d-6a5cef1c7c9d")), + ("sha256", "test_seed", UUID("4504d44d-861b-6919-7db1-d95e47344234")), + ], + ids=["md5", "sha256"], +) +def 
test_get_uuid_namespace(algorithm, seed, expected_uuid) -> None: + """Test UUID namespace generation with different algorithms.""" + from superset.key_value.utils import get_uuid_namespace + + mock_app = MagicMock() + mock_app.config = {"HASH_ALGORITHM": algorithm} + namespace = get_uuid_namespace(seed, app=mock_app) + + assert isinstance(namespace, UUID) + assert namespace == expected_uuid + + +def test_get_uuid_namespace_deterministic() -> None: + """Test that UUID namespace generation is deterministic.""" + from superset.key_value.utils import get_uuid_namespace + + mock_app = MagicMock() + mock_app.config = {"HASH_ALGORITHM": "sha256"} + namespace1 = get_uuid_namespace("same_seed", app=mock_app) + namespace2 = get_uuid_namespace("same_seed", app=mock_app) + assert namespace1 == namespace2 + + +def test_get_uuid_namespace_different_seeds() -> None: + """Test that different seeds produce different UUID namespaces.""" + from superset.key_value.utils import get_uuid_namespace + + mock_app = MagicMock() + mock_app.config = {"HASH_ALGORITHM": "sha256"} + namespace1 = get_uuid_namespace("seed1", app=mock_app) + namespace2 = get_uuid_namespace("seed2", app=mock_app) + assert namespace1 != namespace2 + + +@pytest.mark.parametrize( + "algorithm,seed,expected_uuid", + [ + ("md5", "test_seed", UUID("d81a8c4d-6522-9513-525d-6a5cef1c7c9d")), + ("sha256", "test_seed", UUID("4504d44d-861b-6919-7db1-d95e47344234")), + ], + ids=["md5", "sha256"], +) +def test_get_uuid_namespace_with_algorithm(algorithm, seed, expected_uuid) -> None: + """Test UUID namespace generation with explicit algorithm.""" + from superset.key_value.utils import get_uuid_namespace_with_algorithm + + namespace = get_uuid_namespace_with_algorithm(seed, algorithm) + assert isinstance(namespace, UUID) + assert namespace == expected_uuid + + +def test_get_uuid_namespace_with_algorithm_different_results() -> None: + """Test that MD5 and SHA-256 produce different UUIDs for same seed.""" + from 
superset.key_value.utils import get_uuid_namespace_with_algorithm + + namespace_md5 = get_uuid_namespace_with_algorithm("test_seed", "md5") + namespace_sha256 = get_uuid_namespace_with_algorithm("test_seed", "sha256") + assert namespace_md5 != namespace_sha256 + + +@pytest.mark.parametrize( + "algorithm", + ["md5", "sha256"], + ids=["md5", "sha256"], +) +def test_get_deterministic_uuid_with_algorithm(algorithm) -> None: + """Test deterministic UUID generation with explicit algorithm.""" + from superset.key_value.utils import get_deterministic_uuid_with_algorithm + + payload = {"key": "value", "number": 123} + + # Test that same algorithm produces same UUID (deterministic) + uuid_1 = get_deterministic_uuid_with_algorithm("salt", payload, algorithm) + uuid_2 = get_deterministic_uuid_with_algorithm("salt", payload, algorithm) + assert uuid_1 == uuid_2 + + +def test_get_deterministic_uuid_different_algorithms() -> None: + """Test that different algorithms produce different UUIDs.""" + from superset.key_value.utils import get_deterministic_uuid_with_algorithm + + payload = {"key": "value", "number": 123} + + uuid_md5 = get_deterministic_uuid_with_algorithm("salt", payload, "md5") + uuid_sha256 = get_deterministic_uuid_with_algorithm("salt", payload, "sha256") + assert uuid_md5 != uuid_sha256 + + +@pytest.mark.parametrize( + "config_value,expected_fallbacks", + [ + (["md5"], ["md5"]), + (["md5", "sha256"], ["md5", "sha256"]), + ([], []), + ], + ids=["single_fallback", "multiple_fallbacks", "no_fallbacks"], +) +def test_get_fallback_algorithms(config_value, expected_fallbacks) -> None: + """Test getting fallback algorithms from config.""" + from superset.key_value.utils import get_fallback_algorithms + + mock_app = MagicMock() + mock_app.config = {"HASH_ALGORITHM_FALLBACKS": config_value} + fallbacks = get_fallback_algorithms(app=mock_app) + + assert fallbacks == expected_fallbacks + + +def test_get_fallback_algorithms_default() -> None: + """Test fallback algorithms 
default to empty list if not configured.""" + from superset.key_value.utils import get_fallback_algorithms + + mock_app = MagicMock() + mock_app.config = {} # No HASH_ALGORITHM_FALLBACKS key + fallbacks = get_fallback_algorithms(app=mock_app) + + assert fallbacks == [] diff --git a/tests/unit_tests/thumbnails/test_digest.py b/tests/unit_tests/thumbnails/test_digest.py index 301e033e79c..d1a787c069e 100644 --- a/tests/unit_tests/thumbnails/test_digest.py +++ b/tests/unit_tests/thumbnails/test_digest.py @@ -86,7 +86,8 @@ def prepare_datasource_mock( False, False, [], - "71452fee8ffbd8d340193d611bcd4559", + # SHA-256 hash with default HASH_ALGORITHM + "73653fa5724a23c28fdf3bba4c7e8a4f6f3470f888b55c986d56e2553c38713e", ), ( None, @@ -94,7 +95,8 @@ def prepare_datasource_mock( True, False, [], - "209dc060ac19271b8708731e3b8280f5", + # SHA-256 hash with default HASH_ALGORITHM + "62d7d89c426fb4f11787095f309c573c69e5d47a92af9cad792b03ba60a1f1cd", ), ( { @@ -104,7 +106,8 @@ def prepare_datasource_mock( True, False, [], - "209dc060ac19271b8708731e3b8280f5", + # SHA-256 hash with default HASH_ALGORITHM + "62d7d89c426fb4f11787095f309c573c69e5d47a92af9cad792b03ba60a1f1cd", ), ( { @@ -114,7 +117,8 @@ def prepare_datasource_mock( True, False, [], - "06a4144466dbd5ffad0c3c2225e96296", + # SHA-256 hash with default HASH_ALGORITHM + "b4004c6d418121e012a6b6d6e8566aca4907e4fb204beaced17d8f8e6f7ff2dd", ), ( { @@ -124,7 +128,8 @@ def prepare_datasource_mock( True, False, [], - "a823ece9563895ccb14f3d9095e84f7a", + # SHA-256 hash with default HASH_ALGORITHM + "e1226d050fde6acda8cc6630d677a971362a87f2e1b4c35df76de4048b5787bc", ), ( { @@ -134,7 +139,8 @@ def prepare_datasource_mock( True, False, [], - "33c5475f92a904925ab3ef493526e5b5", + # SHA-256 hash with default HASH_ALGORITHM + "6073a59a3b7428f03cc72db8de43b74e3f203cac4fb0c84216201924043e8b41", ), ( { @@ -144,7 +150,8 @@ def prepare_datasource_mock( True, False, [], - "cec57345e6402c0d4b3caee5cfaa0a03", + # SHA-256 hash with default 
HASH_ALGORITHM + "7e3e9ca5bd1493022a3b97a449cf17c931263b4a9d99b1fcad2781766535c116", ), ( { @@ -154,7 +161,8 @@ def prepare_datasource_mock( True, False, [], - "5380dcbe94621a0759b09554404f3d02", + # SHA-256 hash with default HASH_ALGORITHM + "bb0f8d2a1a4e406528ca027b4252856a69037ec7272587026f720521210123fe", ), ( None, @@ -167,7 +175,8 @@ def prepare_datasource_mock( "get_sqla_row_level_filters": MagicMock(return_value=["filter1"]), } ], - "4138959f275c1991466cafcfb190fd72", + # SHA-256 hash with default HASH_ALGORITHM + "88c66714ce66ee9de15bfa82e5bb35479838190ca6662d3088a00802827c195c", ), ( None, @@ -188,7 +197,8 @@ def prepare_datasource_mock( ), }, ], - "80d3bfcc7144bccdba8c718cf49b6420", + # SHA-256 hash with default HASH_ALGORITHM + "1a686c28c9c866832428616a0f9bd12d5b2452ea20645113c86dd2be88980c42", ), ( None, @@ -207,7 +217,8 @@ def prepare_datasource_mock( ), }, ], - "e8fc68cd5aba22a5f1acf06164bfc0f4", + # SHA-256 hash with default HASH_ALGORITHM + "f0d428a30a62b000fa92e87c7bb29c2c55bddc49abf8408d395502653e702cd6", ), ( None, @@ -296,7 +307,8 @@ def test_dashboard_digest( False, False, None, - "47d852b5c4df211c115905617bb722c1", + # SHA-256 hash with default HASH_ALGORITHM + "053d9488ff5da47d00d236084c34261d608f0fb006aceb0084738ccb6fe7a838", ), ( None, @@ -304,7 +316,8 @@ def test_dashboard_digest( True, False, None, - "4f8109d3761e766e650af514bb358f10", + # SHA-256 hash with default HASH_ALGORITHM + "d69f16940a8de1b35088a79424f40ed388f1a7a5f2a7692dd14bf77964fb6898", ), ( None, @@ -323,7 +336,8 @@ def test_dashboard_digest( "is_rls_supported": True, "get_sqla_row_level_filters": MagicMock(return_value=["filter1"]), }, - "61e70336c27eb97fb050328a0b050373", + # SHA-256 hash with default HASH_ALGORITHM + "90a543199890b9b2a6583a27a2fed66948f907d28070437250e3b4d715e5bd3e", ), ( None, @@ -336,7 +350,8 @@ def test_dashboard_digest( return_value=["filter1", "filter2"] ), }, - "95c7cefde8cb519f005f33bfb33cb196", + # SHA-256 hash with default HASH_ALGORITHM + 
"42fbf56bf1dcbdcd4a84d26ed159ade36ab2bffbab85230799d719ce779c3312", ), ( None, @@ -347,7 +362,8 @@ def test_dashboard_digest( "is_rls_supported": False, "get_sqla_row_level_filters": MagicMock(return_value=[]), }, - "4f8109d3761e766e650af514bb358f10", + # SHA-256 hash with default HASH_ALGORITHM + "d69f16940a8de1b35088a79424f40ed388f1a7a5f2a7692dd14bf77964fb6898", ), ( None, diff --git a/tests/unit_tests/utils/screenshot_test.py b/tests/unit_tests/utils/screenshot_test.py index cb684b827f9..b7f7de6032e 100644 --- a/tests/unit_tests/utils/screenshot_test.py +++ b/tests/unit_tests/utils/screenshot_test.py @@ -22,7 +22,7 @@ from unittest.mock import MagicMock, patch import pytest from pytest_mock import MockerFixture -from superset.utils.hashing import md5_sha_from_dict +from superset.utils.hashing import hash_from_dict from superset.utils.screenshots import ( BaseScreenshot, ChartScreenshot, @@ -74,9 +74,9 @@ def test_get_screenshot(mocker: MockerFixture, screenshot_obj): assert screenshot_data == fake_bytes -def test_get_cache_key(screenshot_obj): +def test_get_cache_key(app_context, screenshot_obj): """Test get_cache_key method""" - expected_cache_key = md5_sha_from_dict( + expected_cache_key = hash_from_dict( { "thumbnail_type": "", "digest": screenshot_obj.digest, diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py index 37b8ed1877a..fe1b3311ef2 100644 --- a/tests/unit_tests/utils/test_core.py +++ b/tests/unit_tests/utils/test_core.py @@ -687,7 +687,9 @@ def test_merge_extra_filters(): "clause": "WHERE", "comparator": "someval", "expressionType": "SIMPLE", - "filterOptionName": "90cfb3c34852eb3bc741b0cc20053b46", + "filterOptionName": ( + "eb77ff8188437d8722af8c932727da1e83ec37e88aaf800a3859ed352d87119f" + ), "isExtra": True, "operator": "in", "subject": "a", @@ -696,7 +698,9 @@ def test_merge_extra_filters(): "clause": "WHERE", "comparator": ["c1", "c2"], "expressionType": "SIMPLE", - "filterOptionName": 
"6c178d069965f1c02640661280415d96", + "filterOptionName": ( + "48dd60c7ecb8699b51e36ce956ba481aa5382548811aecec71af7e550c59762c" + ), "isExtra": True, "operator": "==", "subject": "B", @@ -735,7 +739,9 @@ def test_merge_extra_filters(): "clause": "WHERE", "comparator": "someval", "expressionType": "SIMPLE", - "filterOptionName": "90cfb3c34852eb3bc741b0cc20053b46", + "filterOptionName": ( + "eb77ff8188437d8722af8c932727da1e83ec37e88aaf800a3859ed352d87119f" + ), "isExtra": True, "operator": "in", "subject": "a", @@ -744,7 +750,9 @@ def test_merge_extra_filters(): "clause": "WHERE", "comparator": ["c1", "c2"], "expressionType": "SIMPLE", - "filterOptionName": "6c178d069965f1c02640661280415d96", + "filterOptionName": ( + "48dd60c7ecb8699b51e36ce956ba481aa5382548811aecec71af7e550c59762c" + ), "isExtra": True, "operator": "==", "subject": "B", @@ -769,7 +777,9 @@ def test_merge_extra_filters(): "clause": "WHERE", "comparator": "hello", "expressionType": "SIMPLE", - "filterOptionName": "e3cbdd92a2ae23ca92c6d7fca42e36a6", + "filterOptionName": ( + "2ca91524f5ab8e39d6aa5373d1f11301ad2c5b95f5aa77eb30d92f572f5b9157" + ), "isExtra": True, "operator": "like", "subject": "A", @@ -933,7 +943,9 @@ def test_merge_extra_filters_merges_different_val_types(): "clause": "WHERE", "comparator": ["g1", "g2"], "expressionType": "SIMPLE", - "filterOptionName": "c11969c994b40a83a4ae7d48ff1ea28e", + "filterOptionName": ( + "e2f7d6304169124258364916403b2d9208fce39dd7771797726111b7498bbd52" + ), "isExtra": True, "operator": "in", "subject": "a", @@ -985,7 +997,9 @@ def test_merge_extra_filters_merges_different_val_types(): "clause": "WHERE", "comparator": "someval", "expressionType": "SIMPLE", - "filterOptionName": "90cfb3c34852eb3bc741b0cc20053b46", + "filterOptionName": ( + "eb77ff8188437d8722af8c932727da1e83ec37e88aaf800a3859ed352d87119f" + ), "isExtra": True, "operator": "in", "subject": "a", @@ -1040,7 +1054,9 @@ def test_merge_extra_filters_adds_unequal_lists(): "clause": "WHERE", 
"comparator": ["g1", "g2", "g3"], "expressionType": "SIMPLE", - "filterOptionName": "21cbb68af7b17e62b3b2f75e2190bfd7", + "filterOptionName": ( + "b3f17391546e130560efd1e841742bc5f154d09a7d534b8c0ec33fc1c8a146cd" + ), "isExtra": True, "operator": "in", "subject": "a", @@ -1049,7 +1065,9 @@ def test_merge_extra_filters_adds_unequal_lists(): "clause": "WHERE", "comparator": ["c1", "c2", "c3"], "expressionType": "SIMPLE", - "filterOptionName": "0a8dcb928f1f4bba97643c6e68d672f1", + "filterOptionName": ( + "41ef70f6edada46006253189b27778088da2cf27ccc69f703634493d7396708a" + ), "isExtra": True, "operator": "==", "subject": "B", diff --git a/tests/unit_tests/utils/test_hashing.py b/tests/unit_tests/utils/test_hashing.py new file mode 100644 index 00000000000..2791e5ce64d --- /dev/null +++ b/tests/unit_tests/utils/test_hashing.py @@ -0,0 +1,183 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
import math
from unittest.mock import patch

import pytest

from superset.utils.hashing import (
    hash_from_dict,
    hash_from_str,
)

# Dotted path of the algorithm lookup that these tests replace, so the outcome
# does not depend on whatever the application config happens to contain.
_GET_HASH_ALGORITHM = "superset.utils.hashing.get_hash_algorithm"

# Reference digests for well-known inputs.
_SHA256_OF_TEST = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"
_MD5_OF_TEST = "098f6bcd4621d373cade4e832627b4f6"
_SHA256_OF_EMPTY = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"

# Length of the hexadecimal digest produced by each algorithm.
_SHA256_HEX_LENGTH = 64
_MD5_HEX_LENGTH = 32


def _configured_algorithm(name: str):
    """Return a context manager that makes the configured algorithm *name*."""
    return patch(_GET_HASH_ALGORITHM, return_value=name)


def test_hash_from_str_sha256() -> None:
    """Test SHA-256 hashing produces expected output."""
    with _configured_algorithm("sha256"):
        assert hash_from_str("test") == _SHA256_OF_TEST


def test_hash_from_str_md5() -> None:
    """Test MD5 hashing for backward compatibility."""
    with _configured_algorithm("md5"):
        assert hash_from_str("test") == _MD5_OF_TEST


def test_hash_from_dict_deterministic() -> None:
    """Test dictionary hashing is deterministic."""
    with _configured_algorithm("sha256"):
        obj = {"key": "value", "number": 42}
        assert hash_from_dict(obj) == hash_from_dict(obj)


def test_hash_from_dict_key_order_invariant() -> None:
    """Test dictionary hashing is invariant to key order."""
    with _configured_algorithm("sha256"):
        ordered = {"a": 1, "b": 2, "c": 3}
        shuffled = {"c": 3, "a": 1, "b": 2}
        assert hash_from_dict(ordered) == hash_from_dict(shuffled)


def test_hash_algorithm_override() -> None:
    """Test an explicit ``algorithm`` argument wins over the configured one."""
    # Config set to SHA-256
    with _configured_algorithm("sha256"):
        # Force MD5 via parameter
        assert hash_from_str("test", algorithm="md5") == _MD5_OF_TEST

        # Force SHA-256 via parameter (redundant but valid)
        assert hash_from_str("test", algorithm="sha256") == _SHA256_OF_TEST


def test_backward_compatibility_alias_md5() -> None:
    """Test ``hash_from_str`` returns the MD5 digest when MD5 is configured.

    NOTE(review): despite the test name, no legacy alias is imported or called
    here; this covers the same path as ``test_hash_from_str_md5``.
    """
    with _configured_algorithm("md5"):
        assert hash_from_str("test") == _MD5_OF_TEST


def test_backward_compatibility_alias_sha256() -> None:
    """Test ``hash_from_str`` returns SHA-256, not MD5, when so configured.

    NOTE(review): despite the test name, no legacy alias is imported or called
    here; only ``hash_from_str`` is exercised.
    """
    with _configured_algorithm("sha256"):
        result = hash_from_str("test")
        # Should return SHA-256, not MD5
        assert len(result) == _SHA256_HEX_LENGTH
        assert result == _SHA256_OF_TEST


def test_backward_compatibility_dict_alias() -> None:
    """Test ``hash_from_dict`` yields a SHA-256-sized digest when configured.

    NOTE(review): despite the test name, no legacy alias is imported or called
    here; only ``hash_from_dict`` is exercised.
    """
    with _configured_algorithm("sha256"):
        result = hash_from_dict({"key": "value"})
        # Should use SHA-256
        assert len(result) == _SHA256_HEX_LENGTH


def test_invalid_algorithm_raises() -> None:
    """Test invalid algorithm raises ValueError."""
    with pytest.raises(ValueError, match="Unsupported hash algorithm"):
        hash_from_str("test", algorithm="sha1")


def test_empty_string() -> None:
    """Test hashing empty string."""
    with _configured_algorithm("sha256"):
        # SHA-256 of empty string
        assert hash_from_str("") == _SHA256_OF_EMPTY


def test_empty_dict() -> None:
    """Test hashing empty dictionary."""
    with _configured_algorithm("sha256"):
        result = hash_from_dict({})
        # Presumably the serialized form "{}" is what gets hashed; only the
        # shape of the digest is checked here.
        assert isinstance(result, str)
        assert len(result) == _SHA256_HEX_LENGTH


def test_unicode_string() -> None:
    """Test hashing Unicode strings."""
    with _configured_algorithm("sha256"):
        # Mixes ASCII, CJK and an emoji outside the Basic Multilingual Plane.
        result = hash_from_str("Hello 世界 🌍")
        # Should handle Unicode correctly
        assert isinstance(result, str)
        assert len(result) == _SHA256_HEX_LENGTH


def test_nested_dict() -> None:
    """Test hashing nested dictionaries."""
    with _configured_algorithm("sha256"):
        obj = {"outer": {"inner": {"deep": "value"}}, "list": [1, 2, 3]}
        result = hash_from_dict(obj)
        assert isinstance(result, str)
        assert len(result) == _SHA256_HEX_LENGTH


def test_dict_with_nan() -> None:
    """Test hashing dictionary with NaN values."""
    with _configured_algorithm("sha256"):
        obj = {"value": math.nan, "normal": 42}
        # Should handle NaN with ignore_nan parameter
        result = hash_from_dict(obj, ignore_nan=True)
        assert isinstance(result, str)
        assert len(result) == _SHA256_HEX_LENGTH


def test_hash_consistency_across_runs() -> None:
    """Test that hashing is consistent across multiple invocations."""
    with _configured_algorithm("sha256"):
        digests = {hash_from_str("consistency_test") for _ in range(10)}

    # All results should be identical
    assert len(digests) == 1


def test_md5_vs_sha256_different_outputs() -> None:
    """Test that MD5 and SHA-256 produce different hashes."""
    test_string = "compare"

    with _configured_algorithm("md5"):
        md5_result = hash_from_str(test_string)

    with _configured_algorithm("sha256"):
        sha256_result = hash_from_str(test_string)

    # Hashes should be different
    assert md5_result != sha256_result
    # MD5 produces 32 character hex string
    assert len(md5_result) == _MD5_HEX_LENGTH
    # SHA-256 produces 64 character hex string
    assert len(sha256_result) == _SHA256_HEX_LENGTH