diff --git a/superset/commands/database/uploaders/csv_reader.py b/superset/commands/database/uploaders/csv_reader.py
index 6184bade417..4f497853a5c 100644
--- a/superset/commands/database/uploaders/csv_reader.py
+++ b/superset/commands/database/uploaders/csv_reader.py
@@ -326,6 +326,27 @@ class CSVReader(BaseDataReader):
             CSVReader._cast_single_column(df, column, dtype, kwargs)
         return df
 
+    @staticmethod
+    def _split_types(types: dict[str, str]) -> tuple[dict[str, str], dict[str, str]]:
+        """
+        Split column data types into custom and pandas-native types.
+
+        The string-like types ("str", "object", "string") are the pandas-native
+        ones; every other type is treated as custom. Each column lands in
+        exactly one of the two returned dictionaries.
+
+        :param types: Dictionary mapping column names to data types
+        :return: Tuple of (custom_types, pandas_types) dictionaries
+        """
+        custom_types: dict[str, str] = {}
+        pandas_types: dict[str, str] = {}
+        for col, dtype in types.items():
+            if dtype in ("str", "object", "string"):
+                pandas_types[col] = dtype
+            else:
+                custom_types[col] = dtype
+        return custom_types, pandas_types
+
     @staticmethod
     def _read_csv(  # noqa: C901
         file: FileStorage,
@@ -357,7 +378,16 @@ class CSVReader(BaseDataReader):
         kwargs["low_memory"] = False
 
         try:
-            types = kwargs.pop("dtype", None)
+            # String-like types stay in ``kwargs["dtype"]``; the rest is held
+            # back in ``types`` for our manual casting. Popping first keeps an
+            # empty or missing ``dtype`` out of ``kwargs`` altogether.
+            types = None
+            if column_types := kwargs.pop("dtype", None):
+                custom_types, pandas_types = CSVReader._split_types(column_types)
+                if pandas_types:
+                    kwargs["dtype"] = pandas_types
+                types = custom_types or None
+
             if "chunksize" in kwargs:
                 chunks = []
                 total_rows = 0
diff --git a/tests/unit_tests/commands/databases/csv_reader_test.py b/tests/unit_tests/commands/databases/csv_reader_test.py
index 6d7ee868e91..386bf748926 100644
--- a/tests/unit_tests/commands/databases/csv_reader_test.py
+++ b/tests/unit_tests/commands/databases/csv_reader_test.py
@@ -854,6 +854,33 @@ def test_csv_reader_successful_numeric_conversion():
     assert df.iloc[0]["ID"] == 1001
 
 
+def test_csv_reader_successful_string_conversion_with_floats():
+    csv_data = [
+        ["id"],
+        [1439403621518935563],
+        [42286989],
+        [1413660691875593351],
+        [8.26839e17],
+    ]
+
+    csv_reader = CSVReader(
+        options=CSVReaderOptions(
+            column_data_types={
+                "id": "str",
+            }
+        )
+    )
+
+    df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
+
+    assert df.shape == (4, 1)
+    assert df["id"].dtype == "object"
+    assert df.iloc[0]["id"] == "1439403621518935563"
+    assert df.iloc[1]["id"] == "42286989"
+    assert df.iloc[2]["id"] == "1413660691875593351"
+    assert df.iloc[3]["id"] == "8.26839e+17"
+
+
 def test_csv_reader_error_detection_improvements_summary():
     csv_data_with_custom_header = [
         ["metadata_row", "skip", "this"],