feat: improve perf of CSV uploads (#34603)

2026-04-19 08:04:53 +00:00 · 2025-08-20 08:53:02 -04:00
parent 691926f0e1
commit a82e310600
10 changed files with 589 additions and 26 deletions
--- a/superset/commands/database/uploaders/csv_reader.py
+++ b/superset/commands/database/uploaders/csv_reader.py
@@ -15,12 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 import logging
+from importlib import util
 from typing import Any, Optional

 import pandas as pd
+from flask import current_app
 from flask_babel import lazy_gettext as _
 from werkzeug.datastructures import FileStorage

+from superset import is_feature_enabled
 from superset.commands.database.exceptions import DatabaseUploadFailed
 from superset.commands.database.uploaders.base import (
    BaseDataReader,
@@ -30,8 +33,9 @@ from superset.commands.database.uploaders.base import (

 logger = logging.getLogger(__name__)

-READ_CSV_CHUNK_SIZE = 1000
-ROWS_TO_READ_METADATA = 2
+ROWS_TO_READ_METADATA = 100
+DEFAULT_ENCODING = "utf-8"
+ENCODING_FALLBACKS = ["utf-8", "latin-1", "cp1252", "iso-8859-1"]


 class CSVReaderOptions(ReaderOptions, total=False):
@@ -61,23 +65,153 @@ class CSVReader(BaseDataReader):
        )

    @staticmethod
-    def _read_csv(file: FileStorage, kwargs: dict[str, Any]) -> pd.DataFrame:
+    def _detect_encoding(file: FileStorage) -> str:
+        """Detect file encoding with progressive sampling"""
+        # Try progressively larger samples to improve detection reliability
+        sample_sizes = [1024, 8192, 32768, 65536]
+
+        for sample_size in sample_sizes:
+            file.seek(0)
+            sample = file.read(sample_size)
+            if not sample:  # Empty file or reached end
+                break
+
+            for encoding in ENCODING_FALLBACKS:
+                try:
+                    sample.decode(encoding)
+                    file.seek(0)
+                    return encoding
+                except UnicodeDecodeError:
+                    continue
+
+        file.seek(0)
+        return DEFAULT_ENCODING
+
+    @staticmethod
+    def _select_optimal_engine() -> str:
+        """Select the best available CSV parsing engine"""
+        try:
+            # Check if pyarrow is available as a separate package
+            pyarrow_spec = util.find_spec("pyarrow")
+            if not pyarrow_spec:
+                return "c"
+
+            # Import pyarrow to verify it works properly
+            import pyarrow as pa  # noqa: F401
+
+            # Check if pandas has built-in pyarrow support
+            pandas_version = str(pd.__version__)
+            has_builtin_pyarrow = "pyarrow" in pandas_version
+
+            if has_builtin_pyarrow:
+                # Pandas has built-in pyarrow, safer to use c engine
+                logger.info("Pandas has built-in pyarrow support, using 'c' engine")
+                return "c"
+            else:
+                # External pyarrow available, can safely use it
+                logger.info("Using 'pyarrow' engine for CSV parsing")
+                return "pyarrow"
+
+        except ImportError:
+            # PyArrow import failed, fall back to c engine
+            logger.info("PyArrow not properly installed, falling back to 'c' engine")
+            return "c"
+        except Exception as ex:
+            # Any other error, fall back to c engine
+            logger.warning(
+                f"Error selecting CSV engine: {ex}, falling back to 'c' engine"
+            )
+            return "c"
+
+    @staticmethod
+    def _read_csv(  # noqa: C901
+        file: FileStorage,
+        kwargs: dict[str, Any],
+    ) -> pd.DataFrame:
+        encoding = kwargs.get("encoding", DEFAULT_ENCODING)
+
+        # PyArrow engine doesn't support iterator/chunksize/nrows
+        # It also has known issues with date parsing and missing values
+        # Default to "c" engine for stability
+        has_unsupported_options = (
+            "chunksize" in kwargs
+            or "iterator" in kwargs
+            or kwargs.get("nrows") is not None
+            or kwargs.get("parse_dates")  # Has bugs with multiple date columns
+            or kwargs.get("na_values")  # Has bugs with missing value handling
+        )
+
+        # Use PyArrow engine if feature flag is enabled and options are compatible
+        if (
+            is_feature_enabled("CSV_UPLOAD_PYARROW_ENGINE")
+            and not has_unsupported_options
+        ):
+            kwargs["engine"] = CSVReader._select_optimal_engine()
+        else:
+            # Default to c engine for reliability
+            kwargs["engine"] = "c"
+
+        kwargs["low_memory"] = False
+
        try:
            if "chunksize" in kwargs:
-                return pd.concat(
-                    pd.read_csv(
-                        filepath_or_buffer=file.stream,
-                        **kwargs,
-                    )
+                chunks = []
+                total_rows = 0
+                max_rows = kwargs.get("nrows")
+                chunk_iterator = pd.read_csv(
+                    filepath_or_buffer=file.stream,
+                    **kwargs,
                )
+
+                for chunk in chunk_iterator:
+                    # Check if adding this chunk would exceed the row limit
+                    if max_rows is not None and total_rows + len(chunk) > max_rows:
+                        # Only take the needed rows from this chunk
+                        remaining_rows = max_rows - total_rows
+                        chunk = chunk.iloc[:remaining_rows]
+                        chunks.append(chunk)
+                        break
+
+                    chunks.append(chunk)
+                    total_rows += len(chunk)
+
+                    # Break if we've reached the desired number of rows
+                    if max_rows is not None and total_rows >= max_rows:
+                        break
+
+                if chunks:
+                    result = pd.concat(chunks, ignore_index=False)
+                    # When using chunking, we need to reset and rebuild the index
+                    if kwargs.get("index_col") is not None:
+                        # The index was already set by pandas during read_csv
+                        # Just need to ensure it's properly named after concatenation
+                        index_col = kwargs.get("index_col")
+                        if isinstance(index_col, str):
+                            result.index.name = index_col
+                    return result
+                return pd.DataFrame()
+
            return pd.read_csv(
                filepath_or_buffer=file.stream,
                **kwargs,
            )
+        except UnicodeDecodeError as ex:
+            if encoding != DEFAULT_ENCODING:
+                raise DatabaseUploadFailed(
+                    message=_("Parsing error: %(error)s", error=str(ex))
+                ) from ex
+
+            file.seek(0)
+            detected_encoding = CSVReader._detect_encoding(file)
+            if detected_encoding != encoding:
+                kwargs["encoding"] = detected_encoding
+                return CSVReader._read_csv(file, kwargs)
+            raise DatabaseUploadFailed(
+                message=_("Parsing error: %(error)s", error=str(ex))
+            ) from ex
        except (
            pd.errors.ParserError,
            pd.errors.EmptyDataError,
-            UnicodeDecodeError,
            ValueError,
        ) as ex:
            raise DatabaseUploadFailed(
@@ -93,31 +227,46 @@ class CSVReader(BaseDataReader):
        :return: pandas DataFrame
        :throws DatabaseUploadFailed: if there is an error reading the file
        """
+        rows_to_read = self._options.get("rows_to_read")
+        chunk_size = current_app.config.get("READ_CSV_CHUNK_SIZE", 1000)
+
+        use_chunking = rows_to_read is None or rows_to_read > chunk_size * 2
+
        kwargs = {
-            "chunksize": READ_CSV_CHUNK_SIZE,
-            "encoding": "utf-8",
+            "encoding": self._options.get("encoding", DEFAULT_ENCODING),
            "header": self._options.get("header_row", 0),
            "decimal": self._options.get("decimal_character", "."),
            "index_col": self._options.get("index_column"),
            "dayfirst": self._options.get("day_first", False),
-            "iterator": True,
            "keep_default_na": not self._options.get("null_values"),
-            "usecols": self._options.get("columns_read")
-            if self._options.get("columns_read")  # None if an empty list
-            else None,
-            "na_values": self._options.get("null_values")
-            if self._options.get("null_values")  # None if an empty list
-            else None,
-            "nrows": self._options.get("rows_to_read"),
+            "usecols": (
+                self._options.get("columns_read")
+                if self._options.get("columns_read")  # None if an empty list
+                else None
+            ),
+            "na_values": (
+                self._options.get("null_values")
+                if self._options.get("null_values")  # None if an empty list
+                else None
+            ),
+            "nrows": rows_to_read,
            "parse_dates": self._options.get("column_dates"),
            "sep": self._options.get("delimiter", ","),
            "skip_blank_lines": self._options.get("skip_blank_lines", False),
            "skipinitialspace": self._options.get("skip_initial_space", False),
            "skiprows": self._options.get("skip_rows", 0),
-            "dtype": self._options.get("column_data_types")
-            if self._options.get("column_data_types")
-            else None,
+            "dtype": (
+                self._options.get("column_data_types")
+                if self._options.get("column_data_types")
+                else None
+            ),
+            "cache_dates": True,
        }
+
+        if use_chunking:
+            kwargs["chunksize"] = chunk_size
+            kwargs["iterator"] = True
+
        return self._read_csv(file, kwargs)

    def file_metadata(self, file: FileStorage) -> FileMetadata:
@@ -131,6 +280,8 @@ class CSVReader(BaseDataReader):
            "nrows": ROWS_TO_READ_METADATA,
            "header": self._options.get("header_row", 0),
            "sep": self._options.get("delimiter", ","),
+            "encoding": self._options.get("encoding", DEFAULT_ENCODING),
+            "low_memory": False,
        }
        df = self._read_csv(file, kwargs)
        return {