fix(hive): Use parquet rather than textfile when uploading CSV files to Hive (#14240)

* fix(hive): Use parquet rather than textfile when uploading CSV files
* [csv/excel]: Use stream rather than temporary file

Co-authored-by: John Bodley <john.bodley@airbnb.com>
2026-04-19 16:14:52 +00:00 · 2021-04-24 18:17:30 +12:00
parent e392e2ed39
commit b0f8f6b6ad
7 changed files with 247 additions and 387 deletions
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -26,6 +26,7 @@ from sqlalchemy.sql.expression import ColumnClause

 from superset.db_engine_specs.base import BaseEngineSpec
 from superset.errors import SupersetErrorType
+from superset.sql_parse import Table
 from superset.utils import core as utils

 if TYPE_CHECKING:
@@ -228,16 +229,26 @@ class BigQueryEngineSpec(BaseEngineSpec):
        return "TIMESTAMP_MILLIS({col})"

    @classmethod
-    def df_to_sql(cls, df: pd.DataFrame, **kwargs: Any) -> None:
+    def df_to_sql(
+        cls,
+        database: "Database",
+        table: Table,
+        df: pd.DataFrame,
+        to_sql_kwargs: Dict[str, Any],
+    ) -> None:
        """
-        Upload data from a Pandas DataFrame to BigQuery. Calls
-        `DataFrame.to_gbq()` which requires `pandas_gbq` to be installed.
+        Upload data from a Pandas DataFrame to a database.

-        :param df: Dataframe with data to be uploaded
-        :param kwargs: kwargs to be passed to to_gbq() method. Requires that `schema`,
-        `name` and `con` are present in kwargs. `name` and `schema` are combined
-         and passed to `to_gbq()` as `destination_table`.
+        Calls `pandas_gbq.to_gbq` which requires `pandas_gbq` to be installed.
+
+        Note this method does not create metadata for the table.
+
+        :param database: The database to upload the data to
+        :param table: The table to upload the data to
+        :param df: The dataframe with data to be uploaded
+        :param to_sql_kwargs: The kwargs to be passed to `pandas.DataFrame.to_sql` method
        """
+
        try:
            import pandas_gbq
            from google.oauth2 import service_account
@@ -248,22 +259,25 @@ class BigQueryEngineSpec(BaseEngineSpec):
                "to upload data to BigQuery"
            )

-        if not ("name" in kwargs and "schema" in kwargs and "con" in kwargs):
-            raise Exception("name, schema and con need to be defined in kwargs")
+        if not table.schema:
+            raise Exception("The table schema must be defined")

-        gbq_kwargs = {}
-        gbq_kwargs["project_id"] = kwargs["con"].engine.url.host
-        gbq_kwargs["destination_table"] = f"{kwargs.pop('schema')}.{kwargs.pop('name')}"
+        engine = cls.get_engine(database)
+        to_gbq_kwargs = {"destination_table": str(table), "project_id": engine.url.host}
+
+        # Add credentials if they are set on the SQLAlchemy dialect.
+        creds = engine.dialect.credentials_info

-        # add credentials if they are set on the SQLAlchemy Dialect:
-        creds = kwargs["con"].dialect.credentials_info
        if creds:
-            credentials = service_account.Credentials.from_service_account_info(creds)
-            gbq_kwargs["credentials"] = credentials
+            to_gbq_kwargs[
+                "credentials"
+            ] = service_account.Credentials.from_service_account_info(creds)

-        # Only pass through supported kwargs
+        # Only pass through supported kwargs.
        supported_kwarg_keys = {"if_exists"}
+
        for key in supported_kwarg_keys:
-            if key in kwargs:
-                gbq_kwargs[key] = kwargs[key]
-        pandas_gbq.to_gbq(df, **gbq_kwargs)
+            if key in to_sql_kwargs:
+                to_gbq_kwargs[key] = to_sql_kwargs[key]
+
+        pandas_gbq.to_gbq(df, **to_gbq_kwargs)