mirror of
https://github.com/apache/superset.git
synced 2026-04-18 23:55:00 +00:00
feat: Add parquet upload (#14449)
* allow csv upload to accept parquet file * fix mypy * fix if statement * add test for specificying columns in CSV upload * clean up test * change order in test * fix failures * upload parquet to seperate table in test * fix error message * fix mypy again * rename other extensions to columnar * add new form for columnar upload * add support for zip files * undo csv form changes except usecols * add more tests for zip * isort & black * pylint * fix trailing space * address more review comments * pylint * black * resolve remaining issues
This commit is contained in:
committed by
GitHub
parent
ad8336a5b4
commit
d25b0967a1
@@ -19,6 +19,7 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from typing import Dict, Optional
|
||||
|
||||
from unittest import mock
|
||||
@@ -43,9 +44,14 @@ CSV_UPLOAD_DATABASE = "csv_explore_db"
|
||||
# Fixture file names created on disk by the upload fixtures.
CSV_FILENAME1 = "testCSV1.csv"
CSV_FILENAME2 = "testCSV2.csv"
EXCEL_FILENAME = "testExcel.xlsx"
PARQUET_FILENAME1 = "testZip/testParquet1.parquet"
PARQUET_FILENAME2 = "testZip/testParquet2.parquet"
ZIP_DIRNAME = "testZip"
ZIP_FILENAME = "testZip.zip"

# Destination table names in the upload test database.
EXCEL_UPLOAD_TABLE = "excel_upload"
CSV_UPLOAD_TABLE = "csv_upload"
PARQUET_UPLOAD_TABLE = "parquet_upload"
CSV_UPLOAD_TABLE_W_SCHEMA = "csv_upload_w_schema"
CSV_UPLOAD_TABLE_W_EXPLORE = "csv_upload_w_explore"
|
||||
|
||||
@@ -70,6 +76,7 @@ def setup_csv_upload():
|
||||
engine = upload_db.get_sqla_engine()
|
||||
engine.execute(f"DROP TABLE IF EXISTS {EXCEL_UPLOAD_TABLE}")
|
||||
engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE}")
|
||||
engine.execute(f"DROP TABLE IF EXISTS {PARQUET_UPLOAD_TABLE}")
|
||||
engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE_W_SCHEMA}")
|
||||
engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE_W_EXPLORE}")
|
||||
db.session.delete(upload_db)
|
||||
@@ -97,6 +104,17 @@ def create_excel_files():
|
||||
os.remove(EXCEL_FILENAME)
|
||||
|
||||
|
||||
@pytest.fixture()
def create_columnar_files():
    """Create two parquet files plus a zip archive of their directory.

    Yields to the test, then removes the archive and the directory so
    nothing leaks between tests.
    """
    os.mkdir(ZIP_DIRNAME)
    fixtures = {
        PARQUET_FILENAME1: {"a": ["john", "paul"], "b": [1, 2]},
        PARQUET_FILENAME2: {"a": ["max", "bob"], "b": [3, 4]},
    }
    for path, columns in fixtures.items():
        pd.DataFrame(columns).to_parquet(path)
    # Zip up the directory holding both parquet files.
    shutil.make_archive(ZIP_DIRNAME, "zip", ZIP_DIRNAME)
    yield
    # Teardown: remove the archive, then the directory and its contents.
    os.remove(ZIP_FILENAME)
    shutil.rmtree(ZIP_DIRNAME)
|
||||
|
||||
|
||||
def get_upload_db():
    """Look up the dedicated upload-test database record by its name."""
    query = db.session.query(Database).filter_by(database_name=CSV_UPLOAD_DATABASE)
    return query.one()
|
||||
|
||||
@@ -134,6 +152,22 @@ def upload_excel(
|
||||
return get_resp(test_client, "/exceltodatabaseview/form", data=form_data)
|
||||
|
||||
|
||||
def upload_columnar(
    filename: str, table_name: str, extra: Optional[Dict[str, str]] = None
):
    """POST a columnar (parquet/zip) upload form and return the response body.

    :param filename: path of the parquet or zip file to upload
    :param table_name: destination table name
    :param extra: optional overrides/additions to the default form fields
    :return: the decoded response body from the upload view
    """
    columnar_upload_db_id = get_upload_db().id
    # Use a context manager so the file handle is closed once the request
    # returns; the previous version leaked the handle to the GC.
    with open(filename, "rb") as columnar_file:
        form_data = {
            "columnar_file": columnar_file,
            "name": table_name,
            "con": columnar_upload_db_id,
            "if_exists": "fail",
            "index_label": "test_label",
        }
        if extra:
            form_data.update(extra)
        return get_resp(test_client, "/columnartodatabaseview/form", data=form_data)
|
||||
|
||||
|
||||
def mock_upload_to_s3(filename: str, upload_prefix: str, table: Table) -> str:
|
||||
"""
|
||||
HDFS is used instead of S3 for the unit tests.integration_tests.
|
||||
@@ -249,6 +283,18 @@ def test_import_csv(setup_csv_upload, create_csv_files):
|
||||
)
|
||||
assert success_msg_f1 in resp
|
||||
|
||||
# upload again with replace mode and specific columns
|
||||
resp = upload_csv(
|
||||
CSV_FILENAME1,
|
||||
CSV_UPLOAD_TABLE,
|
||||
extra={"if_exists": "replace", "usecols": '["a"]'},
|
||||
)
|
||||
assert success_msg_f1 in resp
|
||||
|
||||
# make sure only specified column name was read
|
||||
table = SupersetTestCase.get_table(name=CSV_UPLOAD_TABLE)
|
||||
assert "b" not in table.column_names
|
||||
|
||||
# upload again with replace mode
|
||||
resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"})
|
||||
assert success_msg_f1 in resp
|
||||
@@ -328,3 +374,68 @@ def test_import_excel(setup_csv_upload, create_excel_files):
|
||||
.fetchall()
|
||||
)
|
||||
assert data == [(0, "john", 1), (1, "paul", 2)]
|
||||
|
||||
|
||||
@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)
def test_import_parquet(setup_csv_upload, create_columnar_files):
    """End-to-end test of the columnar (parquet/zip) upload view.

    Exercises fail/append/replace modes, column selection via ``usecols``,
    and replacing the table from a zip archive of multiple parquet files.
    """
    if utils.backend() == "hive":
        pytest.skip("Hive doesn't allow parquet upload.")

    success_msg_f1 = f'Columnar file "[\'{PARQUET_FILENAME1}\']" uploaded to table "{PARQUET_UPLOAD_TABLE}"'

    # initial upload with fail mode
    resp = upload_columnar(PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE)
    assert success_msg_f1 in resp

    # upload again with fail mode; should fail because the table exists
    fail_msg = f'Unable to upload Columnar file "[\'{PARQUET_FILENAME1}\']" to table "{PARQUET_UPLOAD_TABLE}"'
    resp = upload_columnar(PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE)
    assert fail_msg in resp

    # NOTE(review): a former `if utils.backend() != "hive":` guard around the
    # append case was removed — it was always true after the skip above.
    # upload again with append mode
    resp = upload_columnar(
        PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE, extra={"if_exists": "append"}
    )
    assert success_msg_f1 in resp

    # upload again with replace mode and specific columns
    resp = upload_columnar(
        PARQUET_FILENAME1,
        PARQUET_UPLOAD_TABLE,
        extra={"if_exists": "replace", "usecols": '["a"]'},
    )
    assert success_msg_f1 in resp

    # make sure only the specified column was read
    table = SupersetTestCase.get_table(name=PARQUET_UPLOAD_TABLE)
    assert "b" not in table.column_names

    # upload again with replace mode (restores both columns)
    resp = upload_columnar(
        PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE, extra={"if_exists": "replace"}
    )
    assert success_msg_f1 in resp

    data = (
        get_upload_db()
        .get_sqla_engine()
        .execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}")
        .fetchall()
    )
    assert data == [("john", 1), ("paul", 2)]

    # replace table with a zip file containing both parquet files
    resp = upload_columnar(
        ZIP_FILENAME, PARQUET_UPLOAD_TABLE, extra={"if_exists": "replace"}
    )
    success_msg_f2 = f'Columnar file "[\'{ZIP_FILENAME}\']" uploaded to table "{PARQUET_UPLOAD_TABLE}"'
    assert success_msg_f2 in resp

    data = (
        get_upload_db()
        .get_sqla_engine()
        .execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}")
        .fetchall()
    )
    assert data == [("john", 1), ("paul", 2), ("max", 3), ("bob", 4)]
|
||||
|
||||
Reference in New Issue
Block a user