feat: Add parquet upload (#14449)

* allow csv upload to accept parquet file

* fix mypy

* fix if statement

* add test for specifying columns in CSV upload

* clean up test

* change order in test

* fix failures

* upload parquet to separate table in test

* fix error message

* fix mypy again

* rename other extensions to columnar

* add new form for columnar upload

* add support for zip files (see the sketch after this list)

* undo csv form changes except usecols

* add more tests for zip

* isort & black

* pylint

* fix trailing space

* address more review comments

* pylint

* black

* resolve remaining issues
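
At its core, the new upload path reads one or more columnar files (a single parquet file, or every parquet file inside a zip archive) into one DataFrame before writing it to the database. The sketch below only illustrates that flow; read_columnar_files and its signature are assumptions for exposition, not the helper actually added in this PR.

import io
import zipfile
from typing import List, Optional

import pandas as pd


def read_columnar_files(
    file_path: str, usecols: Optional[List[str]] = None
) -> pd.DataFrame:
    # Illustrative sketch (not this PR's helper): read a single parquet
    # file, or every parquet file inside a zip archive, into one
    # concatenated DataFrame. `usecols` mirrors the form field and maps
    # to read_parquet's `columns` argument.
    if file_path.endswith(".zip"):
        with zipfile.ZipFile(file_path) as archive:
            frames = [
                pd.read_parquet(io.BytesIO(archive.read(name)), columns=usecols)
                for name in archive.namelist()
                if name.endswith(".parquet")
            ]
        return pd.concat(frames, ignore_index=True)
    return pd.read_parquet(file_path, columns=usecols)

Restricting columns at read time is what the '["a"]' usecols cases in the tests below exercise: only column "a" is loaded, so "b" must be absent from the resulting table.
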
Shiva Raisinghani
2021-08-31 00:20:25 -07:00, committed by GitHub
parent ad8336a5b4
commit d25b0967a1
10 changed files with 493 additions and 10 deletions

@@ -19,6 +19,7 @@
import json
import logging
import os
import shutil
from typing import Dict, Optional
from unittest import mock
@@ -43,9 +44,14 @@ CSV_UPLOAD_DATABASE = "csv_explore_db"
CSV_FILENAME1 = "testCSV1.csv"
CSV_FILENAME2 = "testCSV2.csv"
EXCEL_FILENAME = "testExcel.xlsx"
PARQUET_FILENAME1 = "testZip/testParquet1.parquet"
PARQUET_FILENAME2 = "testZip/testParquet2.parquet"
ZIP_DIRNAME = "testZip"
ZIP_FILENAME = "testZip.zip"
EXCEL_UPLOAD_TABLE = "excel_upload"
CSV_UPLOAD_TABLE = "csv_upload"
PARQUET_UPLOAD_TABLE = "parquet_upload"
CSV_UPLOAD_TABLE_W_SCHEMA = "csv_upload_w_schema"
CSV_UPLOAD_TABLE_W_EXPLORE = "csv_upload_w_explore"
@@ -70,6 +76,7 @@ def setup_csv_upload():
    engine = upload_db.get_sqla_engine()
    engine.execute(f"DROP TABLE IF EXISTS {EXCEL_UPLOAD_TABLE}")
    engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE}")
    engine.execute(f"DROP TABLE IF EXISTS {PARQUET_UPLOAD_TABLE}")
    engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE_W_SCHEMA}")
    engine.execute(f"DROP TABLE IF EXISTS {CSV_UPLOAD_TABLE_W_EXPLORE}")
    db.session.delete(upload_db)
@@ -97,6 +104,17 @@ def create_excel_files():
    os.remove(EXCEL_FILENAME)


@pytest.fixture()
def create_columnar_files():
    os.mkdir(ZIP_DIRNAME)
    pd.DataFrame({"a": ["john", "paul"], "b": [1, 2]}).to_parquet(PARQUET_FILENAME1)
    pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]}).to_parquet(PARQUET_FILENAME2)
    shutil.make_archive(ZIP_DIRNAME, "zip", ZIP_DIRNAME)
    yield
    os.remove(ZIP_FILENAME)
    shutil.rmtree(ZIP_DIRNAME)


def get_upload_db():
    return db.session.query(Database).filter_by(database_name=CSV_UPLOAD_DATABASE).one()
@@ -134,6 +152,22 @@ def upload_excel(
    return get_resp(test_client, "/exceltodatabaseview/form", data=form_data)


def upload_columnar(
    filename: str, table_name: str, extra: Optional[Dict[str, str]] = None
):
    columnar_upload_db_id = get_upload_db().id
    form_data = {
        "columnar_file": open(filename, "rb"),
        "name": table_name,
        "con": columnar_upload_db_id,
        "if_exists": "fail",
        "index_label": "test_label",
    }
    if extra:
        form_data.update(extra)
    return get_resp(test_client, "/columnartodatabaseview/form", data=form_data)


def mock_upload_to_s3(filename: str, upload_prefix: str, table: Table) -> str:
    """
    HDFS is used instead of S3 for the unit tests.
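
The hunk above is truncated before the body of mock_upload_to_s3. Purely as a hypothetical stand-in (not the code from this PR), a local stub only needs to copy the file somewhere the test warehouse can read and return that location:

import os
import shutil


def mock_upload_to_s3_stub(filename: str, upload_prefix: str, table) -> str:
    # Hypothetical local replacement for an S3 upload: copy the file into
    # a scratch "warehouse" directory and return the directory path, the
    # way an S3 prefix would be returned. Path and names are illustrative.
    dest_dir = os.path.join("/tmp/test_warehouse", upload_prefix)
    os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(filename, dest_dir)
    return dest_dir
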
@@ -249,6 +283,18 @@ def test_import_csv(setup_csv_upload, create_csv_files):
    )
    assert success_msg_f1 in resp

    # upload again with replace mode and specific columns
    resp = upload_csv(
        CSV_FILENAME1,
        CSV_UPLOAD_TABLE,
        extra={"if_exists": "replace", "usecols": '["a"]'},
    )
    assert success_msg_f1 in resp

    # make sure only specified column name was read
    table = SupersetTestCase.get_table(name=CSV_UPLOAD_TABLE)
    assert "b" not in table.column_names

    # upload again with replace mode
    resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"})
    assert success_msg_f1 in resp
@@ -328,3 +374,68 @@ def test_import_excel(setup_csv_upload, create_excel_files):
        .fetchall()
    )
    assert data == [(0, "john", 1), (1, "paul", 2)]


@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)
def test_import_parquet(setup_csv_upload, create_columnar_files):
    if utils.backend() == "hive":
        pytest.skip("Hive doesn't allow parquet upload.")

    success_msg_f1 = f'Columnar file "[\'{PARQUET_FILENAME1}\']" uploaded to table "{PARQUET_UPLOAD_TABLE}"'

    # initial upload with fail mode
    resp = upload_columnar(PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE)
    assert success_msg_f1 in resp

    # upload again with fail mode; should fail
    fail_msg = f'Unable to upload Columnar file "[\'{PARQUET_FILENAME1}\']" to table "{PARQUET_UPLOAD_TABLE}"'
    resp = upload_columnar(PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE)
    assert fail_msg in resp

    if utils.backend() != "hive":
        # upload again with append mode
        resp = upload_columnar(
            PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE, extra={"if_exists": "append"}
        )
        assert success_msg_f1 in resp

    # upload again with replace mode and specific columns
    resp = upload_columnar(
        PARQUET_FILENAME1,
        PARQUET_UPLOAD_TABLE,
        extra={"if_exists": "replace", "usecols": '["a"]'},
    )
    assert success_msg_f1 in resp

    # make sure only specified column name was read
    table = SupersetTestCase.get_table(name=PARQUET_UPLOAD_TABLE)
    assert "b" not in table.column_names

    # upload again with replace mode
    resp = upload_columnar(
        PARQUET_FILENAME1, PARQUET_UPLOAD_TABLE, extra={"if_exists": "replace"}
    )
    assert success_msg_f1 in resp

    data = (
        get_upload_db()
        .get_sqla_engine()
        .execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}")
        .fetchall()
    )
    assert data == [("john", 1), ("paul", 2)]

    # replace table with zip file
    resp = upload_columnar(
        ZIP_FILENAME, PARQUET_UPLOAD_TABLE, extra={"if_exists": "replace"}
    )
    success_msg_f2 = f'Columnar file "[\'{ZIP_FILENAME}\']" uploaded to table "{PARQUET_UPLOAD_TABLE}"'
    assert success_msg_f2 in resp

    data = (
        get_upload_db()
        .get_sqla_engine()
        .execute(f"SELECT * from {PARQUET_UPLOAD_TABLE}")
        .fetchall()
    )
    assert data == [("john", 1), ("paul", 2), ("max", 3), ("bob", 4)]
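
The final assertion follows directly from the fixture data: uploading the zip loads both parquet files written by create_columnar_files into one table. The standalone check below just mirrors that arithmetic, with no Superset involved:

import pandas as pd

# Same frames that create_columnar_files writes to the two parquet files.
df1 = pd.DataFrame({"a": ["john", "paul"], "b": [1, 2]})
df2 = pd.DataFrame({"a": ["max", "bob"], "b": [3, 4]})

# The zip upload concatenates both files, hence four rows in the table.
combined = pd.concat([df1, df2], ignore_index=True)
assert list(combined.itertuples(index=False, name=None)) == [
    ("john", 1),
    ("paul", 2),
    ("max", 3),
    ("bob", 4),
]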