feat: new CSV upload form and API (#27840)

This commit is contained in:
Daniel Vaz Gaspar
2024-04-15 09:38:51 +01:00
committed by GitHub
parent 40e77be813
commit 54387b4589
30 changed files with 2883 additions and 873 deletions

View File

@@ -20,7 +20,7 @@ import json
import logging
import os
import shutil
from typing import Optional, Union
from typing import Optional
from unittest import mock
@@ -129,31 +129,6 @@ def get_upload_db():
return db.session.query(Database).filter_by(database_name=CSV_UPLOAD_DATABASE).one()
def upload_csv(
filename: str,
table_name: str,
extra: Optional[dict[str, str]] = None,
dtype: Union[str, None] = None,
):
csv_upload_db_id = get_upload_db().id
form_data = {
"csv_file": open(filename, "rb"),
"delimiter": ",",
"table_name": table_name,
"database": csv_upload_db_id,
"if_exists": "fail",
"index_label": "test_label",
"overwrite_duplicate": False,
}
if schema := utils.get_example_default_schema():
form_data["schema"] = schema
if extra:
form_data.update(extra)
if dtype:
form_data["dtype"] = dtype
return get_resp(test_client, "/csvtodatabaseview/form", data=form_data)
def upload_excel(
filename: str, table_name: str, extra: Optional[dict[str, str]] = None
):
@@ -224,205 +199,6 @@ def escaped_parquet(text):
return escaped_double_quotes(f"['{text}']")
@pytest.mark.usefixtures("setup_csv_upload_with_context")
@pytest.mark.usefixtures("create_csv_files")
@mock.patch(
"superset.models.core.config",
{**app.config, "ALLOWED_USER_CSV_SCHEMA_FUNC": lambda d, u: ["admin_database"]},
)
@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)
@mock.patch("superset.views.database.views.event_logger.log_with_context")
def test_import_csv_enforced_schema(mock_event_logger):
if utils.backend() == "sqlite":
pytest.skip("Sqlite doesn't support schema / database creation")
if utils.backend() == "mysql":
pytest.skip("This test is flaky on MySQL")
full_table_name = f"admin_database.{CSV_UPLOAD_TABLE_W_SCHEMA}"
# Invalid table name
resp = upload_csv(CSV_FILENAME1, full_table_name)
assert "Table name cannot contain a schema" in resp
# no schema specified, fail upload
resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE_W_SCHEMA, extra={"schema": None})
assert (
f"Database {escaped_double_quotes(CSV_UPLOAD_DATABASE)} schema"
f" {escaped_double_quotes('None')} is not allowed for csv uploads" in resp
)
success_msg = f"CSV file {escaped_double_quotes(CSV_FILENAME1)} uploaded to table {escaped_double_quotes(full_table_name)}"
resp = upload_csv(
CSV_FILENAME1,
CSV_UPLOAD_TABLE_W_SCHEMA,
extra={"schema": "admin_database", "if_exists": "replace"},
)
assert success_msg in resp
mock_event_logger.assert_called_with(
action="successful_csv_upload",
database=get_upload_db().name,
schema="admin_database",
table=CSV_UPLOAD_TABLE_W_SCHEMA,
)
with get_upload_db().get_sqla_engine() as engine:
data = engine.execute(
f"SELECT * from {ADMIN_SCHEMA_NAME}.{CSV_UPLOAD_TABLE_W_SCHEMA} ORDER BY b"
).fetchall()
assert data == [("john", 1), ("paul", 2)]
# user specified schema doesn't match, fail
resp = upload_csv(
CSV_FILENAME1, CSV_UPLOAD_TABLE_W_SCHEMA, extra={"schema": "gold"}
)
assert (
f'Database {escaped_double_quotes(CSV_UPLOAD_DATABASE)} schema {escaped_double_quotes("gold")} is not allowed for csv uploads'
in resp
)
# user specified schema matches the expected schema, append
if utils.backend() == "hive":
pytest.skip("Hive database doesn't support append csv uploads.")
resp = upload_csv(
CSV_FILENAME1,
CSV_UPLOAD_TABLE_W_SCHEMA,
extra={"schema": "admin_database", "if_exists": "append"},
)
assert success_msg in resp
# Clean up
with get_upload_db().get_sqla_engine() as engine:
engine.execute(f"DROP TABLE {full_table_name}")
@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)
def test_import_csv_explore_database(setup_csv_upload_with_context, create_csv_files):
schema = utils.get_example_default_schema()
full_table_name = (
f"{schema}.{CSV_UPLOAD_TABLE_W_EXPLORE}"
if schema
else CSV_UPLOAD_TABLE_W_EXPLORE
)
if utils.backend() == "sqlite":
pytest.skip("Sqlite doesn't support schema / database creation")
resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE_W_EXPLORE)
assert (
f"CSV file {escaped_double_quotes(CSV_FILENAME1)} uploaded to table {escaped_double_quotes(full_table_name)}"
in resp
)
table = SupersetTestCase.get_table(name=CSV_UPLOAD_TABLE_W_EXPLORE)
assert table.database_id == superset.utils.database.get_example_database().id
@pytest.mark.usefixtures("setup_csv_upload_with_context")
@pytest.mark.usefixtures("create_csv_files")
@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)
@mock.patch("superset.views.database.views.event_logger.log_with_context")
def test_import_csv(mock_event_logger):
schema = utils.get_example_default_schema()
full_table_name = f"{schema}.{CSV_UPLOAD_TABLE}" if schema else CSV_UPLOAD_TABLE
success_msg_f1 = f"CSV file {escaped_double_quotes(CSV_FILENAME1)} uploaded to table {escaped_double_quotes(full_table_name)}"
test_db = get_upload_db()
# initial upload with fail mode
resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE)
assert success_msg_f1 in resp
# upload again with fail mode; should fail
fail_msg = f"Unable to upload CSV file {escaped_double_quotes(CSV_FILENAME1)} to table {escaped_double_quotes(CSV_UPLOAD_TABLE)}"
resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE)
assert fail_msg in resp
if utils.backend() != "hive":
# upload again with append mode
resp = upload_csv(
CSV_FILENAME1, CSV_UPLOAD_TABLE, extra={"if_exists": "append"}
)
assert success_msg_f1 in resp
mock_event_logger.assert_called_with(
action="successful_csv_upload",
database=test_db.name,
schema=schema,
table=CSV_UPLOAD_TABLE,
)
# upload again with replace mode
resp = upload_csv(CSV_FILENAME1, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"})
assert success_msg_f1 in resp
# try to append to table from file with different schema
resp = upload_csv(CSV_FILENAME2, CSV_UPLOAD_TABLE, extra={"if_exists": "append"})
fail_msg_f2 = f"Unable to upload CSV file {escaped_double_quotes(CSV_FILENAME2)} to table {escaped_double_quotes(CSV_UPLOAD_TABLE)}"
assert fail_msg_f2 in resp
# replace table from file with different schema
resp = upload_csv(CSV_FILENAME2, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"})
success_msg_f2 = f"CSV file {escaped_double_quotes(CSV_FILENAME2)} uploaded to table {escaped_double_quotes(full_table_name)}"
assert success_msg_f2 in resp
table = SupersetTestCase.get_table(name=CSV_UPLOAD_TABLE)
# make sure the new column name is reflected in the table metadata
assert "d" in table.column_names
# ensure user is assigned as an owner
assert security_manager.find_user("admin") in table.owners
# null values are set
upload_csv(
CSV_FILENAME2,
CSV_UPLOAD_TABLE,
extra={"null_values": '["", "john"]', "if_exists": "replace"},
)
# make sure that john and empty string are replaced with None
with test_db.get_sqla_engine() as engine:
data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY c").fetchall()
assert data == [(None, 1, "x"), ("paul", 2, None)]
# default null values
upload_csv(CSV_FILENAME2, CSV_UPLOAD_TABLE, extra={"if_exists": "replace"})
# make sure that john and empty string are replaced with None
data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY c").fetchall()
assert data == [("john", 1, "x"), ("paul", 2, None)]
# cleanup
with get_upload_db().get_sqla_engine() as engine:
engine.execute(f"DROP TABLE {full_table_name}")
# with dtype
upload_csv(
CSV_FILENAME1,
CSV_UPLOAD_TABLE,
dtype='{"a": "string", "b": "float64"}',
)
# you can change the type to something compatible, like an object to string
# or an int to a float
# file upload should work as normal
with test_db.get_sqla_engine() as engine:
data = engine.execute(f"SELECT * from {CSV_UPLOAD_TABLE} ORDER BY b").fetchall()
assert data == [("john", 1), ("paul", 2)]
# cleanup
with get_upload_db().get_sqla_engine() as engine:
engine.execute(f"DROP TABLE {full_table_name}")
# with dtype - wrong type
resp = upload_csv(
CSV_FILENAME1,
CSV_UPLOAD_TABLE,
dtype='{"a": "int"}',
)
# you cannot pass an incompatible dtype
fail_msg = f"Unable to upload CSV file {escaped_double_quotes(CSV_FILENAME1)} to table {escaped_double_quotes(CSV_UPLOAD_TABLE)}"
assert fail_msg in resp
@pytest.mark.usefixtures("setup_csv_upload_with_context")
@pytest.mark.usefixtures("create_excel_files")
@mock.patch("superset.db_engine_specs.hive.upload_to_s3", mock_upload_to_s3)