chore: refactor file upload commands (#28164)

Author: Daniel Vaz Gaspar
Date: 2024-04-23 08:42:19 +01:00
Committed by: GitHub
Parent: cfc440c56c
Commit: de82d90b9c
17 changed files with 930 additions and 734 deletions


@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


@@ -0,0 +1,313 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
from datetime import datetime

import numpy as np
import pytest

from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
from tests.unit_tests.fixtures.common import create_csv_file

CSV_DATA = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30", "city1", "1990-02-01"],
    ["name2", "25", "city2", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

CSV_DATA_CHANGED_HEADER = [
    ["name1", "30", "city1", "1990-02-01"],
    ["Name", "Age", "City", "Birth"],
    ["name2", "25", "city2", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

CSV_DATA_WITH_NULLS = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "N/A", "city1", "1990-02-01"],
    ["name2", "25", "None", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

CSV_DATA_DAY_FIRST = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30", "city1", "01-02-1990"],
]

CSV_DATA_DECIMAL_CHAR = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30,1", "city1", "1990-02-01"],
]

CSV_DATA_SKIP_INITIAL_SPACE = [
    [" Name", "Age", "City", "Birth"],
    [" name1", "30", "city1", "1990-02-01"],
]


@pytest.mark.parametrize(
    "file, options, expected_cols, expected_values",
    [
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA, delimiter="|"),
            CSVReaderOptions(delimiter="|"),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30],
                ["name2", 25],
                ["name3", 20],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=["Name", "Age"],
                column_data_types={"Age": "float"},
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
                ["name2", 25.0],
                ["name3", 20.0],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=[],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=[],
                column_data_types={"Age": "float"},
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                rows_to_read=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                rows_to_read=1,
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                skip_rows=1,
            ),
            ["name1", "30", "city1", "1990-02-01"],
            [
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
                ["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
                ["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_CHANGED_HEADER),
            CSVReaderOptions(
                header_row=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_WITH_NULLS),
            CSVReaderOptions(
                null_values=["N/A", "None"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", np.nan, "city1", "1990-02-01"],
                ["name2", 25.0, np.nan, "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DAY_FIRST),
            CSVReaderOptions(
                day_first=False,
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 1, 2, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DAY_FIRST),
            CSVReaderOptions(
                day_first=True,
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DECIMAL_CHAR),
            CSVReaderOptions(
                decimal_character=",",
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.1, "city1", "1990-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_SKIP_INITIAL_SPACE),
            CSVReaderOptions(
                skip_initial_space=True,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
            ],
        ),
    ],
)
def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_values):
    csv_reader = CSVReader(
        options=options,
    )
    df = csv_reader.file_to_dataframe(file)
    assert df.columns.tolist() == expected_cols
    actual_values = df.values.tolist()
    for i in range(len(expected_values)):
        for j in range(len(expected_values[i])):
            expected_val = expected_values[i][j]
            actual_val = actual_values[i][j]
            # NaN never compares equal to itself, so expected NaNs are checked
            # with np.isnan; every other value is compared directly.
            if isinstance(expected_val, float) and np.isnan(expected_val):
                assert np.isnan(actual_val)
            else:
                assert expected_val == actual_val
    file.close()


def test_csv_reader_broken_file_no_columns():
    csv_reader = CSVReader(
        options=CSVReaderOptions(),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        csv_reader.file_to_dataframe(create_csv_file([""]))
    assert str(ex.value) == "Parsing error: No columns to parse from file"


def test_csv_reader_wrong_columns_to_read():
    csv_reader = CSVReader(
        options=CSVReaderOptions(columns_read=["xpto"]),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
    assert str(ex.value) == (
        "Parsing error: Usecols do not match columns, "
        "columns expected but not found: ['xpto']"
    )


def test_csv_reader_invalid_file():
    csv_reader = CSVReader(
        options=CSVReaderOptions(),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        csv_reader.file_to_dataframe(
            io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3")
        )
    assert str(ex.value) == (
        "Parsing error: Error tokenizing data. C error:"
        " Expected 3 fields in line 3, saw 7\n"
    )


def test_csv_reader_invalid_encoding():
    csv_reader = CSVReader(
        options=CSVReaderOptions(),
    )
    binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
    with pytest.raises(DatabaseUploadFailed) as ex:
        csv_reader.file_to_dataframe(io.BytesIO(binary_data))
    assert str(ex.value) == (
        "Parsing error: 'utf-8' codec can't decode byte 0xba in"
        " position 21: invalid start byte"
    )
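
The tests above exercise CSVReader purely through its options object and file_to_dataframe. For orientation, a minimal, hypothetical usage sketch built only from the calls these tests demonstrate (the file contents and option values here are illustrative, not part of the diff):

from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
from tests.unit_tests.fixtures.common import create_csv_file

# Illustrative in-memory CSV; any of the CSV_DATA constants above would work the same way.
file = create_csv_file(
    [
        ["Name", "Age", "City", "Birth"],
        ["name1", "30", "city1", "1990-02-01"],
    ]
)
options = CSVReaderOptions(columns_read=["Name", "Birth"], column_dates=["Birth"])
df = CSVReader(options=options).file_to_dataframe(file)  # pandas-style DataFrame
print(df.columns.tolist())  # ['Name', 'Birth']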


@@ -0,0 +1,209 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import io
from datetime import datetime
from typing import Any

import numpy as np
import pytest

from superset.commands.database.exceptions import DatabaseUploadFailed
from superset.commands.database.uploaders.excel_reader import (
    ExcelReader,
    ExcelReaderOptions,
)
from tests.unit_tests.fixtures.common import create_excel_file

EXCEL_DATA: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [30, 25, 20],
    "City": ["city1", "city2", "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}

EXCEL_WITH_NULLS: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": ["N/A", 25, 20],
    "City": ["city1", "None", "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}

EXCEL_DATA_DECIMAL_CHAR = {
    "Name": ["name1"],
    "Age": ["30,1"],
    "City": ["city1"],
    "Birth": ["1990-02-01"],
}


@pytest.mark.parametrize(
    "file, options, expected_cols, expected_values",
    [
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30],
                ["name2", 25],
                ["name3", 20],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(
                columns_read=[],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(
                rows_to_read=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(
                rows_to_read=1,
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(
                skip_rows=1,
            ),
            ["name1", 30, "city1", "1990-02-01"],
            [
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA),
            ExcelReaderOptions(
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
                ["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
                ["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
            ],
        ),
        (
            create_excel_file(EXCEL_WITH_NULLS),
            ExcelReaderOptions(
                null_values=["N/A", "None"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", np.nan, "city1", "1990-02-01"],
                ["name2", 25.0, np.nan, "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_excel_file(EXCEL_DATA_DECIMAL_CHAR),
            ExcelReaderOptions(
                decimal_character=",",
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.1, "city1", "1990-02-01"],
            ],
        ),
    ],
)
def test_excel_reader_file_to_dataframe(file, options, expected_cols, expected_values):
    excel_reader = ExcelReader(
        options=options,
    )
    df = excel_reader.file_to_dataframe(file)
    assert df.columns.tolist() == expected_cols
    actual_values = df.values.tolist()
    for i in range(len(expected_values)):
        for j in range(len(expected_values[i])):
            expected_val = expected_values[i][j]
            actual_val = actual_values[i][j]
            # NaN never compares equal to itself, so expected NaNs are checked
            # with np.isnan; every other value is compared directly.
            if isinstance(expected_val, float) and np.isnan(expected_val):
                assert np.isnan(actual_val)
            else:
                assert expected_val == actual_val
    file.close()


def test_excel_reader_wrong_columns_to_read():
    excel_reader = ExcelReader(
        options=ExcelReaderOptions(columns_read=["xpto"]),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        excel_reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
    assert str(ex.value) == (
        "Parsing error: Usecols do not match columns, "
        "columns expected but not found: ['xpto'] (sheet: 0)"
    )


def test_excel_reader_wrong_date():
    excel_reader = ExcelReader(
        options=ExcelReaderOptions(column_dates=["xpto"]),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        excel_reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
    assert str(ex.value) == (
        "Parsing error: Missing column provided to 'parse_dates': 'xpto' (sheet: 0)"
    )


def test_excel_reader_invalid_file():
    excel_reader = ExcelReader(
        options=ExcelReaderOptions(),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        excel_reader.file_to_dataframe(io.StringIO("c1"))
    assert str(ex.value) == (
        "Parsing error: Excel file format cannot be determined, "
        "you must specify an engine manually."
    )
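
Both reader test modules depend on fixtures from tests.unit_tests.fixtures.common; create_excel_file is imported but its body is not part of this diff. A rough sketch of how such a helper could be implemented, purely for orientation (the name suffix, default data, and pandas-based approach are assumptions, not the actual fixture):

from io import BytesIO
from typing import Any

import pandas as pd


def create_excel_file_sketch(data: dict[str, list[Any]] | None = None) -> BytesIO:
    # Build a single-sheet workbook in memory from a column -> values mapping,
    # mirroring how EXCEL_DATA is shaped in the tests above.
    data = data or {"Name": ["name1"], "Age": [30]}
    output = BytesIO()
    pd.DataFrame(data).to_excel(output, index=False)
    output.seek(0)
    return output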


@@ -33,8 +33,9 @@ from pytest_mock import MockFixture
from sqlalchemy.orm.session import Session
from superset import db
from superset.commands.database.csv_import import CSVImportCommand
from superset.commands.database.excel_import import ExcelImportCommand
from superset.commands.database.uploaders.base import UploadCommand
from superset.commands.database.uploaders.csv_reader import CSVReader
from superset.commands.database.uploaders.excel_reader import ExcelReader
from superset.db_engine_specs.sqlite import SqliteEngineSpec
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.exceptions import SupersetSecurityException
@@ -829,7 +830,7 @@ def test_oauth2_error(
@pytest.mark.parametrize(
"payload,cmd_called_with",
"payload,upload_called_with,reader_called_with",
[
(
{
@@ -841,6 +842,10 @@ def test_oauth2_error(
1,
"table1",
ANY,
None,
ANY,
),
(
{
"already_exists": "fail",
"delimiter": ",",
@@ -861,6 +866,10 @@ def test_oauth2_error(
1,
"table2",
ANY,
None,
ANY,
),
(
{
"already_exists": "replace",
"column_dates": ["col1", "col2"],
@@ -879,7 +888,6 @@ def test_oauth2_error(
"columns_read": "col1,col2",
"day_first": True,
"rows_to_read": "1",
"overwrite_duplicates": True,
"skip_blank_lines": True,
"skip_initial_space": True,
"skip_rows": "10",
@@ -890,12 +898,15 @@ def test_oauth2_error(
1,
"table2",
ANY,
None,
ANY,
),
(
{
"already_exists": "replace",
"columns_read": ["col1", "col2"],
"null_values": ["None", "N/A", "''"],
"day_first": True,
"overwrite_duplicates": True,
"rows_to_read": 1,
"skip_blank_lines": True,
"skip_initial_space": True,
@@ -911,7 +922,8 @@ def test_oauth2_error(
)
def test_csv_upload(
payload: dict[str, Any],
cmd_called_with: tuple[int, str, Any, dict[str, Any]],
upload_called_with: tuple[int, str, Any, dict[str, Any]],
reader_called_with: dict[str, Any],
mocker: MockFixture,
client: Any,
full_api_access: None,
@@ -919,9 +931,11 @@ def test_csv_upload(
"""
Test CSV Upload success.
"""
init_mock = mocker.patch.object(CSVImportCommand, "__init__")
init_mock = mocker.patch.object(UploadCommand, "__init__")
init_mock.return_value = None
_ = mocker.patch.object(CSVImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
reader_mock = mocker.patch.object(CSVReader, "__init__")
reader_mock.return_value = None
response = client.post(
f"/api/v1/database/1/csv_upload/",
data=payload,
@@ -929,7 +943,8 @@ def test_csv_upload(
)
assert response.status_code == 200
assert response.json == {"message": "OK"}
init_mock.assert_called_with(*cmd_called_with)
init_mock.assert_called_with(*upload_called_with)
reader_mock.assert_called_with(*reader_called_with)
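
The mocks above show the shape of the refactor: the API layer now builds a generic UploadCommand and hands it a file reader, instead of instantiating a format-specific CSVImportCommand or ExcelImportCommand. A hedged sketch of that wiring, with the meaning of the positional arguments assumed from the parametrized call tuples (1, "table2", ANY, None, ANY) rather than taken from the real signature:

from superset.commands.database.uploaders.base import UploadCommand
from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
from tests.unit_tests.fixtures.common import create_csv_file

# Hypothetical composition; argument roles are assumptions inferred from the
# asserted tuples in the tests above.
reader = CSVReader(options=CSVReaderOptions(delimiter=","))
command = UploadCommand(
    1,                  # database pk
    "table2",           # target table name
    create_csv_file(),  # uploaded file object (ANY in the test)
    None,               # schema (None in these parametrized cases)
    reader,             # pluggable reader: CSVReader or ExcelReader
)
command.run()
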
@pytest.mark.parametrize(
@@ -994,16 +1009,6 @@ def test_csv_upload(
},
{"message": {"header_row": ["Not a valid integer."]}},
),
(
{
"file": (create_csv_file(), "out.csv"),
"table_name": "table1",
"delimiter": ",",
"already_exists": "fail",
"overwrite_duplicates": "test1",
},
{"message": {"overwrite_duplicates": ["Not a valid boolean."]}},
),
(
{
"file": (create_csv_file(), "out.csv"),
@@ -1066,7 +1071,7 @@ def test_csv_upload_validation(
"""
Test CSV Upload validation fails.
"""
_ = mocker.patch.object(CSVImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
response = client.post(
f"/api/v1/database/1/csv_upload/",
@@ -1085,7 +1090,7 @@ def test_csv_upload_file_size_validation(
"""
Test CSV Upload validation fails.
"""
_ = mocker.patch.object(CSVImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
current_app.config["CSV_UPLOAD_MAX_SIZE"] = 5
response = client.post(
f"/api/v1/database/1/csv_upload/",
@@ -1127,7 +1132,7 @@ def test_csv_upload_file_extension_invalid(
"""
Test CSV Upload validation fails.
"""
_ = mocker.patch.object(CSVImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
response = client.post(
f"/api/v1/database/1/csv_upload/",
data={
@@ -1163,7 +1168,7 @@ def test_csv_upload_file_extension_valid(
"""
Test CSV Upload validation fails.
"""
_ = mocker.patch.object(CSVImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
response = client.post(
f"/api/v1/database/1/csv_upload/",
data={
@@ -1177,7 +1182,7 @@ def test_csv_upload_file_extension_valid(
@pytest.mark.parametrize(
"payload,cmd_called_with",
"payload,upload_called_with,reader_called_with",
[
(
{
@@ -1188,6 +1193,10 @@ def test_csv_upload_file_extension_valid(
1,
"table1",
ANY,
None,
ANY,
),
(
{
"already_exists": "fail",
"file": ANY,
@@ -1207,6 +1216,10 @@ def test_csv_upload_file_extension_valid(
1,
"table2",
ANY,
None,
ANY,
),
(
{
"already_exists": "replace",
"column_dates": ["col1", "col2"],
@@ -1231,6 +1244,10 @@ def test_csv_upload_file_extension_valid(
1,
"table2",
ANY,
None,
ANY,
),
(
{
"already_exists": "replace",
"columns_read": ["col1", "col2"],
@@ -1247,7 +1264,8 @@ def test_csv_upload_file_extension_valid(
)
def test_excel_upload(
payload: dict[str, Any],
cmd_called_with: tuple[int, str, Any, dict[str, Any]],
upload_called_with: tuple[int, str, Any, dict[str, Any]],
reader_called_with: dict[str, Any],
mocker: MockFixture,
client: Any,
full_api_access: None,
@@ -1255,9 +1273,11 @@ def test_excel_upload(
"""
Test Excel Upload success.
"""
init_mock = mocker.patch.object(ExcelImportCommand, "__init__")
init_mock = mocker.patch.object(UploadCommand, "__init__")
init_mock.return_value = None
_ = mocker.patch.object(ExcelImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
reader_mock = mocker.patch.object(ExcelReader, "__init__")
reader_mock.return_value = None
response = client.post(
f"/api/v1/database/1/excel_upload/",
data=payload,
@@ -1265,7 +1285,8 @@ def test_excel_upload(
)
assert response.status_code == 200
assert response.json == {"message": "OK"}
init_mock.assert_called_with(*cmd_called_with)
init_mock.assert_called_with(*upload_called_with)
reader_mock.assert_called_with(*reader_called_with)
@pytest.mark.parametrize(
@@ -1347,7 +1368,7 @@ def test_excel_upload_validation(
"""
Test Excel Upload validation fails.
"""
_ = mocker.patch.object(ExcelImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
response = client.post(
f"/api/v1/database/1/excel_upload/",
@@ -1382,7 +1403,7 @@ def test_excel_upload_file_extension_invalid(
"""
Test Excel Upload file extension fails.
"""
_ = mocker.patch.object(ExcelImportCommand, "run")
_ = mocker.patch.object(UploadCommand, "run")
response = client.post(
f"/api/v1/database/1/excel_upload/",
data={


@@ -31,7 +31,7 @@ def dttm() -> datetime:
return datetime.strptime("2019-01-02 03:04:05.678900", "%Y-%m-%d %H:%M:%S.%f")
def create_csv_file(data: list[list[str]] | None = None) -> BytesIO:
def create_csv_file(data: list[list[str]] | None = None, delimiter=",") -> BytesIO:
data = (
[
["Name", "Age", "City"],
@@ -42,7 +42,7 @@ def create_csv_file(data: list[list[str]] | None = None) -> BytesIO:
)
output = StringIO()
writer = csv.writer(output)
writer = csv.writer(output, delimiter=delimiter)
for row in data:
writer.writerow(row)
output.seek(0)
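
The hunk above ends before the helper's return statement, while its signature declares a BytesIO return type. A sketch of the full updated helper under that assumption (the default rows beyond the header are elided by the diff, and the final encode/return step is inferred, not shown):

import csv
from io import BytesIO, StringIO

# Only the header of the default data is visible in the hunk; the remaining
# default rows are elided by the diff.
DEFAULT_ROWS = [
    ["Name", "Age", "City"],
]


def create_csv_file_sketch(data: list[list[str]] | None = None, delimiter=",") -> BytesIO:
    data = DEFAULT_ROWS if data is None else data
    output = StringIO()
    writer = csv.writer(output, delimiter=delimiter)
    for row in data:
        writer.writerow(row)
    output.seek(0)
    # Inferred: the declared return type is BytesIO, so the text buffer is
    # presumably encoded before being handed back to the caller.
    return BytesIO(output.getvalue().encode("utf-8"))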