mirror of
https://github.com/apache/superset.git
synced 2026-04-13 21:24:28 +00:00
chore: refactor file upload commands (#28164)
This commit is contained in:
committed by
GitHub
parent
cfc440c56c
commit
de82d90b9c
313
tests/unit_tests/commands/databases/csv_reader_test.py
Normal file
313
tests/unit_tests/commands/databases/csv_reader_test.py
Normal file
@@ -0,0 +1,313 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import io
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from superset.commands.database.exceptions import DatabaseUploadFailed
|
||||
from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
|
||||
from tests.unit_tests.fixtures.common import create_csv_file
|
||||
|
||||
# Well-formed fixture: header row followed by three data rows.
CSV_DATA = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30", "city1", "1990-02-01"],
    ["name2", "25", "city2", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

# Same rows as CSV_DATA but the header appears on the second line,
# so reading it requires header_row=1.
CSV_DATA_CHANGED_HEADER = [
    ["name1", "30", "city1", "1990-02-01"],
    ["Name", "Age", "City", "Birth"],
    ["name2", "25", "city2", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

# Contains "N/A" and "None" tokens that should map to NaN when listed
# in CSVReaderOptions(null_values=...).
CSV_DATA_WITH_NULLS = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "N/A", "city1", "1990-02-01"],
    ["name2", "25", "None", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

# Ambiguous date "01-02-1990": parses as Feb 1 with day_first=True and
# as Jan 2 with day_first=False.
CSV_DATA_DAY_FIRST = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30", "city1", "01-02-1990"],
]

# Uses a comma as the decimal separator ("30,1"), exercising
# CSVReaderOptions(decimal_character=",").
CSV_DATA_DECIMAL_CHAR = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30,1", "city1", "1990-02-01"],
]

# Leading spaces in header and data cells, exercising
# CSVReaderOptions(skip_initial_space=True).
CSV_DATA_SKIP_INITIAL_SPACE = [
    [" Name", "Age", "City", "Birth"],
    [" name1", "30", "city1", "1990-02-01"],
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "file, options, expected_cols, expected_values",
    [
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA, delimiter="|"),
            CSVReaderOptions(delimiter="|"),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30],
                ["name2", 25],
                ["name3", 20],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=["Name", "Age"],
                column_data_types={"Age": "float"},
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
                ["name2", 25.0],
                ["name3", 20.0],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=[],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=[],
                column_data_types={"Age": "float"},
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                rows_to_read=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                rows_to_read=1,
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                skip_rows=1,
            ),
            ["name1", "30", "city1", "1990-02-01"],
            [
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
                ["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
                ["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_CHANGED_HEADER),
            CSVReaderOptions(
                header_row=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_WITH_NULLS),
            CSVReaderOptions(
                null_values=["N/A", "None"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", np.nan, "city1", "1990-02-01"],
                ["name2", 25.0, np.nan, "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DAY_FIRST),
            CSVReaderOptions(
                day_first=False,
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 1, 2, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DAY_FIRST),
            CSVReaderOptions(
                day_first=True,
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DECIMAL_CHAR),
            CSVReaderOptions(
                decimal_character=",",
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.1, "city1", "1990-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_SKIP_INITIAL_SPACE),
            CSVReaderOptions(
                skip_initial_space=True,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
            ],
        ),
    ],
)
def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_values):
    """
    CSVReader.file_to_dataframe honors each CSVReaderOptions knob.

    Compares column names and cell values against the expected matrix.
    NaN cells are compared by NaN-ness (NaN != NaN under ``==``); all
    other cells are compared by value.
    """
    csv_reader = CSVReader(
        options=options,
    )
    try:
        df = csv_reader.file_to_dataframe(file)
        assert df.columns.tolist() == expected_cols
        actual_values = df.values.tolist()
        # Mirrors the implicit requirement of indexed access: there must be
        # at least as many actual rows as expected rows.
        assert len(actual_values) >= len(expected_values)
        for expected_row, actual_row in zip(expected_values, actual_values):
            for expected_val, actual_val in zip(expected_row, actual_row):
                if isinstance(expected_val, float) and np.isnan(expected_val):
                    # NaN compares unequal to itself, so check NaN-ness.
                    assert isinstance(actual_val, float) and np.isnan(actual_val)
                else:
                    # Fix: the previous version only compared np.isnan() for
                    # float pairs, so differing non-NaN floats passed silently.
                    assert actual_val == expected_val
    finally:
        # Close the fixture file even when an assertion above fails.
        file.close()
|
||||
|
||||
|
||||
def test_csv_reader_broken_file_no_columns():
    """An effectively empty CSV surfaces pandas' error as DatabaseUploadFailed."""
    reader = CSVReader(options=CSVReaderOptions())
    with pytest.raises(DatabaseUploadFailed) as excinfo:
        reader.file_to_dataframe(create_csv_file([""]))
    assert str(excinfo.value) == "Parsing error: No columns to parse from file"
|
||||
|
||||
|
||||
def test_csv_reader_wrong_columns_to_read():
    """Requesting a column the file lacks raises DatabaseUploadFailed."""
    reader = CSVReader(options=CSVReaderOptions(columns_read=["xpto"]))
    with pytest.raises(DatabaseUploadFailed) as excinfo:
        reader.file_to_dataframe(create_csv_file(CSV_DATA))
    expected_message = (
        "Parsing error: Usecols do not match columns, "
        "columns expected but not found: ['xpto']"
    )
    assert str(excinfo.value) == expected_message
|
||||
|
||||
|
||||
def test_csv_reader_invalid_file():
    """A row with more fields than the header raises DatabaseUploadFailed."""
    reader = CSVReader(options=CSVReaderOptions())
    # Line 3 has 7 fields while the header declares only 3.
    malformed = io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3")
    with pytest.raises(DatabaseUploadFailed) as excinfo:
        reader.file_to_dataframe(malformed)
    expected_message = (
        "Parsing error: Error tokenizing data. C error:"
        " Expected 3 fields in line 3, saw 7\n"
    )
    assert str(excinfo.value) == expected_message
|
||||
|
||||
|
||||
def test_csv_reader_invalid_encoding():
    """Bytes that are not valid UTF-8 raise DatabaseUploadFailed."""
    reader = CSVReader(options=CSVReaderOptions())
    # 0xba is not a valid UTF-8 start byte.
    binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
    with pytest.raises(DatabaseUploadFailed) as excinfo:
        reader.file_to_dataframe(io.BytesIO(binary_data))
    expected_message = (
        "Parsing error: 'utf-8' codec can't decode byte 0xba in"
        " position 21: invalid start byte"
    )
    assert str(excinfo.value) == expected_message
|
||||
Reference in New Issue
Block a user