mirror of
https://github.com/apache/superset.git
synced 2026-04-22 01:24:43 +00:00
879 lines
27 KiB
Python
879 lines
27 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
import io
|
|
from datetime import datetime
|
|
|
|
import numpy as np
|
|
import pytest
|
|
from werkzeug.datastructures import FileStorage
|
|
|
|
from superset.commands.database.exceptions import DatabaseUploadFailed
|
|
from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
|
|
from tests.unit_tests.fixtures.common import create_csv_file
|
|
|
|
# Baseline fixture: a header row followed by three well-formed data rows.
CSV_DATA = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30", "city1", "1990-02-01"],
    ["name2", "25", "city2", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

# Same rows as CSV_DATA, but the header is the second line (row index 1).
CSV_DATA_CHANGED_HEADER = [
    ["name1", "30", "city1", "1990-02-01"],
    ["Name", "Age", "City", "Birth"],
    ["name2", "25", "city2", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

# Contains the placeholder strings "N/A" and "None" used as null markers.
CSV_DATA_WITH_NULLS = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "N/A", "city1", "1990-02-01"],
    ["name2", "25", "None", "1995-02-01"],
    ["name3", "20", "city3", "2000-02-01"],
]

# Ambiguous date "01-02-1990": parsed as 2 Jan or 1 Feb depending on day_first.
CSV_DATA_DAY_FIRST = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30", "city1", "01-02-1990"],
]

# The Age value uses a comma as the decimal separator.
CSV_DATA_DECIMAL_CHAR = [
    ["Name", "Age", "City", "Birth"],
    ["name1", "30,1", "city1", "1990-02-01"],
]

# The first field of every row starts with a space.
CSV_DATA_SKIP_INITIAL_SPACE = [
    [" Name", "Age", "City", "Birth"],
    [" name1", "30", "city1", "1990-02-01"],
]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "file, options, expected_cols, expected_values",
    [
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA, delimiter="|"),
            CSVReaderOptions(delimiter="|"),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30],
                ["name2", 25],
                ["name3", 20],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=["Name", "Age"],
                column_data_types={"Age": "float"},
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
                ["name2", 25.0],
                ["name3", 20.0],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=[],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                columns_read=[],
                column_data_types={"Age": "float"},
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                rows_to_read=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.0, "city1", "1990-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                rows_to_read=1,
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30.0],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                skip_rows=1,
            ),
            ["name1", "30", "city1", "1990-02-01"],
            [
                ["name2", 25.0, "city2", "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA),
            CSVReaderOptions(
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
                ["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
                ["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_CHANGED_HEADER),
            CSVReaderOptions(
                header_row=1,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_WITH_NULLS),
            CSVReaderOptions(
                null_values=["N/A", "None"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", np.nan, "city1", "1990-02-01"],
                ["name2", 25.0, np.nan, "1995-02-01"],
                ["name3", 20.0, "city3", "2000-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DAY_FIRST),
            CSVReaderOptions(
                day_first=False,
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 1, 2, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DAY_FIRST),
            CSVReaderOptions(
                day_first=True,
                column_dates=["Birth"],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
            ],
        ),
        (
            create_csv_file(CSV_DATA_DECIMAL_CHAR),
            CSVReaderOptions(
                decimal_character=",",
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.1, "city1", "1990-02-01"],
            ],
        ),
        (
            create_csv_file(CSV_DATA_SKIP_INITIAL_SPACE),
            CSVReaderOptions(
                skip_initial_space=True,
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
            ],
        ),
    ],
)
def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_values):
    """Check the DataFrame produced by ``file_to_dataframe`` for each option set.

    Args:
        file: uploaded-file object built by ``create_csv_file``; closed here.
        options: ``CSVReaderOptions`` under test.
        expected_cols: column labels the resulting DataFrame must have.
        expected_values: row-major cell values the DataFrame must contain.
    """
    csv_reader = CSVReader(
        options=options,
    )
    try:
        df = csv_reader.file_to_dataframe(file)
        assert df.columns.tolist() == expected_cols

        actual_values = df.values.tolist()
        # Guard against extra or missing rows, which a loop bounded by
        # ``expected_values`` alone would not notice.
        assert len(actual_values) == len(expected_values)

        for expected_row, actual_row in zip(expected_values, actual_values):
            assert len(actual_row) == len(expected_row)
            for expected_val, actual_val in zip(expected_row, actual_row):
                # NaN != NaN, so a pair of NaNs is the only case that needs
                # special treatment; every other pair (including ordinary
                # floats) must compare equal.
                both_nan = (
                    isinstance(expected_val, float)
                    and isinstance(actual_val, float)
                    and np.isnan(expected_val)
                    and np.isnan(actual_val)
                )
                if not both_nan:
                    assert expected_val == actual_val
    finally:
        # Release the file handle even when an assertion above fails.
        file.close()
|
|
|
|
|
|
def test_csv_reader_index_column():
    """With ``index_column="Name"`` the resulting index is named ``Name``."""
    options = CSVReaderOptions(index_column="Name")
    reader = CSVReader(options=options)

    frame = reader.file_to_dataframe(create_csv_file(CSV_DATA))

    assert frame.index.name == "Name"
|
|
|
|
|
|
def test_csv_reader_wrong_index_column():
    """An ``index_column`` absent from the file raises ``DatabaseUploadFailed``."""
    options = CSVReaderOptions(index_column="wrong")
    reader = CSVReader(options=options)
    upload = create_csv_file(CSV_DATA)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(upload)

    message = str(exc_info.value)
    assert message == "Parsing error: Index wrong invalid"
|
|
|
|
|
|
def test_csv_reader_broken_file_no_columns():
    """A file with no parsable columns raises ``DatabaseUploadFailed``."""
    reader = CSVReader(options=CSVReaderOptions())
    upload = create_csv_file([""])

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(upload)

    message = str(exc_info.value)
    assert message == "Parsing error: No columns to parse from file"
|
|
|
|
|
|
def test_csv_reader_wrong_columns_to_read():
    """Requesting a column that is not in the file raises a parsing error."""
    reader = CSVReader(options=CSVReaderOptions(columns_read=["xpto"]))
    expected_message = (
        "Parsing error: Usecols do not match columns, "
        "columns expected but not found: ['xpto']"
    )

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(CSV_DATA))

    assert str(exc_info.value) == expected_message
|
|
|
|
|
|
def test_csv_reader_invalid_file():
    """A row with more fields than the header surfaces the tokenizer error."""
    reader = CSVReader(options=CSVReaderOptions())
    # Line 3 carries seven fields although the header declares three.
    stream = io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3")
    upload = FileStorage(stream, filename="")
    expected_message = (
        "Parsing error: Error tokenizing data. C error:"
        " Expected 3 fields in line 3, saw 7\n"
    )

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(upload)

    assert str(exc_info.value) == expected_message
|
|
|
|
|
|
def test_csv_reader_invalid_encoding():
    """Bytes that are not valid UTF-8 are reported as a parsing error."""
    reader = CSVReader(options=CSVReaderOptions())
    # 0xba at byte offset 21 is not a valid UTF-8 start byte.
    payload = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
    upload = FileStorage(io.BytesIO(payload))
    expected_message = (
        "Parsing error: 'utf-8' codec can't decode byte 0xba in"
        " position 21: invalid start byte"
    )

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(upload)

    assert str(exc_info.value) == expected_message
|
|
|
|
|
|
def test_csv_reader_file_metadata():
    """``file_metadata`` lists the column names for default and ``|`` delimiters."""
    expected_metadata = {
        "items": [
            {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
        ]
    }

    # First pass uses the defaults, second pass a pipe-delimited file; the
    # same keyword arguments drive both the fixture and the reader options.
    for delimiter_kwargs in ({}, {"delimiter": "|"}):
        reader = CSVReader(
            options=CSVReaderOptions(**delimiter_kwargs),
        )
        upload = create_csv_file(CSV_DATA, **delimiter_kwargs)
        result = reader.file_metadata(upload)
        assert result == expected_metadata
        upload.close()
|
|
|
|
|
|
def test_csv_reader_file_metadata_invalid_file():
    """``file_metadata`` reports the tokenizer error for a malformed file."""
    reader = CSVReader(options=CSVReaderOptions())
    # Line 3 carries seven fields although the header declares three.
    upload = FileStorage(io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"))
    expected_message = (
        "Parsing error: Error tokenizing data. C error:"
        " Expected 3 fields in line 3, saw 7\n"
    )

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_metadata(upload)

    assert str(exc_info.value) == expected_message
|
|
|
|
|
|
def test_csv_reader_integer_in_float_column():
    """An integer value in a column typed as ``float`` still yields float64."""
    rows = [
        ["Name", "Score", "City"],
        ["name1", 25.5, "city1"],
        ["name2", 25, "city2"],
    ]
    options = CSVReaderOptions(column_data_types={"Score": "float"})
    reader = CSVReader(options=options)

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (2, 3)
    assert frame["Score"].dtype == "float64"
|
|
|
|
|
|
def test_csv_reader_object_type_auto_inferring():
    """Without explicit types, a column mixing numbers and text becomes object."""
    # No error is expected for this input even though "id" is heterogeneous.
    rows = [
        ["Name", "id", "City"],
        ["name1", 25.5, "city1"],
        ["name2", 15, "city2"],
        ["name3", 123456789086, "city3"],
        ["name4", "abc", "city4"],
        ["name5", 4.75, "city5"],
    ]
    reader = CSVReader()

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (5, 3)
    # With no column_data_types given the dtype is inferred; a single
    # non-numeric cell ("abc") makes the whole column object-typed.
    assert frame["id"].dtype == "object"
|
|
|
|
|
|
def test_csv_reader_float_type_auto_inferring():
    """Without explicit types, one fractional value makes the column float64."""
    rows = [
        ["Name", "id", "City"],
        ["name1", "25", "city1"],
        ["name2", "15", "city2"],
        ["name3", "123456789086", "city3"],
        ["name5", "4.75", "city5"],
    ]
    reader = CSVReader()

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (4, 3)
    # The "4.75" cell is what pushes the inferred dtype to float.
    assert frame["id"].dtype == "float64"
|
|
|
|
|
|
def test_csv_reader_int_type_auto_inferring():
    """Without explicit types, an all-integer column is inferred as int64."""
    rows = [
        ["Name", "id", "City"],
        ["name1", "0", "city1"],
        ["name2", "15", "city2"],
        ["name3", "123456789086", "city3"],
        ["name5", "45", "city5"],
    ]
    reader = CSVReader()

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (4, 3)
    assert frame["id"].dtype == "int64"
|
|
|
|
|
|
def test_csv_reader_bigint_type_auto_inferring():
    """Values at both ends of the signed 64-bit range are kept exactly as int64."""
    largest = 9223372036854775807
    smallest = -9223372036854775808
    rows = [
        ["Name", "id", "City"],
        ["name1", "9223372036854775807", "city1"],
        ["name2", "9223372036854775806", "city2"],
        ["name3", "1234567890123456789", "city3"],
        ["name4", "0", "city4"],
        ["name5", "-9223372036854775808", "city5"],
    ]
    reader = CSVReader()

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (5, 3)
    assert frame["id"].dtype == "int64"
    assert frame.iloc[0]["id"] == largest
    assert frame.iloc[4]["id"] == smallest
|
|
|
|
|
|
def test_csv_reader_int_typing():
    """Declaring a column as ``int`` produces an int64 column."""
    rows = [
        ["Name", "id", "City"],
        ["name1", "0", "city1"],
        ["name2", "15", "city2"],
        ["name3", "123456789086", "city3"],
        ["name5", "45", "city5"],
    ]
    options = CSVReaderOptions(column_data_types={"id": "int"})
    reader = CSVReader(options=options)

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (4, 3)
    assert frame["id"].dtype == "int64"
|
|
|
|
|
|
def test_csv_reader_float_typing():
    """Declaring a column as ``float`` produces a float64 column."""
    rows = [
        ["Name", "score", "City"],
        ["name1", "0", "city1"],
        ["name2", "15.3", "city2"],
        ["name3", "45", "city3"],
        ["name5", "23.1342", "city5"],
    ]
    options = CSVReaderOptions(column_data_types={"score": "float"})
    reader = CSVReader(options=options)

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (4, 3)
    assert frame["score"].dtype == "float64"
|
|
|
|
|
|
def test_csv_reader_multiple_errors_display():
    """Test that multiple errors are displayed with proper formatting.

    Four ``Age`` cells cannot be converted to ``int64``; each must be listed
    with its file line number and no truncation suffix may be appended.
    """
    csv_data = [
        ["Name", "Age", "Score"],
        ["Alice", "25", "95.5"],
        ["Bob", "invalid1", "87.2"],
        ["Charlie", "invalid2", "92.1"],
        ["Diana", "invalid3", "88.5"],
        ["Eve", "invalid4", "90.0"],
        ["Frank", "30", "85.5"],
    ]

    csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))

    with pytest.raises(DatabaseUploadFailed) as ex:
        csv_reader.file_to_dataframe(create_csv_file(csv_data))

    error_msg = str(ex.value)
    assert "Cannot convert column 'Age' to int64" in error_msg
    assert "Found 4 error(s):" in error_msg
    assert "Line 3: 'invalid1' cannot be converted to int64" in error_msg
    assert "Line 4: 'invalid2' cannot be converted to int64" in error_msg
    assert "Line 5: 'invalid3' cannot be converted to int64" in error_msg
    assert "Line 6: 'invalid4' cannot be converted to int64" in error_msg
    # Four errors are below the display limit (the reader is expected to show
    # up to 5 — confirm against MAX_DISPLAYED_ERRORS in csv_reader), so the
    # "... more error(s)" truncation suffix must be absent.
    assert "more error(s)" not in error_msg
|
|
|
|
|
|
def test_csv_reader_non_numeric_in_integer_column():
    """A non-numeric cell in an ``int64`` column is reported with its line."""
    rows = [
        ["Name", "Age", "City"],
        ["name1", "abc", "city1"],
        ["name2", "25", "city2"],
    ]
    options = CSVReaderOptions(column_data_types={"Age": "int64"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Age' to int64",
        "Found 1 error(s):",
        "Line 2: 'abc' cannot be converted to int64",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_non_numeric_in_float_column():
    """A non-numeric cell in a ``float64`` column is reported with its line."""
    rows = [
        ["Name", "Score", "City"],
        ["name1", "5.3", "city1"],
        ["name2", "25.5", "city2"],
        ["name3", "24.5", "city3"],
        ["name4", "1.0", "city4"],
        ["name5", "one point five", "city5"],
    ]
    options = CSVReaderOptions(column_data_types={"Score": "float64"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Score' to float64",
        "Found 1 error(s):",
        "Line 6: 'one point five' cannot be converted to float64",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_improved_error_detection_int32():
    """Casting to ``int32`` reports the offending value and its file line."""
    rows = [
        ["Name", "ID", "City"],
        ["name1", "123", "city1"],
        ["name2", "456", "city2"],
        ["name3", "not_a_number", "city3"],
        ["name4", "789", "city4"],
    ]
    options = CSVReaderOptions(column_data_types={"ID": "int32"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'ID' to int32",
        "Found 1 error(s):",
        "Line 4: 'not_a_number' cannot be converted to int32",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_improved_error_detection_float32():
    """Casting to ``float32`` reports the offending value and its file line."""
    rows = [
        ["Name", "Score", "City"],
        ["name1", "1.5", "city1"],
        ["name2", "2.7", "city2"],
        ["name3", "invalid_float", "city3"],
    ]
    options = CSVReaderOptions(column_data_types={"Score": "float32"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Score' to float32",
        "Found 1 error(s):",
        "Line 4: 'invalid_float' cannot be converted to float32",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_error_detection_with_header_row():
    """Reported line numbers stay file-relative when ``header_row`` is 1."""
    rows = [
        ["skip_this_row", "skip", "skip"],
        ["Name", "Age", "City"],
        ["name1", "25", "city1"],
        ["name2", "invalid_age", "city2"],
    ]
    options = CSVReaderOptions(header_row=1, column_data_types={"Age": "int"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Age' to int",
        "Found 1 error(s):",
        "Line 4: 'invalid_age' cannot be converted to int",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_error_detection_first_row_error():
    """An invalid value in the first data row is reported as line 2."""
    rows = [
        ["Name", "Age", "City"],
        ["name1", "not_a_number", "city1"],
        ["name2", "25", "city2"],
    ]
    options = CSVReaderOptions(column_data_types={"Age": "int64"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Age' to int64",
        "Found 1 error(s):",
        "Line 2: 'not_a_number' cannot be converted to int64",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_error_detection_missing_column():
    """A type declared for a column absent from the file does not raise."""
    rows = [
        ["Name", "City"],
        ["name1", "city1"],
        ["name2", "city2"],
    ]
    # "NonExistent" is not one of the file's columns.
    options = CSVReaderOptions(column_data_types={"NonExistent": "int64"})
    reader = CSVReader(options=options)

    # The read is expected to succeed and leave the real columns untouched.
    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (2, 2)
    assert frame.columns.tolist() == ["Name", "City"]
|
|
|
|
|
|
def test_csv_reader_error_detection_mixed_valid_invalid():
    """One invalid cell among valid floats is the only error reported."""
    rows = [
        ["Name", "Score", "City"],
        ["name1", "95.5", "city1"],
        ["name2", "87.2", "city2"],
        ["name3", "92.1", "city3"],
        ["name4", "eighty-five", "city4"],
        ["name5", "78.9", "city5"],
    ]
    options = CSVReaderOptions(column_data_types={"Score": "float64"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Score' to float64",
        "Found 1 error(s):",
        "Line 5: 'eighty-five' cannot be converted to float64",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_error_detection_multiple_invalid_values():
    """Six invalid cells: the first five are listed, the sixth is summarised."""
    rows = [
        ["Name", "Score", "City"],
        ["name1", "95.5", "city1"],
        ["name2", "87.2", "city2"],
        ["name3", "92.1", "city3"],
        ["name4", "eighty-five", "city4"],
        ["name4", "eighty-one", "city4"],
        ["name4", "eighty", "city4"],
        ["name4", "one", "city4"],
        ["name4", "two", "city4"],
        ["name4", "three", "city4"],
        ["name5", "78.9", "city5"],
    ]
    options = CSVReaderOptions(column_data_types={"Score": "float64"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Score' to float64",
        "Found 6 error(s):",
        "Line 5: 'eighty-five' cannot be converted to float64",
        "Line 6: 'eighty-one' cannot be converted to float64",
        "Line 7: 'eighty' cannot be converted to float64",
        "Line 8: 'one' cannot be converted to float64",
        "Line 9: 'two' cannot be converted to float64",
        "and 1 more error(s)",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_error_detection_non_numeric_types():
    """Test error detection for non-numeric type casting.

    The ``Status`` column is cast to ``string``. Either outcome is accepted:
    the cast succeeds and the column has the ``string`` dtype, or the reader
    raises ``DatabaseUploadFailed`` with a message naming the column.
    """
    csv_data = [
        ["Name", "Status", "City"],
        ["name1", "active", "city1"],
        ["name2", "inactive", "city2"],
        ["name3", 123, "city3"],  # numeric-looking value in a string column
    ]

    csv_reader = CSVReader(
        options=CSVReaderOptions(column_data_types={"Status": "string"})
    )

    # For non-numeric types the behavior may differ between pandas versions,
    # so both the success path and the failure path are validated.
    try:
        df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
    except DatabaseUploadFailed as ex:
        # ``ex`` is the exception itself here (not a pytest ExceptionInfo),
        # so its message comes from ``str(ex)`` rather than ``ex.value``.
        error_msg = str(ex)
        assert "Cannot convert" in error_msg
        assert "Status" in error_msg
    else:
        # No error raised: the conversion succeeded.
        assert df["Status"].dtype == "string"
|
|
|
|
|
|
def test_csv_reader_error_detection_with_null_values():
    """An empty cell is not counted as an error; only the invalid text is."""
    rows = [
        ["Name", "Age", "City"],
        ["name1", "25", "city1"],
        ["name2", "", "city2"],
        ["name3", "invalid_age", "city3"],
    ]
    options = CSVReaderOptions(column_data_types={"Age": "int64"})
    reader = CSVReader(options=options)

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(create_csv_file(rows))

    message = str(exc_info.value)
    for fragment in (
        "Cannot convert column 'Age' to int64",
        "Found 1 error(s):",
        "Line 4: 'invalid_age' cannot be converted to int64",
    ):
        assert fragment in message
|
|
|
|
|
|
def test_csv_reader_successful_numeric_conversion():
    """Valid numeric text is cast to the requested int64/float64/int32 dtypes."""
    rows = [
        ["Name", "Age", "Score", "ID"],
        ["name1", "25", "95.5", "1001"],
        ["name2", "30", "87.2", "1002"],
        ["name3", "35", "92.1", "1003"],
    ]
    requested_types = {
        "Age": "int64",
        "Score": "float64",
        "ID": "int32",
    }
    reader = CSVReader(options=CSVReaderOptions(column_data_types=requested_types))

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (3, 4)
    # Each column must carry exactly the dtype that was requested for it.
    assert frame["Age"].dtype == "int64"
    assert frame["Score"].dtype == "float64"
    assert frame["ID"].dtype == "int32"
    # Spot-check the first data row.
    assert frame.iloc[0]["Age"] == 25
    assert frame.iloc[0]["Score"] == 95.5
    assert frame.iloc[0]["ID"] == 1001
|
|
|
|
|
|
def test_csv_reader_successful_string_conversion_with_floats():
    """Large integers and a float cast to ``str`` keep their textual form."""
    rows = [
        ["id"],
        [1439403621518935563],
        [42286989],
        [1413660691875593351],
        [8.26839e17],
    ]
    expected_ids = [
        "1439403621518935563",
        "42286989",
        "1413660691875593351",
        "8.26839e+17",
    ]
    options = CSVReaderOptions(
        column_data_types={
            "id": "str",
        }
    )
    reader = CSVReader(options=options)

    frame = reader.file_to_dataframe(create_csv_file(rows))

    assert frame.shape == (4, 1)
    assert frame["id"].dtype == "object"
    for position, expected in enumerate(expected_ids):
        assert frame.iloc[position]["id"] == expected
|
|
|
|
|
|
def test_csv_reader_error_detection_improvements_summary():
|
|
csv_data_with_custom_header = [
|
|
["metadata_row", "skip", "this"],
|
|
["Name", "Age", "Score"],
|
|
["Alice", "25", "95.5"],
|
|
["Bob", "invalid_age", "87.2"],
|
|
["Charlie", "30", "92.1"],
|
|
]
|
|
|
|
csv_reader = CSVReader(
|
|
options=CSVReaderOptions(
|
|
header_row=1, column_data_types={"Age": "int64", "Score": "float64"}
|
|
)
|
|
)
|
|
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
csv_reader.file_to_dataframe(create_csv_file(csv_data_with_custom_header))
|
|
|
|
error_msg = str(ex.value)
|
|
assert "Cannot convert column 'Age' to int64" in error_msg
|
|
assert "Found 1 error(s):" in error_msg
|
|
assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg
|
|
|
|
# Test case 2: Multiple type errors - Age comes first alphabetically
|
|
csv_data_multiple_errors = [
|
|
["Name", "Age", "Score"],
|
|
["Alice", "25", "95.5"],
|
|
["Bob", "invalid_age", "invalid_score"], # Error in both columns (line 3)
|
|
["Charlie", "30", "92.1"],
|
|
]
|
|
|
|
csv_reader = CSVReader(
|
|
options=CSVReaderOptions(column_data_types={"Age": "int64", "Score": "float64"})
|
|
)
|
|
|
|
with pytest.raises(DatabaseUploadFailed) as ex:
|
|
csv_reader.file_to_dataframe(create_csv_file(csv_data_multiple_errors))
|
|
|
|
error_msg = str(ex.value)
|
|
# Should catch the Age error first (Age comes before Score alphabetically)
|
|
assert "Cannot convert column 'Age' to int64" in error_msg
|
|
assert "Found 1 error(s):" in error_msg
|
|
assert "Line 3: 'invalid_age' cannot be converted to int64" in error_msg
|