mirror of
https://github.com/apache/superset.git
synced 2026-04-18 23:55:00 +00:00
feat: new Columnar upload form and API (#28192)
This commit is contained in:
committed by
GitHub
parent
f5843fe588
commit
9a339f08a7
253
tests/unit_tests/commands/databases/columnar_reader_test.py
Normal file
253
tests/unit_tests/commands/databases/columnar_reader_test.py
Normal file
@@ -0,0 +1,253 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import io
|
||||
import tempfile
|
||||
from typing import Any
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
from superset.commands.database.exceptions import DatabaseUploadFailed
|
||||
from superset.commands.database.uploaders.columnar_reader import (
|
||||
ColumnarReader,
|
||||
ColumnarReaderOptions,
|
||||
)
|
||||
from tests.unit_tests.fixtures.common import create_columnar_file
|
||||
|
||||
# Baseline fixture: three rows with string, integer, and ISO-date-string columns.
COLUMNAR_DATA: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [30, 25, 20],
    "City": ["city1", "city2", "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}

# Variant with nulls: a missing Age (becomes NaN in a numeric column) and a
# missing City (stays None in an object/string column).
COLUMNAR_WITH_NULLS: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [None, 25, 20],
    "City": ["city1", None, "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}


# Variant with a float Age column, to exercise float round-tripping.
COLUMNAR_WITH_FLOATS: dict[str, list[Any]] = {
    "Name": ["name1", "name2", "name3"],
    "Age": [30.1, 25.1, 20.1],
    "City": ["city1", "city2", "city3"],
    "Birth": ["1990-02-01", "1995-02-01", "2000-02-01"],
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "file, options, expected_cols, expected_values",
    [
        (
            create_columnar_file(COLUMNAR_DATA),
            ColumnarReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_DATA),
            ColumnarReaderOptions(
                columns_read=["Name", "Age"],
            ),
            ["Name", "Age"],
            [
                ["name1", 30],
                ["name2", 25],
                ["name3", 20],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_DATA),
            ColumnarReaderOptions(
                columns_read=[],
            ),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30, "city1", "1990-02-01"],
                ["name2", 25, "city2", "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_WITH_NULLS),
            ColumnarReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", np.nan, "city1", "1990-02-01"],
                ["name2", 25, None, "1995-02-01"],
                ["name3", 20, "city3", "2000-02-01"],
            ],
        ),
        (
            create_columnar_file(COLUMNAR_WITH_FLOATS),
            ColumnarReaderOptions(),
            ["Name", "Age", "City", "Birth"],
            [
                ["name1", 30.1, "city1", "1990-02-01"],
                ["name2", 25.1, "city2", "1995-02-01"],
                ["name3", 20.1, "city3", "2000-02-01"],
            ],
        ),
    ],
)
def test_columnar_reader_file_to_dataframe(
    file, options, expected_cols, expected_values
):
    """ColumnarReader.file_to_dataframe honors columns_read and preserves values.

    An empty columns_read list means "all columns". Nulls in numeric columns
    surface as NaN; nulls in string columns surface as None.
    """
    reader = ColumnarReader(
        options=options,
    )
    df = reader.file_to_dataframe(file)
    assert df.columns.tolist() == expected_cols

    for expected_row, actual_row in zip(expected_values, df.values.tolist()):
        for expected_val, actual_val in zip(expected_row, actual_row):
            if isinstance(expected_val, float) and np.isnan(expected_val):
                # NaN != NaN, so equality cannot be used for missing numerics.
                assert isinstance(actual_val, float) and np.isnan(actual_val)
            else:
                # BUG FIX: the original compared only np.isnan() flags whenever
                # BOTH values were floats, so two different non-NaN floats
                # (e.g. 30.1 vs 99.9) always passed — the COLUMNAR_WITH_FLOATS
                # case never actually checked the values. Compare for real.
                assert expected_val == actual_val
    file.close()
|
||||
|
||||
|
||||
def test_excel_reader_wrong_columns_to_read():
    """Requesting a column absent from the parquet file raises DatabaseUploadFailed.

    The message is pyarrow's FieldRef error, which also lists the dataset's
    internal bookkeeping columns (__fragment_index, __batch_index, ...).
    """
    # NOTE(review): despite the "excel" in the name this exercises
    # ColumnarReader — almost certainly a copy/paste misnomer from the excel
    # test suite; the name is kept so test selection/CI references still match.
    reader = ColumnarReader(
        options=ColumnarReaderOptions(columns_read=["xpto"]),
    )
    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(create_columnar_file(COLUMNAR_DATA))
    # BUG FIX: the original asserted `x == (A) != (B)` — a chained comparison
    # that dragged in an unrelated excel-reader message and added no value.
    # Assert the single expected message instead.
    assert str(ex.value) == (
        "Parsing error: No match for FieldRef.Name(xpto) in Name: string\n"
        "Age: int64\n"
        "City: string\n"
        "Birth: string\n"
        "__fragment_index: int32\n"
        "__batch_index: int32\n"
        "__last_in_fragment: bool\n"
        "__filename: string"
    )
|
||||
|
||||
|
||||
def test_columnar_reader_invalid_file():
    """A tiny non-parquet payload surfaces pyarrow's footer error as DatabaseUploadFailed."""
    bogus_upload = FileStorage(io.BytesIO(b"c1"), "test.parquet")
    reader = ColumnarReader(options=ColumnarReaderOptions())

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(bogus_upload)

    expected_message = (
        "Parsing error: Could not open Parquet input source '<Buffer>': Parquet file "
        "size is 2 bytes, smaller than the minimum file footer (8 bytes)"
    )
    assert str(ex.value) == expected_message
|
||||
|
||||
|
||||
def test_columnar_reader_zip():
    """A ZIP containing two parquet files yields their rows concatenated in order."""
    reader = ColumnarReader(
        options=ColumnarReaderOptions(),
    )
    file1 = create_columnar_file(COLUMNAR_DATA, "test1.parquet")
    file2 = create_columnar_file(COLUMNAR_DATA, "test2.parquet")

    # BUG FIX: the original used three NamedTemporaryFile(delete=False)
    # handles and never unlinked them, leaking temp files on every run.
    # Building the ZIP entirely in memory avoids the filesystem altogether.
    buffer = io.BytesIO()
    with ZipFile(buffer, "w") as zip_file:
        zip_file.writestr("test1.parquet", file1.read())
        zip_file.writestr("test2.parquet", file2.read())
    buffer.seek(0)  # rewind so the reader sees the archive from the start

    df = reader.file_to_dataframe(FileStorage(buffer, "test.zip"))
    assert df.columns.tolist() == ["Name", "Age", "City", "Birth"]
    # Same three rows twice: once per archive member.
    assert df.values.tolist() == [
        ["name1", 30, "city1", "1990-02-01"],
        ["name2", 25, "city2", "1995-02-01"],
        ["name3", 20, "city3", "2000-02-01"],
    ] * 2
|
||||
|
||||
|
||||
def test_columnar_reader_bad_parquet_in_zip():
    """A ZIP whose members are not parquet raises with pyarrow's magic-bytes error."""
    reader = ColumnarReader(
        options=ColumnarReaderOptions(),
    )
    # BUG FIX: the original wrote the archive to NamedTemporaryFile(delete=False)
    # and never removed it, leaking a temp file per run. An in-memory buffer
    # behaves identically for the reader and needs no cleanup.
    buffer = io.BytesIO()
    with ZipFile(buffer, "w") as zip_file:
        zip_file.writestr("test1.parquet", b"bad parquet file")
        zip_file.writestr("test2.parquet", b"bad parquet file")
    buffer.seek(0)  # rewind so the reader sees the archive from the start

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(FileStorage(buffer, "test.zip"))
    assert str(ex.value) == (
        "Parsing error: Could not open Parquet input source '<Buffer>': "
        "Parquet magic bytes not found in footer. "
        "Either the file is corrupted or this is not a parquet file."
    )
|
||||
|
||||
|
||||
def test_columnar_reader_bad_zip():
    """Arbitrary bytes with a .zip filename raise a "Not a valid ZIP file" error."""
    garbage_upload = FileStorage(io.BytesIO(b"bad zip file"), "test.zip")
    reader = ColumnarReader(options=ColumnarReaderOptions())

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(garbage_upload)
    assert str(ex.value) == "Not a valid ZIP file"
|
||||
|
||||
|
||||
def test_columnar_reader_metadata():
    """file_metadata reports the parquet column names; sheet_name is always None."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    upload = create_columnar_file(COLUMNAR_DATA)

    metadata = reader.file_metadata(upload)
    first_item = metadata["items"][0]

    assert sorted(first_item["column_names"]) == ["Age", "Birth", "City", "Name"]
    # Parquet has no worksheet concept, so the sheet slot stays empty.
    assert first_item["sheet_name"] is None
|
||||
|
||||
|
||||
def test_columnar_reader_metadata_invalid_file():
    """file_metadata on a non-parquet payload raises with pyarrow's footer message."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    bogus_upload = FileStorage(io.BytesIO(b"c1"), "test.parquet")

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_metadata(bogus_upload)

    expected_message = (
        "Parsing error: Parquet file size is 2 bytes, "
        "smaller than the minimum file footer (8 bytes)"
    )
    assert str(ex.value) == expected_message
|
||||
@@ -19,6 +19,7 @@ from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
from superset.commands.database.exceptions import DatabaseUploadFailed
|
||||
from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
|
||||
@@ -265,6 +266,23 @@ def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_val
|
||||
file.close()
|
||||
|
||||
|
||||
def test_csv_reader_index_column():
    """index_column promotes the named CSV column to the dataframe index."""
    reader = CSVReader(options=CSVReaderOptions(index_column="Name"))
    frame = reader.file_to_dataframe(create_csv_file(CSV_DATA))
    assert frame.index.name == "Name"
|
||||
|
||||
|
||||
def test_csv_reader_wrong_index_column():
    """An index_column missing from the file raises a parsing error naming it."""
    reader = CSVReader(options=CSVReaderOptions(index_column="wrong"))

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(create_csv_file(CSV_DATA))
    assert str(ex.value) == "Parsing error: Index wrong invalid"
|
||||
|
||||
|
||||
def test_csv_reader_broken_file_no_columns():
|
||||
csv_reader = CSVReader(
|
||||
options=CSVReaderOptions(),
|
||||
@@ -292,7 +310,9 @@ def test_csv_reader_invalid_file():
|
||||
)
|
||||
with pytest.raises(DatabaseUploadFailed) as ex:
|
||||
csv_reader.file_to_dataframe(
|
||||
io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3")
|
||||
FileStorage(
|
||||
io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"), filename=""
|
||||
)
|
||||
)
|
||||
assert str(ex.value) == (
|
||||
"Parsing error: Error tokenizing data. C error:"
|
||||
@@ -306,8 +326,48 @@ def test_csv_reader_invalid_encoding():
|
||||
)
|
||||
binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
|
||||
with pytest.raises(DatabaseUploadFailed) as ex:
|
||||
csv_reader.file_to_dataframe(io.BytesIO(binary_data))
|
||||
csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
|
||||
assert str(ex.value) == (
|
||||
"Parsing error: 'utf-8' codec can't decode byte 0xba in"
|
||||
" position 21: invalid start byte"
|
||||
)
|
||||
|
||||
|
||||
def test_csv_reader_file_metadata():
    """file_metadata reports column names for the default and a custom delimiter."""
    expected_metadata = {
        "items": [
            {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
        ]
    }

    # Default delimiter.
    upload = create_csv_file(CSV_DATA)
    reader = CSVReader(options=CSVReaderOptions())
    assert reader.file_metadata(upload) == expected_metadata
    upload.close()

    # Pipe delimiter: same columns as long as reader and file agree.
    upload = create_csv_file(CSV_DATA, delimiter="|")
    reader = CSVReader(options=CSVReaderOptions(delimiter="|"))
    assert reader.file_metadata(upload) == expected_metadata
    upload.close()
|
||||
|
||||
|
||||
def test_csv_reader_file_metadata_invalid_file():
    """A ragged CSV (7 fields on a 3-column file) fails metadata extraction."""
    reader = CSVReader(options=CSVReaderOptions())
    ragged_upload = FileStorage(io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"))

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_metadata(ragged_upload)

    assert str(ex.value) == (
        "Parsing error: Error tokenizing data. C error:"
        " Expected 3 fields in line 3, saw 7\n"
    )
|
||||
|
||||
@@ -20,6 +20,9 @@ from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import xlsxwriter
|
||||
from werkzeug.datastructures import FileStorage
|
||||
from xlsxwriter.workbook import Worksheet
|
||||
|
||||
from superset.commands.database.exceptions import DatabaseUploadFailed
|
||||
from superset.commands.database.uploaders.excel_reader import (
|
||||
@@ -50,6 +53,18 @@ EXCEL_DATA_DECIMAL_CHAR = {
|
||||
}
|
||||
|
||||
|
||||
def write_data_to_worksheet(
    worksheet: "Worksheet", header: list[str], data: list[list[Any]]
):
    """Write *header* followed by the *data* rows into *worksheet*, starting at A1.

    Generalized: the original unpacked every row as exactly ``name, age`` and
    therefore crashed on anything other than two-column data, despite the
    generic ``list[list[Any]]`` signature. This version writes any width.
    """
    for row_idx, row_values in enumerate([header] + data):
        for col_idx, value in enumerate(row_values):
            worksheet.write(row_idx, col_idx, value)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"file, options, expected_cols, expected_values",
|
||||
[
|
||||
@@ -175,6 +190,23 @@ def test_excel_reader_file_to_dataframe(file, options, expected_cols, expected_v
|
||||
file.close()
|
||||
|
||||
|
||||
def test_excel_reader_index_column():
    """index_column promotes the named Excel column to the dataframe index."""
    reader = ExcelReader(options=ExcelReaderOptions(index_column="Name"))
    frame = reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
    assert frame.index.name == "Name"
|
||||
|
||||
|
||||
def test_excel_reader_wrong_index_column():
    """A missing index_column raises a parsing error that names the sheet."""
    reader = ExcelReader(options=ExcelReaderOptions(index_column="wrong"))

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_to_dataframe(create_excel_file(EXCEL_DATA))
    assert str(ex.value) == "Parsing error: Index wrong invalid (sheet: 0)"
|
||||
|
||||
|
||||
def test_excel_reader_wrong_columns_to_read():
|
||||
excel_reader = ExcelReader(
|
||||
options=ExcelReaderOptions(columns_read=["xpto"]),
|
||||
@@ -203,7 +235,60 @@ def test_excel_reader_invalid_file():
|
||||
options=ExcelReaderOptions(),
|
||||
)
|
||||
with pytest.raises(DatabaseUploadFailed) as ex:
|
||||
excel_reader.file_to_dataframe(io.StringIO("c1"))
|
||||
excel_reader.file_to_dataframe(FileStorage(io.BytesIO(b"c1")))
|
||||
assert str(ex.value) == (
|
||||
"Parsing error: Excel file format cannot be determined, you must specify an engine manually."
|
||||
)
|
||||
|
||||
|
||||
def test_excel_reader_metadata():
    """file_metadata of a single-sheet workbook lists its columns and sheet name."""
    reader = ExcelReader(options=ExcelReaderOptions())
    upload = create_excel_file(EXCEL_DATA)

    metadata = reader.file_metadata(upload)
    assert metadata == {
        "items": [
            {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": "Sheet1"}
        ]
    }
    upload.close()
|
||||
|
||||
|
||||
def test_excel_reader_metadata_mul_sheets():
    """file_metadata returns one item per worksheet, in workbook order."""
    buffer = io.BytesIO()
    workbook = xlsxwriter.Workbook(buffer)

    # (sheet name, header row, data rows) for each worksheet in the fixture.
    sheet_specs = [
        ("Sheet1", ["col11", "col12"], [["v11", "v12"]]),
        ("Sheet2", ["col21", "col22"], [["v21", "v22"]]),
    ]
    for sheet_name, header, rows in sheet_specs:
        write_data_to_worksheet(workbook.add_worksheet(sheet_name), header, rows)
    workbook.close()

    upload = FileStorage(stream=buffer, filename="test.xls")

    reader = ExcelReader(options=ExcelReaderOptions())
    assert reader.file_metadata(upload) == {
        "items": [
            {"column_names": ["col11", "col12"], "sheet_name": "Sheet1"},
            {"column_names": ["col21", "col22"], "sheet_name": "Sheet2"},
        ]
    }
    upload.close()
|
||||
|
||||
|
||||
def test_excel_reader_file_metadata_invalid_file():
    """A one-byte payload cannot be identified as any Excel format."""
    reader = ExcelReader(options=ExcelReaderOptions())

    with pytest.raises(DatabaseUploadFailed) as ex:
        reader.file_metadata(FileStorage(io.BytesIO(b"1")))
    assert str(ex.value) == "Excel file format cannot be determined"
|
||||
|
||||
Reference in New Issue
Block a user