refactor: upload data unification, less permissions and less endpoints (#31959)

This commit is contained in:
Daniel Vaz Gaspar
2025-01-28 11:09:55 +00:00
committed by GitHub
parent 09c1987de4
commit 1b375b715c
14 changed files with 312 additions and 489 deletions

View File

@@ -45,6 +45,7 @@ from superset.commands.database.ssh_tunnel.exceptions import (
SSHTunnelInvalidCredentials,
SSHTunnelMissingCredentials,
)
from superset.commands.database.uploaders.base import UploadFileType
from superset.constants import PASSWORD_MASK
from superset.databases.types import ( # pylint:disable=unused-import
EncryptedDict, # noqa: F401
@@ -1081,20 +1082,22 @@ class DelimitedListField(fields.List):
) from exc
class BaseUploadFilePostSchema(Schema):
_extension_config_key = ""
class BaseUploadFilePostSchemaMixin(Schema):
@validates("file")
def validate_file_extension(self, file: FileStorage) -> None:
allowed_extensions = current_app.config["ALLOWED_EXTENSIONS"].intersection(
current_app.config[self._extension_config_key]
)
allowed_extensions = current_app.config["ALLOWED_EXTENSIONS"]
file_suffix = Path(file.filename).suffix
if not file_suffix or file_suffix[1:] not in allowed_extensions:
raise ValidationError([_("File extension is not allowed.")])
class BaseUploadPostSchema(BaseUploadFilePostSchema):
class UploadPostSchema(BaseUploadFilePostSchemaMixin):
type = fields.Enum(
UploadFileType,
required=True,
by_value=True,
metadata={"description": "File type to upload"},
)
already_exists = fields.String(
load_default="fail",
validate=OneOf(choices=("fail", "replace", "append")),
@@ -1123,43 +1126,26 @@ class BaseUploadPostSchema(BaseUploadFilePostSchema):
metadata={"description": "The name of the table to be created/appended"},
)
class ColumnarUploadPostSchema(BaseUploadPostSchema):
"""
Schema for Columnar Upload
"""
_extension_config_key = "COLUMNAR_EXTENSIONS"
# ------------
# CSV Schema
# ------------
file = fields.Raw(
required=True,
metadata={
"description": "The Columnar file to upload",
"type": "string",
"format": "binary",
},
)
class CSVUploadPostSchema(BaseUploadPostSchema):
"""
Schema for CSV Upload
"""
_extension_config_key = "CSV_EXTENSIONS"
file = fields.Raw(
required=True,
metadata={
"description": "The CSV file to upload",
"description": "The file to upload",
"type": "string",
"format": "text/csv",
},
)
delimiter = fields.String(metadata={"description": "The delimiter of the CSV file"})
delimiter = fields.String(
metadata={
"description": "[CSV only] The character used to separate values in the CSV"
" file (e.g., a comma, semicolon, or tab)."
}
)
column_data_types = fields.String(
metadata={
"description": "A dictionary with column names and "
"description": "[CSV only] A dictionary with column names and "
"their data types if you need to change "
"the defaults. Example: {'user_id':'int'}. "
"Check Python Pandas library for supported data types"
@@ -1167,57 +1153,69 @@ class CSVUploadPostSchema(BaseUploadPostSchema):
)
day_first = fields.Boolean(
metadata={
"description": "DD/MM format dates, international and European format"
"description": "[CSV only] DD/MM format dates, international and European"
" format"
}
)
skip_blank_lines = fields.Boolean(
metadata={"description": "Skip blank lines in the CSV file."}
metadata={"description": "[CSV only] Skip blank lines in the CSV file."}
)
skip_initial_space = fields.Boolean(
metadata={"description": "Skip spaces after delimiter."}
metadata={"description": "[CSV only] Skip spaces after delimiter."}
)
column_dates = DelimitedListField(
fields.String(),
metadata={
"description": "A list of column names that should be "
"description": "[CSV and Excel only] A list of column names that should be "
"parsed as dates. Example: date,timestamp"
},
)
decimal_character = fields.String(
metadata={
"description": "Character to recognize as decimal point. Default is '.'"
"description": "[CSV and Excel only] Character to recognize as decimal"
" point. Default is '.'"
}
)
header_row = fields.Integer(
metadata={
"description": "Row containing the headers to use as column names"
"(0 is first line of data). Leave empty if there is no header row."
"description": "[CSV and Excel only] Row containing the headers to use as"
" column names (0 is first line of data). Leave empty if"
" there is no header row."
}
)
index_column = fields.String(
metadata={
"description": "Column to use as the row labels of the dataframe. "
"Leave empty if no index column"
"description": "[CSV and Excel only] Column to use as the row labels of the"
" dataframe. Leave empty if no index column"
}
)
null_values = DelimitedListField(
fields.String(),
metadata={
"description": "A list of strings that should be treated as null. "
"Examples: '' for empty strings, 'None', 'N/A',"
"Warning: Hive database supports only a single value"
"description": "[CSV and Excel only] A list of strings that should be "
"treated as null. Examples: '' for empty strings, 'None',"
" 'N/A', Warning: Hive database supports only a single value"
},
)
rows_to_read = fields.Integer(
metadata={
"description": "Number of rows to read from the file. "
"description": "[CSV and Excel only] Number of rows to read from the file. "
"If None, reads all rows."
},
allow_none=True,
validate=Range(min=1),
)
skip_rows = fields.Integer(
metadata={"description": "Number of rows to skip at start of file."}
metadata={
"description": "[CSV and Excel only] Number of rows to skip at start"
" of file."
}
)
# Optional sheet selector; per its description it applies to Excel uploads only.
sheet_name = fields.String(
    metadata={
        "description": "[Excel only] Strings used for sheet names "
        "(default is the first sheet)."
    }
)
@post_load
@@ -1234,79 +1232,17 @@ class CSVUploadPostSchema(BaseUploadPostSchema):
return data
class ExcelUploadPostSchema(BaseUploadPostSchema):
class UploadFileMetadataPostSchema(BaseUploadFilePostSchemaMixin):
"""
Schema for Excel Upload
Schema for Upload file metadata.
"""
_extension_config_key = "EXCEL_EXTENSIONS"
file = fields.Raw(
type = fields.Enum(
UploadFileType,
required=True,
metadata={
"description": "The Excel file to upload",
"type": "string",
"format": "binary",
},
by_value=True,
metadata={"description": "File type to upload"},
)
sheet_name = fields.String(
metadata={
"description": "Strings used for sheet names "
"(default is the first sheet)."
}
)
column_dates = DelimitedListField(
fields.String(),
metadata={
"description": "A list of column names that should be "
"parsed as dates. Example: date,timestamp"
},
)
decimal_character = fields.String(
metadata={
"description": "Character to recognize as decimal point. Default is '.'"
}
)
header_row = fields.Integer(
metadata={
"description": "Row containing the headers to use as column names"
"(0 is first line of data). Leave empty if there is no header row."
}
)
index_column = fields.String(
metadata={
"description": "Column to use as the row labels of the dataframe. "
"Leave empty if no index column"
}
)
null_values = DelimitedListField(
fields.String(),
metadata={
"description": "A list of strings that should be treated as null. "
"Examples: '' for empty strings, 'None', 'N/A',"
"Warning: Hive database supports only a single value"
},
)
rows_to_read = fields.Integer(
metadata={
"description": "Number of rows to read from the file. "
"If None, reads all rows."
},
allow_none=True,
validate=Range(min=1),
)
skip_rows = fields.Integer(
metadata={"description": "Number of rows to skip at start of file."}
)
class CSVMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
"""
Schema for CSV metadata.
"""
_extension_config_key = "CSV_EXTENSIONS"
file = fields.Raw(
required=True,
metadata={
@@ -1315,30 +1251,12 @@ class CSVMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
"format": "binary",
},
)
delimiter = fields.String(metadata={"description": "The delimiter of the CSV file"})
header_row = fields.Integer(
delimiter = fields.String(
metadata={
"description": "Row containing the headers to use as column names"
"(0 is first line of data). Leave empty if there is no header row."
"description": "The character used to separate values in the CSV file"
" (e.g., a comma, semicolon, or tab)."
}
)
class ExcelMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
"""
Schema for Excel metadata.
"""
_extension_config_key = "EXCEL_EXTENSIONS"
file = fields.Raw(
required=True,
metadata={
"description": "The file to upload",
"type": "string",
"format": "binary",
},
)
header_row = fields.Integer(
metadata={
"description": "Row containing the headers to use as column names"
@@ -1347,23 +1265,6 @@ class ExcelMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
)
class ColumnarMetadataUploadFilePostSchema(BaseUploadFilePostSchema):
"""
Schema for Columnar metadata.
"""
_extension_config_key = "COLUMNAR_EXTENSIONS"
file = fields.Raw(
required=True,
metadata={
"description": "The file to upload",
"type": "string",
"format": "binary",
},
)
class UploadFileMetadataItemSchema(Schema):
sheet_name = fields.String(metadata={"description": "The name of the sheet"})
column_names = fields.List(