mirror of
https://github.com/apache/superset.git
synced 2026-04-21 09:04:38 +00:00
feat: support nulls in the csv uploads (#10208)
* Support more table properties for the Hive upload; refactor; add tests, and refactor them to be pytest friendly; use lowercase table names; ignore isort. * Use SQL params. Co-authored-by: bogdan kyryliuk <bogdankyryliuk@dropbox.com>
This commit is contained in:
@@ -106,6 +106,45 @@ class HiveEngineSpec(PrestoEngineSpec):
|
||||
except pyhive.exc.ProgrammingError:
|
||||
return []
|
||||
|
||||
@classmethod
def get_create_table_stmt(  # pylint: disable=too-many-arguments
    cls,
    table: Table,
    schema_definition: str,
    location: str,
    delim: str,
    header_line_count: Optional[int],
    null_values: Optional[List[str]],
) -> text:
    """
    Build the parameterized Hive ``CREATE TABLE`` statement for a CSV upload.

    The statement is returned as a plain string containing ``:name`` bind
    markers together with the dict of values for those markers; the caller
    is expected to wrap the string (e.g. with ``text(sql)``) and execute it
    with ``**params``.

    NOTE(review): the return annotation says ``text`` but the function
    returns a ``(sql, params)`` tuple; the annotation is kept unchanged here
    so no new typing import is required.

    :param table: Table to create; rendered into the statement via ``str()``
    :param schema_definition: Column definitions placed between the parentheses
    :param location: Storage location of the uploaded file (bound as
        ``:location``)
    :param delim: Field delimiter (bound as ``:delim``)
    :param header_line_count: Zero-based index of the header row, or ``None``
        when there is no header; ``index + 1`` leading lines are skipped
    :param null_values: Strings to treat as NULL; only the first one is used
    :return: Tuple of the SQL string and its bind parameters
    """
    tblproperties = []
    # available options:
    # https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
    # TODO(bkyryliuk): figure out what to do with the skip rows field.
    params: Dict[str, str] = {
        "delim": delim,
        "location": location,
    }
    if header_line_count is not None and header_line_count >= 0:
        # The header index is zero-based, so skipping through the header row
        # means skipping index + 1 lines.
        skip_lines = header_line_count + 1
        # The bind marker must NOT be wrapped in quotes: the driver quotes
        # string parameters itself, and quoting here would yield ''N''.
        tblproperties.append("'skip.header.line.count'=:header_line_count")
        params["header_line_count"] = str(skip_lines)
    if null_values:
        # hive only supports 1 value for the null format
        tblproperties.append("'serialization.null.format'=:null_value")
        params["null_value"] = null_values[0]

    sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
        STORED AS TEXTFILE LOCATION :location"""
    if tblproperties:
        # Only emit the tblproperties clause when there is something to set.
        sql += f"\n        tblproperties ({', '.join(tblproperties)})"
    return sql, params
|
||||
|
||||
@classmethod
|
||||
def create_table_from_csv( # pylint: disable=too-many-arguments, too-many-locals
|
||||
cls,
|
||||
@@ -182,18 +221,17 @@ class HiveEngineSpec(PrestoEngineSpec):
|
||||
bucket_path,
|
||||
os.path.join(upload_prefix, table.table, os.path.basename(filename)),
|
||||
)
|
||||
sql = text(
|
||||
f"""CREATE TABLE {str(table)} ( {schema_definition} )
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
|
||||
STORED AS TEXTFILE LOCATION :location
|
||||
tblproperties ('skip.header.line.count'='1')"""
|
||||
|
||||
sql, params = cls.get_create_table_stmt(
|
||||
table,
|
||||
schema_definition,
|
||||
location,
|
||||
csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
|
||||
int(csv_to_df_kwargs.get("header", 0)),
|
||||
csv_to_df_kwargs.get("na_values"),
|
||||
)
|
||||
engine = cls.get_engine(database)
|
||||
engine.execute(
|
||||
sql,
|
||||
delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
|
||||
location=location,
|
||||
)
|
||||
engine.execute(text(sql), **params)
|
||||
|
||||
@classmethod
|
||||
def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
|
||||
|
||||
Reference in New Issue
Block a user