feat: support nulls in the csv uploads (#10208)

* Support more table properties for the hive upload

Refactor

Add tests, and refactor them to be pytest friendly

Use lowercase table names

Ignore isort

* Use sql params

Co-authored-by: bogdan kyryliuk <bogdankyryliuk@dropbox.com>
This commit is contained in:
Bogdan
2020-07-06 13:26:43 -07:00
committed by GitHub
parent 318e5347bc
commit 84f8a51458
10 changed files with 325 additions and 160 deletions

View File

@@ -106,6 +106,45 @@ class HiveEngineSpec(PrestoEngineSpec):
except pyhive.exc.ProgrammingError:
return []
@classmethod
def get_create_table_stmt(  # pylint: disable=too-many-arguments
    cls,
    table: Table,
    schema_definition: str,
    location: str,
    delim: str,
    header_line_count: Optional[int],
    null_values: Optional[List[str]],
) -> "Tuple[str, Dict[str, str]]":
    """
    Build the parameterized Hive ``CREATE TABLE`` statement for a CSV upload.

    The statement is returned as a plain string that contains named bind
    placeholders (``:delim``, ``:location`` and, when applicable,
    ``:header_line_count`` / ``:null_value``) together with the values to
    bind to them. The string is not wrapped in ``text()`` here.

    :param table: Table to create; rendered into the statement via ``str()``
    :param schema_definition: Column definitions placed between the parentheses
    :param location: Value bound to the ``LOCATION`` clause
    :param delim: Value bound to the ``FIELDS TERMINATED BY`` clause
    :param header_line_count: When not ``None`` and non-negative,
        ``header_line_count + 1`` is emitted as ``skip.header.line.count``;
        otherwise the property is omitted
    :param null_values: When non-empty, its first element is emitted as
        ``serialization.null.format``; the remaining elements are ignored
    :return: A ``(sql, params)`` tuple
    """
    # available options:
    # https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
    # TODO(bkyryliuk): figure out what to do with the skip rows field.
    params: Dict[str, str] = {
        "delim": delim,
        "location": location,
    }
    tblproperties: List[str] = []

    # The placeholders below are deliberately left unquoted: the values are
    # bound as strings and get quoted when they are rendered, so wrapping the
    # placeholder itself in quotes would produce a doubly quoted literal.
    if header_line_count is not None and header_line_count >= 0:
        tblproperties.append("'skip.header.line.count'=:header_line_count")
        params["header_line_count"] = str(header_line_count + 1)
    if null_values:
        # hive only supports 1 value for the null format
        tblproperties.append("'serialization.null.format'=:null_value")
        params["null_value"] = null_values[0]

    sql = (
        f"CREATE TABLE {str(table)} ( {schema_definition} )\n"
        "ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim\n"
        "STORED AS TEXTFILE LOCATION :location"
    )
    if tblproperties:
        # The properties clause is optional; append it only when needed.
        sql += f"\ntblproperties ({', '.join(tblproperties)})"
    return sql, params
@classmethod
def create_table_from_csv( # pylint: disable=too-many-arguments, too-many-locals
cls,
@@ -182,18 +221,17 @@ class HiveEngineSpec(PrestoEngineSpec):
bucket_path,
os.path.join(upload_prefix, table.table, os.path.basename(filename)),
)
sql = text(
f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location
tblproperties ('skip.header.line.count'='1')"""
sql, params = cls.get_create_table_stmt(
table,
schema_definition,
location,
csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
int(csv_to_df_kwargs.get("header", 0)),
csv_to_df_kwargs.get("na_values"),
)
engine = cls.get_engine(database)
engine.execute(
sql,
delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
location=location,
)
engine.execute(text(sql), **params)
@classmethod
def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]: