fix(hive): Update _latest_partition_from_df in HiveEngineSpec to work on tables with multiple indexes (#14302)

* Fix _latest_partition_from_df in HiveEngineSpec

* Add test HiveEngineSpec._latest_partition_from_df

* Fix formatting to pass black

Co-authored-by: Ville Brofeldt <ville.v.brofeldt@gmail.com>
This commit is contained in:
Long Le Xich
2021-11-23 16:07:47 +08:00
committed by GitHub
parent 5d1c9078ad
commit bc855f4040
2 changed files with 26 additions and 2 deletions

View File

@@ -429,9 +429,12 @@ class HiveEngineSpec(PrestoEngineSpec):
@classmethod
def _latest_partition_from_df(cls, df: pd.DataFrame) -> Optional[List[str]]:
    """Return the values of the latest partition found in ``df``.

    Hive partitions look like ds={partition name}/ds={partition name},
    i.e. one ``key=value`` segment per partition column, joined by ``/``.

    :param df: frame whose first column holds the partition strings
    :return: one value per ``key=value`` segment of the latest partition
        string, in segment order, or ``None`` when ``df`` is empty
    :raises IndexError: if a segment of the latest partition has no ``=``
    """
    if df.empty:
        return None

    # "Latest" is the max() of the first column; for string values this is
    # the lexicographically greatest partition string.
    latest_partition = df.iloc[:, 0].max()

    # Split on the first "=" only, so a value that itself contains "=" is
    # returned whole instead of being cut at its first "=".
    return [
        segment.split("=", 1)[1] for segment in latest_partition.split("/")
    ]
@classmethod

View File

@@ -17,6 +17,7 @@
# isort:skip_file
from datetime import datetime
from unittest import mock
from typing import List
import pytest
import pandas as pd
@@ -379,3 +380,23 @@ def test_where_latest_partition_no_columns_no_values(mock_method):
"test_table", "test_schema", db, select()
)
assert result is None
def test__latest_partition_from_df():
    """Check HiveEngineSpec._latest_partition_from_df on one- and two-key partitions."""

    def is_correct_result(data: List, result: List) -> bool:
        # Wrap the raw partition strings in a single-column frame and compare
        # what the engine spec extracts against the expected values.
        frame = pd.DataFrame({"partition": data})
        latest = HiveEngineSpec._latest_partition_from_df(frame)
        return latest == result

    # (partition strings, expected values of the latest partition)
    cases = [
        (["ds=01-01-19"], ["01-01-19"]),
        (["ds=01-01-19", "ds=01-03-19", "ds=01-02-19"], ["01-03-19"]),
        (["ds=01-01-19/hour=1"], ["01-01-19", "1"]),
        (
            ["ds=01-01-19/hour=1", "ds=01-03-19/hour=1", "ds=01-02-19/hour=1"],
            ["01-03-19", "1"],
        ),
        (
            ["ds=01-01-19/hour=1", "ds=01-03-19/hour=1", "ds=01-02-19/hour=2"],
            ["01-03-19", "1"],
        ),
    ]
    for partitions, expected in cases:
        assert is_correct_result(partitions, expected)