[druid] optimize Druid queries where possible (#1517)

* [druid] optimize Druid queries where possible

Try to use timeseries and topN queries where possible, falling back on the
two-phase groupby only where needed

* Fixing py3 bug
This commit is contained in:
Maxime Beauchemin
2016-11-02 11:25:33 -07:00
committed by GitHub
parent cdf4dd0302
commit 1b124bfb87
3 changed files with 110 additions and 65 deletions

View File

@@ -113,10 +113,11 @@ class DruidTests(CaravelTestCase):
instance.query_dict = {}
instance.query_builder.last_query.query_dict = {}
resp = self.client.get('/caravel/explore/druid/{}/'.format(
resp = self.get_resp('/caravel/explore/druid/{}/'.format(
datasource_id))
assert "[test_cluster].[test_datasource]" in resp.data.decode('utf-8')
assert "[test_cluster].[test_datasource]" in resp
# One groupby
url = (
'/caravel/explore_json/druid/{}/?viz_type=table&granularity=one+day&'
'druid_time_origin=&since=7+days+ago&until=now&row_limit=5000&'
@@ -125,8 +126,35 @@ class DruidTests(CaravelTestCase):
'action=&datasource_name=test_datasource&datasource_id={}&'
'datasource_type=druid&previous_viz_type=table&'
'force=true'.format(datasource_id, datasource_id))
resp = self.get_resp(url)
assert "Canada" in resp
resp = self.get_json_resp(url)
self.assertEqual("Canada", resp['data']['records'][0]['name'])
# two groupby
url = (
'/caravel/explore_json/druid/{}/?viz_type=table&granularity=one+day&'
'druid_time_origin=&since=7+days+ago&until=now&row_limit=5000&'
'include_search=false&metrics=count&groupby=name&'
'flt_col_0=dim1&groupby=second&'
'flt_op_0=in&flt_eq_0=&slice_id=&slice_name=&collapsed_fieldsets=&'
'action=&datasource_name=test_datasource&datasource_id={}&'
'datasource_type=druid&previous_viz_type=table&'
'force=true'.format(datasource_id, datasource_id))
resp = self.get_json_resp(url)
self.assertEqual("Canada", resp['data']['records'][0]['name'])
# no groupby
url = (
'/caravel/explore_json/druid/{}/?viz_type=table&granularity=one+day&'
'druid_time_origin=&since=7+days+ago&until=now&row_limit=5000&'
'include_search=false&metrics=count&'
'flt_col_0=dim1&'
'flt_op_0=in&flt_eq_0=&slice_id=&slice_name=&collapsed_fieldsets=&'
'action=&datasource_name=test_datasource&datasource_id={}&'
'datasource_type=druid&previous_viz_type=table&'
'force=true'.format(datasource_id, datasource_id))
resp = self.get_json_resp(url)
self.assertEqual(
"2012-01-01T00:00:00", resp['data']['records'][0]['timestamp'])
def test_druid_sync_from_config(self):
CLUSTER_NAME = 'new_druid'