diff --git a/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx b/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx
index 7f855e9c3ea..247759e0894 100644
--- a/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx
+++ b/caravel/assets/javascripts/SqlLab/components/ResultSet.jsx
@@ -72,7 +72,7 @@ class ResultSet extends React.Component {
col.name)}
sortable
className="table table-condensed table-bordered"
filterBy={this.state.searchText}
diff --git a/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx b/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx
index 718a9fda232..ee8d2cd3e9b 100644
--- a/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx
+++ b/caravel/assets/javascripts/SqlLab/components/VisualizeModal.jsx
@@ -26,10 +26,25 @@ class VisualizeModal extends React.Component {
columns: {},
hints: [],
};
+ // update columns if possible
+ this.setStateFromProps();
+ }
+ componentWillMount() {
+ this.setStateFromProps();
}
componentDidMount() {
this.validate();
}
+ setStateFromProps() {
+ if (!this.props.query || !this.props.query.results.columns) {
+ return;
+ }
+ const columns = {};
+ this.props.query.results.columns.forEach((col) => {
+ columns[col.name] = col;
+ });
+ this.setState({ columns });
+ }
validate() {
const hints = [];
const cols = this.mergedColumns();
@@ -67,8 +82,8 @@ class VisualizeModal extends React.Component {
const columns = Object.assign({}, this.state.columns);
if (this.props.query && this.props.query.results.columns) {
this.props.query.results.columns.forEach((col) => {
- if (columns[col] === undefined) {
- columns[col] = {};
+ if (columns[col.name] === undefined) {
+ columns[col.name] = col;
}
});
}
@@ -88,17 +103,17 @@ class VisualizeModal extends React.Component {
this.setState({ datasourceName: event.target.value });
this.validate();
}
- changeCheckbox(attr, col, event) {
+ changeCheckbox(attr, columnName, event) {
let columns = this.mergedColumns();
- const column = Object.assign({}, columns[col], { [attr]: event.target.checked });
- columns = Object.assign({}, columns, { [col]: column });
+ const column = Object.assign({}, columns[columnName], { [attr]: event.target.checked });
+ columns = Object.assign({}, columns, { [columnName]: column });
this.setState({ columns }, this.validate);
}
- changeAggFunction(col, option) {
+ changeAggFunction(columnName, option) {
let columns = this.mergedColumns();
const val = (option) ? option.value : null;
- const column = Object.assign({}, columns[col], { agg: val });
- columns = Object.assign({}, columns, { [col]: column });
+ const column = Object.assign({}, columns[columnName], { agg: val });
+ columns = Object.assign({}, columns, { [columnName]: column });
this.setState({ columns }, this.validate);
}
render() {
@@ -106,12 +121,12 @@ class VisualizeModal extends React.Component {
return ;
}
const tableData = this.props.query.results.columns.map((col) => ({
- column: col,
+ column: col.name,
is_dimension: (
),
@@ -119,8 +134,8 @@ class VisualizeModal extends React.Component {
),
agg_func: (
@@ -132,8 +147,8 @@ class VisualizeModal extends React.Component {
{ value: 'avg', label: 'AVG(x)' },
{ value: 'count_distinct', label: 'COUNT(DISTINCT x)' },
]}
- onChange={this.changeAggFunction.bind(this, col)}
- value={(this.state.columns[col]) ? this.state.columns[col].agg : null}
+ onChange={this.changeAggFunction.bind(this, col.name)}
+ value={(this.state.columns[col.name]) ? this.state.columns[col.name].agg : null}
/>
),
}));
diff --git a/caravel/dataframe.py b/caravel/dataframe.py
new file mode 100644
index 00000000000..4b86e80d45c
--- /dev/null
+++ b/caravel/dataframe.py
@@ -0,0 +1,113 @@
+""" Caravel wrapper around pandas.DataFrame.
+
+TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
+ dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
+TODO(bkyryliuk): recognize integer encoded enums.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import pandas as pd
+import numpy as np
+
+
+INFER_COL_TYPES_THRESHOLD = 95
+INFER_COL_TYPES_SAMPLE_SIZE = 100
+
+
+# http://pandas.pydata.org/pandas-docs/stable/internals.html#
+# subclassing-pandas-data-structures
+class CaravelDataFrame(object):
+ def __init__(self, df):
+ self.__df = df.where((pd.notnull(df)), None)
+
+ @property
+ def size(self):
+ return len(self.__df.index)
+
+ @property
+ def data(self):
+ return self.__df.to_dict(orient='records')
+
+ @property
+ def columns_dict(self):
+ """Provides metadata about columns for data visualization.
+
+ :return: dict, with the fields name, type, is_date, is_dim and agg.
+ """
+ if self.__df.empty:
+ return None
+
+ columns = []
+ sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
+ sample = self.__df
+ if sample_size:
+ sample = self.__df.sample(sample_size)
+ for col in self.__df.dtypes.keys():
+ column = {
+ 'name': col,
+ 'type': self.__df.dtypes[col].name,
+ 'is_date': is_date(self.__df.dtypes[col]),
+ 'is_dim': is_dimension(self.__df.dtypes[col], col),
+ }
+ agg = agg_func(self.__df.dtypes[col], col)
+ if agg_func:
+ column['agg'] = agg
+
+ if column['type'] == 'object':
+ # check if encoded datetime
+ if (datetime_conversion_rate(sample[col]) >
+ INFER_COL_TYPES_THRESHOLD):
+ column.update({
+ 'type': 'datetime_string',
+ 'is_date': True,
+ 'is_dim': False,
+ 'agg': None
+ })
+ # 'agg' is optional attribute
+ if not column['agg']:
+ column.pop('agg', None)
+ columns.append(column)
+
+ return columns
+
+
+# It will give false positives on the numbers that are stored as strings.
+# It is hard to distinguish integer numbers and timestamps
+def datetime_conversion_rate(data_series):
+ success = 0
+ total = 0
+ for value in data_series:
+ total = total + 1
+ try:
+ pd.to_datetime(value)
+ success = success + 1
+ except Exception:
+ continue
+ return 100 * success / total
+
+
+def is_date(dtype):
+ return dtype.name.startswith('datetime')
+
+
+def is_dimension(dtype, column_name):
+ if is_id(column_name):
+ return False
+ return dtype == np.object or dtype == np.bool
+
+
+def is_id(column_name):
+ return column_name.startswith('id') or column_name.endswith('id')
+
+
+def agg_func(dtype, column_name):
+ # consider checking for key substring too.
+ if is_id(column_name):
+ return 'count_distinct'
+ if np.issubdtype(dtype, np.number):
+ return 'sum'
+ return None
diff --git a/caravel/sql_lab.py b/caravel/sql_lab.py
index 13009f56349..97fa11ac202 100644
--- a/caravel/sql_lab.py
+++ b/caravel/sql_lab.py
@@ -2,10 +2,10 @@ import celery
from datetime import datetime
import pandas as pd
import logging
+import numpy
import time
-import json
-from caravel import app, cache, db, models, utils
+from caravel import app, db, models, utils, dataframe
QueryStatus = models.QueryStatus
@@ -114,45 +114,40 @@ def get_sql_results(query_id, return_results=True):
time.sleep(1)
polled = cursor.poll()
- columns = None
- data = None
+ cdf = None
if result_proxy.cursor:
- columns = [col[0] for col in result_proxy.cursor.description]
+ column_names = [col[0] for col in result_proxy.cursor.description]
data = result_proxy.fetchall()
- df = pd.DataFrame(data, columns=columns)
- df = df.where((pd.notnull(df)), None)
- # TODO consider generating tuples instead of dicts to send
- # less data through the wire. The command bellow does that,
- # but we'd need to align on the client side.
- # data = df.values.tolist()
- data = df.to_dict(orient='records')
+ cdf = dataframe.CaravelDataFrame(
+ pd.DataFrame(data, columns=column_names))
+ # TODO consider generating tuples instead of dicts to send
+ # less data through the wire. The command bellow does that,
+ # but we'd need to align on the client side.
+ # data = df.values.tolist()
query.rows = result_proxy.rowcount
query.progress = 100
query.status = QueryStatus.SUCCESS
- if query.rows == -1 and data:
+ if query.rows == -1 and cdf:
# Presto doesn't provide result_proxy.row_count
- query.rows = len(data)
-
- # CTAs queries result in 1 cell having the # of the added rows.
+ query.rows = cdf.size
if query.select_as_cta:
query.select_sql = '{}'.format(database.select_star(
query.tmp_table_name, limit=query.limit))
-
query.end_time = utils.now_as_float()
session.commit()
- payload = {
- 'query_id': query.id,
- 'status': query.status,
- 'data': [],
- }
- if query.status == models.QueryStatus.SUCCESS:
- payload['data'] = data
- payload['columns'] = columns
- else:
- payload['error'] = query.error_message
if return_results:
+ payload = {
+ 'query_id': query.id,
+ 'status': query.status,
+ 'data': [],
+ }
+ if query.status == models.QueryStatus.SUCCESS:
+ payload['data'] = cdf.data if cdf else []
+ payload['columns'] = cdf.columns_dict if cdf else []
+ else:
+ payload['error'] = query.error_message
return payload
'''
# Hack testing using a kv store for results
diff --git a/tests/celery_tests.py b/tests/celery_tests.py
index 1cd3401c7e3..46cef8f3b82 100644
--- a/tests/celery_tests.py
+++ b/tests/celery_tests.py
@@ -14,7 +14,7 @@ import unittest
import pandas as pd
import caravel
-from caravel import app, appbuilder, db, models, sql_lab, utils
+from caravel import app, appbuilder, db, models, sql_lab, utils, dataframe
from .base_tests import CaravelTestCase
@@ -34,6 +34,7 @@ app.config['CELERY_CONFIG'] = CeleryConfig
class UtilityFunctionTests(CaravelTestCase):
+
# TODO(bkyryliuk): support more cases in CTA function.
def test_create_table_as(self):
select_query = "SELECT * FROM outer_space;"
@@ -193,8 +194,8 @@ class CeleryTestCase(CaravelTestCase):
result2 = self.run_sql(
1, sql_where, tmp_table='tmp_table_2', cta='true')
self.assertEqual(QueryStatus.SUCCESS, result2['query']['state'])
- self.assertIsNone(result2['data'])
- self.assertIsNone(result2['columns'])
+ self.assertEqual([], result2['data'])
+ self.assertEqual([], result2['columns'])
query2 = self.get_query_by_id(result2['query']['serverId'])
# Check the data in the tmp table.
@@ -208,8 +209,8 @@ class CeleryTestCase(CaravelTestCase):
result3 = self.run_sql(
1, sql_empty_result, tmp_table='tmp_table_3', cta='true',)
self.assertEqual(QueryStatus.SUCCESS, result3['query']['state'])
- self.assertIsNone(result3['data'])
- self.assertIsNone(result3['columns'])
+ self.assertEqual([], result3['data'])
+ self.assertEqual([], result3['columns'])
query3 = self.get_query_by_id(result3['query']['serverId'])
self.assertEqual(QueryStatus.SUCCESS, query3.status)
@@ -250,6 +251,52 @@ class CeleryTestCase(CaravelTestCase):
self.assertEqual(True, query1.select_as_cta)
self.assertEqual(True, query1.select_as_cta_used)
+ def test_get_columns_dict(self):
+ main_db = db.session.query(models.Database).filter_by(
+ database_name='main').first()
+ df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
+ cdf = dataframe.CaravelDataFrame(df)
+ if main_db.sqlalchemy_uri.startswith('sqlite'):
+ self.assertEqual(
+ [{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
+ 'is_dim': False},
+ {'is_date': True, 'type': 'datetime_string', 'name': 'ds2',
+ 'is_dim': False},
+ {'agg': 'sum', 'is_date': False, 'type': 'int64',
+ 'name': 'epoch_ms', 'is_dim': False},
+ {'agg': 'sum', 'is_date': False, 'type': 'int64',
+ 'name': 'epoch_s', 'is_dim': False},
+ {'is_date': True, 'type': 'datetime_string', 'name': 'string0',
+ 'is_dim': False},
+ {'is_date': False, 'type': 'object',
+ 'name': 'string1', 'is_dim': True},
+ {'is_date': True, 'type': 'datetime_string', 'name': 'string2',
+ 'is_dim': False},
+ {'is_date': False, 'type': 'object',
+ 'name': 'string3', 'is_dim': True}]
+ , cdf.columns_dict
+ )
+ else:
+ self.assertEqual(
+ [{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
+ 'is_dim': False},
+ {'is_date': True, 'type': 'datetime64[ns]',
+ 'name': 'ds2', 'is_dim': False},
+ {'agg': 'sum', 'is_date': False, 'type': 'int64',
+ 'name': 'epoch_ms', 'is_dim': False},
+ {'agg': 'sum', 'is_date': False, 'type': 'int64',
+ 'name': 'epoch_s', 'is_dim': False},
+ {'is_date': True, 'type': 'datetime_string', 'name': 'string0',
+ 'is_dim': False},
+ {'is_date': False, 'type': 'object',
+ 'name': 'string1', 'is_dim': True},
+ {'is_date': True, 'type': 'datetime_string', 'name': 'string2',
+ 'is_dim': False},
+ {'is_date': False, 'type': 'object',
+ 'name': 'string3', 'is_dim': True}]
+ , cdf.columns_dict
+ )
+
if __name__ == '__main__':
unittest.main()