Files
superset2/superset/sql_parse.py
Maxime Beauchemin b839608c32 [sql lab] a better approach at limiting queries (#4947)
* [sql lab] a better approach at limiting queries

Currently there are two mechanisms that we use to enforce the row
limiting constraints, depending on the database engine:
1. use dbapi's `cursor.fetchmany()`
2. wrap the SQL into a limiting subquery

Method 1 isn't great as it can result in the database server storing
larger than required result sets in memory expecting another fetch
command while we know we don't need that.

Method 2 has a positive side of working with all database engines,
whether they use LIMIT, ROWNUM, TOP or whatever else since sqlalchemy
does the work as specified for the dialect. On the downside though
the query optimizer might not be able to optimize this as much as an
approach that doesn't use a subquery.

Since most modern DBs use the LIMIT syntax, this adds a regex approach
to modify the query and force a LIMIT clause without using a subquery
for the database that support this syntax and uses method 2 for all
others.

* Fixing build

* Fix lint

* Added more tests

* Fix tests
2018-05-14 14:44:05 -05:00

131 lines
4.3 KiB
Python

# -*- coding: utf-8 -*-
# pylint: disable=C,R,W
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import sqlparse
from sqlparse.sql import Identifier, IdentifierList
from sqlparse.tokens import Keyword, Name
RESULT_OPERATIONS = {'UNION', 'INTERSECT', 'EXCEPT'}
PRECEDES_TABLE_NAME = {'FROM', 'JOIN', 'DESC', 'DESCRIBE', 'WITH'}
class SupersetQuery(object):
def __init__(self, sql_statement):
self.sql = sql_statement
self._table_names = set()
self._alias_names = set()
# TODO: multistatement support
logging.info('Parsing with sqlparse statement {}'.format(self.sql))
self._parsed = sqlparse.parse(self.sql)
for statement in self._parsed:
self.__extract_from_token(statement)
self._table_names = self._table_names - self._alias_names
@property
def tables(self):
return self._table_names
def is_select(self):
return self._parsed[0].get_type() == 'SELECT'
def stripped(self):
return self.sql.strip(' \t\n;')
@staticmethod
def __precedes_table_name(token_value):
for keyword in PRECEDES_TABLE_NAME:
if keyword in token_value:
return True
return False
@staticmethod
def __get_full_name(identifier):
if len(identifier.tokens) > 1 and identifier.tokens[1].value == '.':
return '{}.{}'.format(identifier.tokens[0].value,
identifier.tokens[2].value)
return identifier.get_real_name()
@staticmethod
def __is_result_operation(keyword):
for operation in RESULT_OPERATIONS:
if operation in keyword.upper():
return True
return False
@staticmethod
def __is_identifier(token):
return isinstance(token, (IdentifierList, Identifier))
def __process_identifier(self, identifier):
# exclude subselects
if '(' not in '{}'.format(identifier):
self._table_names.add(self.__get_full_name(identifier))
return
# store aliases
if hasattr(identifier, 'get_alias'):
self._alias_names.add(identifier.get_alias())
if hasattr(identifier, 'tokens'):
# some aliases are not parsed properly
if identifier.tokens[0].ttype == Name:
self._alias_names.add(identifier.tokens[0].value)
self.__extract_from_token(identifier)
def as_create_table(self, table_name, overwrite=False):
"""Reformats the query into the create table as query.
Works only for the single select SQL statements, in all other cases
the sql query is not modified.
:param superset_query: string, sql query that will be executed
:param table_name: string, will contain the results of the
query execution
:param overwrite, boolean, table table_name will be dropped if true
:return: string, create table as query
"""
exec_sql = ''
sql = self.stripped()
if overwrite:
exec_sql = 'DROP TABLE IF EXISTS {table_name};\n'
exec_sql += 'CREATE TABLE {table_name} AS \n{sql}'
return exec_sql.format(**locals())
def __extract_from_token(self, token):
if not hasattr(token, 'tokens'):
return
table_name_preceding_token = False
for item in token.tokens:
if item.is_group and not self.__is_identifier(item):
self.__extract_from_token(item)
if item.ttype in Keyword:
if self.__precedes_table_name(item.value.upper()):
table_name_preceding_token = True
continue
if not table_name_preceding_token:
continue
if item.ttype in Keyword:
if self.__is_result_operation(item.value):
table_name_preceding_token = False
continue
# FROM clause is over
break
if isinstance(item, Identifier):
self.__process_identifier(item)
if isinstance(item, IdentifierList):
for token in item.tokens:
if self.__is_identifier(token):
self.__process_identifier(token)