Merge pull request #255 from preset-io/use-sqlglot

betodealmeida · web-flow · commit 9bdc49c79994 · 2024-01-09T19:17:09.000-05:00
chore: use sqlglot
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -1,6 +1,6 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
-# To update, run:
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
 #
 #    pip-compile --no-annotate dev-requirements.in
 #
@@ -68,7 +68,6 @@ six==1.16.0
 soupsieve==2.3.2.post1
 sqlalchemy==1.4.40
 sqlglot==20.7.1
-sqlparse==0.4.3
 tabulate==0.8.10
 toml==0.10.2
 tomli==2.0.1
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
-# To update, run:
+# This file is autogenerated by pip-compile with Python 3.8
+# by the following command:
 #
 #    pip-compile --no-annotate
 #
@@ -41,7 +41,6 @@ six==1.16.0
 soupsieve==2.3.2.post1
 sqlalchemy==1.4.35
 sqlglot==20.7.1
-sqlparse==0.4.3
 tabulate==0.8.9
 typing-extensions==4.2.0
 urllib3==1.26.9
diff --git a/setup.cfg b/setup.cfg
@@ -72,7 +72,6 @@ install_requires =
     rich>=12.3.0
     sqlalchemy>=1.4,<2
     sqlglot>=19
-    sqlparse>=0.4.3
     tabulate>=0.8.9
     typing-extensions>=4.0.1
     yarl>=1.7.2
diff --git a/src/preset_cli/cli/superset/sql.py b/src/preset_cli/cli/superset/sql.py
@@ -15,14 +15,14 @@
 from prompt_toolkit.styles.pygments import style_from_pygments_cls
 from pygments.lexers.sql import SqlLexer
 from pygments.styles import get_style_by_name
-from sqlparse.keywords import KEYWORDS
+from sqlglot.tokens import Tokenizer
 from tabulate import tabulate
 from yarl import URL
 
 from preset_cli.api.clients.superset import SupersetClient
 from preset_cli.exceptions import SupersetError
 
-sql_completer = WordCompleter(list(KEYWORDS))
+sql_completer = WordCompleter(list(Tokenizer.KEYWORDS))
 style = style_from_pygments_cls(get_style_by_name("stata-dark"))
 
 
diff --git a/src/preset_cli/cli/superset/sync/dbt/metrics.py b/src/preset_cli/cli/superset/sync/dbt/metrics.py
@@ -11,12 +11,10 @@
 from collections import defaultdict
 from typing import Dict, List, Optional, Set
 
-import sqlparse
-from sqlglot import Expression, parse_one
+import sqlglot
+from sqlglot import Expression, exp, parse_one
 from sqlglot.expressions import Alias, Case, Identifier, If, Join, Select, Table, Where
 from sqlglot.optimizer import traverse_scope
-from sqlparse.sql import Identifier as SQLParseIdentifier
-from sqlparse.sql import TokenList
 
 from preset_cli.api.clients.dbt import (
     FilterSchema,
@@ -43,7 +41,7 @@
 
 def get_metric_expression(unique_id: str, metrics: Dict[str, MetricSchema]) -> str:
     """
-    Return a SQL expression for a given dbt metric.
+    Return a SQL expression for a given dbt metric using sqlglot.
     """
     if unique_id not in metrics:
         raise Exception(f"Invalid metric {unique_id}")
@@ -77,18 +75,16 @@ def get_metric_expression(unique_id: str, metrics: Dict[str, MetricSchema]) -> s
         return f"COUNT(DISTINCT {sql})"
 
     if type_ in {"expression", "derived"}:
-        statement = sqlparse.parse(sql)[0]
-        tokens = statement.tokens[:]
-        while tokens:
-            token = tokens.pop(0)
-
-            if isinstance(token, SQLParseIdentifier) and token.value in metrics:
-                parent_sql = get_metric_expression(token.value, metrics)
-                token.tokens = sqlparse.parse(parent_sql)[0].tokens
-            elif isinstance(token, TokenList):
-                tokens.extend(token.tokens)
-
-        return str(statement)
+        expression = sqlglot.parse_one(sql)
+        tokens = expression.find_all(exp.Column)
+
+        for token in tokens:
+            if token.sql() in metrics:
+                parent_sql = get_metric_expression(token.sql(), metrics)
+                parent_expression = sqlglot.parse_one(parent_sql)
+                token.replace(parent_expression)
+
+        return expression.sql()
 
     sorted_metric = dict(sorted(metric.items()))
     raise Exception(f"Unable to generate metric expression from: {sorted_metric}")
diff --git a/tests/cli/superset/sync/dbt/metrics_test.py b/tests/cli/superset/sync/dbt/metrics_test.py
@@ -84,8 +84,8 @@ def test_get_metric_expression() -> None:
     assert get_metric_expression("two", metrics) == "COUNT(DISTINCT user_id)"
 
     assert get_metric_expression("three", metrics) == (
-        "COUNT(CASE WHEN is_paying is true AND lifetime_value >= 100 AND "
-        "company_name != 'Acme, Inc' AND signup_date >= '2020-01-01' THEN user_id END) "
+        "COUNT(CASE WHEN is_paying IS TRUE AND lifetime_value >= 100 AND "
+        "company_name <> 'Acme, Inc' AND signup_date >= '2020-01-01' THEN user_id END) "
         "- COUNT(DISTINCT user_id)"
     )
 

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`#`
`2`		`-# This file is autogenerated by pip-compile with python 3.8`
`3`		`-# To update, run:`
	`2`	`+# This file is autogenerated by pip-compile with Python 3.8`
	`3`	`+# by the following command:`
`4`	`4`	`#`
`5`	`5`	`# pip-compile --no-annotate dev-requirements.in`
`6`	`6`	`#`
`@@ -68,7 +68,6 @@ six==1.16.0`
`68`	`68`	`soupsieve==2.3.2.post1`
`69`	`69`	`sqlalchemy==1.4.40`
`70`	`70`	`sqlglot==20.7.1`
`71`		`-sqlparse==0.4.3`
`72`	`71`	`tabulate==0.8.10`
`73`	`72`	`toml==0.10.2`
`74`	`73`	`tomli==2.0.1`
Original file line number	Diff line number	Diff line change
`@@ -84,8 +84,8 @@ def test_get_metric_expression() -> None:`
`84`	`84`	`assert get_metric_expression("two", metrics) == "COUNT(DISTINCT user_id)"`
`85`	`85`
`86`	`86`	`assert get_metric_expression("three", metrics) == (`
`87`		`- "COUNT(CASE WHEN is_paying is true AND lifetime_value >= 100 AND "`
`88`		`- "company_name != 'Acme, Inc' AND signup_date >= '2020-01-01' THEN user_id END) "`
	`87`	`+ "COUNT(CASE WHEN is_paying IS TRUE AND lifetime_value >= 100 AND "`
	`88`	`+ "company_name <> 'Acme, Inc' AND signup_date >= '2020-01-01' THEN user_id END) "`
`89`	`89`	`"- COUNT(DISTINCT user_id)"`
`90`	`90`	`)`
`91`	`91`