diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c428d7b2..dd5ac4d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,6 +31,12 @@ repos: hooks: - id: black args: [--line-length=120, --skip-string-normalization] + - repo: https://github.com/tconbeer/sqlfmt + rev: v0.10.1 + hooks: + - id: sqlfmt + language_version: python + exclude: 'rasgotransforms/rasgotransforms/snippets' ci: autofix_commit_msg: '[pre-commit.ci] auto fixes from pre-commit.com hooks' autofix_prs: true diff --git a/rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.sql b/rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.sql index 0033dc26..cba31319 100644 --- a/rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.sql +++ b/rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.sql @@ -1,38 +1,39 @@ {%- set aggregations = aggregations.copy() %} {%- if 'numeric columns' in aggregations.keys() and aggregations['numeric columns']|length > 0 %} - {%- set all_columns = get_columns(source_table) %} - {%- for column, column_type in all_columns.items() %} - {%- if column not in aggregations.keys() and column_type|lower in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} - {%- do aggregations.setdefault(column, []).extend(aggregations['numeric columns']) %} - {%- endif %} - {%- endfor %} - {%- set _ = aggregations.pop('numeric columns') %} +{%- set all_columns = get_columns(source_table) %} +{%- for column, column_type in all_columns.items() %} +{%- if column not in aggregations.keys() and column_type|lower in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} +{%- do aggregations.setdefault(column, []).extend(aggregations['numeric columns']) %} +{%- endif %} +{%- endfor %} +{%- set _ = aggregations.pop('numeric columns') %} {%- endif -%} {%- if 'nonnumeric 
columns' in aggregations.keys() and aggregations['nonnumeric columns']|length > 0 %} - {%- set all_columns = all_columns if all_columns is defined else get_columns(source_table) %} - {%- for column, column_type in all_columns.items() %} - {%- if column not in aggregations.keys() and column_type|lower not in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} - {%- do aggregations.setdefault(column, []).extend(aggregations['nonnumeric columns']) %} - {%- endif %} - {%- endfor %} - {%- set _ = aggregations.pop('nonnumeric columns') %} +{%- set all_columns = all_columns if all_columns is defined else get_columns(source_table) %} +{%- for column, column_type in all_columns.items() %} +{%- if column not in aggregations.keys() and column_type|lower not in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} +{%- do aggregations.setdefault(column, []).extend(aggregations['nonnumeric columns']) %} +{%- endif %} +{%- endfor %} +{%- set _ = aggregations.pop('nonnumeric columns') %} {%- endif -%} -SELECT -{%- for group_item in group_by %} - {{ group_item }}, -{%- endfor -%} +select + {%- for group_item in group_by %} {{ group_item }}, {%- endfor -%} -{%- for col, aggs in aggregations.items() %} + {%- for col, aggs in aggregations.items() %} {%- set outer_loop = loop -%} {%- for agg in aggs %} - {%- if ' DISTINCT' in agg|upper %} - {{ agg|upper|replace(" DISTINCT", "") }}(DISTINCT {{ col }}) as {{ col ~ '_' ~ agg|upper|replace(" DISTINCT", "") ~ 'DISTINCT'}}{{ '' if loop.last and outer_loop.last else ',' }} - {%- else %} - {{ agg }}({{ col }}) as {{ col + '_' + agg }}{{ '' if loop.last and outer_loop.last else ',' }} - {%- endif %} + {%- if ' DISTINCT' in agg|upper %} + {{ agg|upper|replace(" DISTINCT", "") }} (distinct {{ col }}) + as {{ col ~ '_' ~ agg|upper|replace(" DISTINCT", "") ~ 'DISTINCT' }}{{ '' if loop.last 
and outer_loop.last else ',' }} + {%- else %} + {{ agg }} ( + {{ col }} + ) as {{ col + '_' + agg }}{{ '' if loop.last and outer_loop.last else ',' }} + {%- endif %} {%- endfor -%} -{%- endfor %} -FROM {{ source_table }} -GROUP BY {{ group_by | join(', ') }} + {%- endfor %} +from {{ source_table }} +group by {{ group_by | join(', ') }} diff --git a/rasgotransforms/rasgotransforms/transforms/aggregate/bigquery/aggregate.sql b/rasgotransforms/rasgotransforms/transforms/aggregate/bigquery/aggregate.sql index 55f55e1d..0702d112 100644 --- a/rasgotransforms/rasgotransforms/transforms/aggregate/bigquery/aggregate.sql +++ b/rasgotransforms/rasgotransforms/transforms/aggregate/bigquery/aggregate.sql @@ -1,116 +1,125 @@ {%- set aggregations = aggregations.copy() %} {%- if 'numeric columns' in aggregations.keys() and aggregations['numeric columns']|length > 0 %} - {%- set all_columns = get_columns(source_table) %} - {%- for column, column_type in all_columns.items() %} - {%- if column not in aggregations.keys() and column_type|lower in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} - {%- do aggregations.setdefault(column, []).extend(aggregations['numeric columns']) %} - {%- endif %} - {%- endfor %} - {%- set _ = aggregations.pop('numeric columns') %} +{%- set all_columns = get_columns(source_table) %} +{%- for column, column_type in all_columns.items() %} +{%- if column not in aggregations.keys() and column_type|lower in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} +{%- do aggregations.setdefault(column, []).extend(aggregations['numeric columns']) %} +{%- endif %} +{%- endfor %} +{%- set _ = aggregations.pop('numeric columns') %} {%- endif -%} {%- if 'nonnumeric columns' in aggregations.keys() and aggregations['nonnumeric columns']|length > 0 %} - {%- set all_columns = all_columns if all_columns is 
defined else get_columns(source_table) %} - {%- for column, column_type in all_columns.items() %} - {%- if column not in aggregations.keys() and column_type|lower not in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} - {%- do aggregations.setdefault(column, []).extend(aggregations['nonnumeric columns']) %} - {%- endif %} - {%- endfor %} - {%- set _ = aggregations.pop('nonnumeric columns') %} +{%- set all_columns = all_columns if all_columns is defined else get_columns(source_table) %} +{%- for column, column_type in all_columns.items() %} +{%- if column not in aggregations.keys() and column_type|lower not in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} +{%- do aggregations.setdefault(column, []).extend(aggregations['nonnumeric columns']) %} +{%- endif %} +{%- endfor %} +{%- set _ = aggregations.pop('nonnumeric columns') %} {%- endif -%} {%- set median_aggs = dict() -%} {%- set mode_aggs = dict() -%} {%- for col, aggs in aggregations.items() -%} - {%- for agg in aggs -%} - {%- if 'MEDIAN' in agg|upper -%} - {%- set _ = median_aggs.update({col: agg}) -%} - {%- elif 'MODE' in agg|upper -%} - {%- set _ = mode_aggs.update({col: agg}) -%} - {%- endif -%} - {%- endfor -%} +{%- for agg in aggs -%} +{%- if 'MEDIAN' in agg|upper -%} +{%- set _ = median_aggs.update({col: agg}) -%} +{%- elif 'MODE' in agg|upper -%} +{%- set _ = mode_aggs.update({col: agg}) -%} +{%- endif -%} +{%- endfor -%} {%- endfor -%} {%- if median_aggs -%} - WITH MEDIAN_CTE AS( - SELECT - DISTINCT {{ group_by | join(', ') }} +with + median_cte as ( + select distinct + {{ group_by | join(', ') }} {%- for med_col, med_agg in median_aggs.items() %} - ,PERCENTILE_CONT( {{ med_col }}, 0.5) OVER (PARTITION BY {{ group_by | join(', ') }}) AS {{ med_col }}_MEDIAN + , + percentile_cont({{ med_col }}, 0.5) over ( + partition by {{ group_by | 
join(', ') }} + ) as {{ med_col }}_median {%- endfor %} - FROM {{ source_table }} + from {{ source_table }} ), {%- endif -%} {%- if mode_aggs -%} - {%- if not median_aggs %} - WITH - {%- endif %} +{%- if not median_aggs %} +with +{%- endif %} {%- for mode_col, mode_agg in mode_aggs.items() %} - {{ mode_col }}_CTE AS ( - SELECT - {{ group_by | join(',\n') }} - ,{{ mode_col }} AS {{ mode_col }}_MODE - FROM ( - SELECT - {{ group_by | join(', ') }} - ,{{ mode_col }} - ,ROW_NUMBER() OVER (PARTITION BY {{ group_by | join(', ') }} ORDER BY COUNT({{ mode_col }}) DESC) rn - FROM {{ source_table }} - GROUP BY {{ group_by | join(', ') }}, {{ mode_col }} + {{ mode_col }}_cte as ( + select {{ group_by | join(',\n') }},{{ mode_col }} as {{ mode_col }}_mode + from + ( + select + {{ group_by | join(', ') }}, + {{ mode_col }}, + row_number() over ( + partition by {{ group_by | join(', ') }} + order by count({{ mode_col }}) desc + ) rn + from {{ source_table }} + group by {{ group_by | join(', ') }}, {{ mode_col }} ) - WHERE rn = 1 - ), + where rn = 1 + ), {%- endfor %} {%- endif -%} {%- if not (median_aggs or mode_aggs) %} -WITH +with {%- endif %} -AGGS AS ( - SELECT - {{ group_by | join(',\n') }} - {%- for col, aggs in aggregations.items() %} + aggs as ( + select + {{ group_by | join(',\n') }} + {%- for col, aggs in aggregations.items() %} {%- set outer_loop = loop -%} {%- for agg in aggs %} - {%- if ('MEDIAN' not in agg|upper and 'MODE' not in agg|upper) %} - {%- if ' DISTINCT' in agg|upper %} - ,{{ agg|replace(" DISTINCT", "") }}(DISTINCT {{ col }}) as {{ col ~ '_' ~ agg|replace(" DISTINCT", "") ~ 'DISTINCT'}} - {%- else %} - ,{{ agg }}({{ col }}) as {{ col + '_' + agg }} - {%- endif %} - {%- endif %} + {%- if ('MEDIAN' not in agg|upper and 'MODE' not in agg|upper) %} + {%- if ' DISTINCT' in agg|upper %} + , + {{ agg|replace(" DISTINCT", "") }} ( + distinct {{ col }} + ) as {{ col ~ '_' ~ agg|replace(" DISTINCT", "") ~ 'DISTINCT' }} + {%- else %},{{ agg }} ({{ col }}) as {{ col 
+ '_' + agg }} + {%- endif %} + {%- endif %} {%- endfor -%} - {%- endfor %} - FROM {{ source_table }} - GROUP BY {{ group_by | join(', ') }} -) -SELECT -a.* -{%- if median_aggs %} + {%- endfor %} + from {{ source_table }} + group by {{ group_by | join(', ') }} + ) +select + a.* + {%- if median_aggs %} {%- for med_col, med_agg in median_aggs.items() %} - ,med.{{ med_col }}_{{ med_agg }} + , med.{{ med_col }}_{{ med_agg }} {%- endfor %} -{%- endif %} -{%- if mode_aggs %} + {%- endif %} + {%- if mode_aggs %} {%- for mode_col, mode_agg in mode_aggs.items() %} - ,{{ mode_col }}_CTE.{{ mode_col }}_MODE + ,{{ mode_col }}_cte.{{ mode_col }}_mode {%- endfor %} -{%- endif %} -FROM AGGS a + {%- endif %} +from aggs a {%- if median_aggs %} - LEFT JOIN MEDIAN_CTE med - ON - {%- for group_col in group_by %} - {{'a.' + group_col + ' = med.' + group_col + (' AND' if not loop.last else '')}} +left join + median_cte med + on {%- for group_col in group_by %} + {{ 'a.' + group_col + ' = med.' + group_col + (' AND' if not loop.last else '') }} {%- endfor %} {%- endif %} {%- if mode_aggs %} - {%- for mode_col, mode_agg in mode_aggs.items() %} - LEFT JOIN {{ mode_col }}_CTE - ON - {%- for group_col in group_by %} - a.{{ group_col }} = {{ mode_col }}_CTE.{{ group_col }} {{ 'AND ' if not loop.last else '' }} - {%- endfor %} +{%- for mode_col, mode_agg in mode_aggs.items() %} +left join + {{ mode_col }}_cte + on {%- for group_col in group_by %} + a.{{ group_col }} + = {{ mode_col }}_cte.{{ group_col }} {{ 'AND ' if not loop.last else '' }} {%- endfor %} +{%- endfor %} {%- endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/aggregate/snowflake/aggregate.sql b/rasgotransforms/rasgotransforms/transforms/aggregate/snowflake/aggregate.sql index 862b6a94..e4ec419a 100644 --- a/rasgotransforms/rasgotransforms/transforms/aggregate/snowflake/aggregate.sql +++ b/rasgotransforms/rasgotransforms/transforms/aggregate/snowflake/aggregate.sql @@ -1,37 +1,38 @@ {%- set aggregations = 
aggregations.copy() %} {%- if 'numeric columns' in aggregations.keys() and aggregations['numeric columns']|length > 0 %} - {%- set all_columns = get_columns(source_table) %} - {%- for column, column_type in all_columns.items() %} - {%- if column not in aggregations.keys() and column_type|lower in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} - {%- do aggregations.setdefault(column, []).extend(aggregations['numeric columns']) %} - {%- endif %} - {%- endfor %} - {%- set _ = aggregations.pop('numeric columns') %} +{%- set all_columns = get_columns(source_table) %} +{%- for column, column_type in all_columns.items() %} +{%- if column not in aggregations.keys() and column_type|lower in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} +{%- do aggregations.setdefault(column, []).extend(aggregations['numeric columns']) %} +{%- endif %} +{%- endfor %} +{%- set _ = aggregations.pop('numeric columns') %} {%- endif -%} {%- if 'nonnumeric columns' in aggregations.keys() and aggregations['nonnumeric columns']|length > 0 %} - {%- set all_columns = all_columns if all_columns is defined else get_columns(source_table) %} - {%- for column, column_type in all_columns.items() %} - {%- if column not in aggregations.keys() and column_type|lower not in ['int', 'integer', 'bigint', 'smallint', 'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} - {%- do aggregations.setdefault(column, []).extend(aggregations['nonnumeric columns']) %} - {%- endif %} - {%- endfor %} - {%- set _ = aggregations.pop('nonnumeric columns') %} +{%- set all_columns = all_columns if all_columns is defined else get_columns(source_table) %} +{%- for column, column_type in all_columns.items() %} +{%- if column not in aggregations.keys() and column_type|lower not in ['int', 'integer', 'bigint', 'smallint', 
'number', 'numeric', 'float', 'float4', 'float8', 'decimal', 'double precision', 'real'] %} +{%- do aggregations.setdefault(column, []).extend(aggregations['nonnumeric columns']) %} +{%- endif %} +{%- endfor %} +{%- set _ = aggregations.pop('nonnumeric columns') %} {%- endif -%} -SELECT -{%- for group_item in group_by %} - {{ group_item }}, -{%- endfor %} -{%- for col, aggs in aggregations.items() %} +select + {%- for group_item in group_by %} {{ group_item }}, {%- endfor %} + {%- for col, aggs in aggregations.items() %} {%- set outer_loop = loop -%} {%- for agg in aggs|unique %} - {%- if ' DISTINCT' in agg|upper %} - {{ agg|upper|replace(" DISTINCT", "") }}(DISTINCT {{ col }}) as {{ col ~ '_' ~ agg|upper|replace(" DISTINCT", "") ~ 'DISTINCT'}}{{ '' if loop.last and outer_loop.last else ',' }} - {%- else %} - {{ agg }}({{ col }}) as {{ col + '_' + agg }}{{ '' if loop.last and outer_loop.last else ',' }} - {%- endif %} + {%- if ' DISTINCT' in agg|upper %} + {{ agg|upper|replace(" DISTINCT", "") }} (distinct {{ col }}) + as {{ col ~ '_' ~ agg|upper|replace(" DISTINCT", "") ~ 'DISTINCT' }}{{ '' if loop.last and outer_loop.last else ',' }} + {%- else %} + {{ agg }} ( + {{ col }} + ) as {{ col + '_' + agg }}{{ '' if loop.last and outer_loop.last else ',' }} + {%- endif %} {%- endfor -%} -{%- endfor %} -FROM {{ source_table }} -GROUP BY {{ group_by | join(', ') }} \ No newline at end of file + {%- endfor %} +from {{ source_table }} +group by {{ group_by | join(', ') }} diff --git a/rasgotransforms/rasgotransforms/transforms/aggregate_string/aggregate_string.sql b/rasgotransforms/rasgotransforms/transforms/aggregate_string/aggregate_string.sql index a60dc35b..ca6599c1 100644 --- a/rasgotransforms/rasgotransforms/transforms/aggregate_string/aggregate_string.sql +++ b/rasgotransforms/rasgotransforms/transforms/aggregate_string/aggregate_string.sql @@ -1,7 +1,10 @@ -SELECT {{ group_by | join(', ') }} -{%- for agg_column in agg_columns %} -, listagg({{ 'distinct ' if distinct 
else ''}} {{agg_column}}, '{{sep}}') -WITHIN group (order by {{agg_column}} {{order}}) as {{agg_column}}_listagg -{%- endfor %} -FROM {{ source_table }} -GROUP BY {{ group_by | join(', ') }} \ No newline at end of file +select + {{ group_by | join(', ') }} + {%- for agg_column in agg_columns %} + , + listagg( + {{ 'distinct ' if distinct else '' }} {{ agg_column }}, '{{sep}}' + ) within group (order by {{ agg_column }} {{ order }}) as {{ agg_column }}_listagg + {%- endfor %} +from {{ source_table }} +group by {{ group_by | join(', ') }} diff --git a/rasgotransforms/rasgotransforms/transforms/apply/apply.sql b/rasgotransforms/rasgotransforms/transforms/apply/apply.sql index 80716eac..873497cc 100644 --- a/rasgotransforms/rasgotransforms/transforms/apply/apply.sql +++ b/rasgotransforms/rasgotransforms/transforms/apply/apply.sql @@ -1,3 +1,5 @@ -{# Placeholder code. Will be replaced by user supplied template #} -SELECT * FROM {{ source_table }} -{{ raise_exception('Placeholder code must be replaced by user supplied template') }} \ No newline at end of file +{# Placeholder code. 
Will be replaced by user supplied template #} +select * +from + {{ source_table }} + {{ raise_exception('Placeholder code must be replaced by user supplied template') }} diff --git a/rasgotransforms/rasgotransforms/transforms/bin/bigquery/bin.sql b/rasgotransforms/rasgotransforms/transforms/bin/bigquery/bin.sql index f121dd53..9ff7a95c 100644 --- a/rasgotransforms/rasgotransforms/transforms/bin/bigquery/bin.sql +++ b/rasgotransforms/rasgotransforms/transforms/bin/bigquery/bin.sql @@ -1,16 +1,22 @@ -SELECT *, -{%- if type == 'ntile' %} - ntile({{bin_count}}) OVER (ORDER BY {{column}}) AS {{column}}_{{bin_count}}_NTB -{%- elif type == 'equalwidth' %} - RANGE_BUCKET( - {{ column }}, - GENERATE_ARRAY( - (SELECT MIN({{ column }}) FROM {{ source_table }}) - ,(SELECT MAX({{ column }}) FROM {{ source_table }}) - ,(SELECT (MAX({{ column }}) - MIN({{ column }}))/20 FROM {{ source_table }}) - ) - ) AS {{column}}_{{bin_count}}_EWB -{%- else %} - {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} -{%- endif %} -FROM {{ source_table }} +select + *, + {%- if type == 'ntile' %} + ntile({{ bin_count }}) over ( + order by {{ column }} + ) as {{ column }}_{{ bin_count }}_ntb + {%- elif type == 'equalwidth' %} + range_bucket( + {{ column }}, + generate_array( + (select min({{ column }}) from {{ source_table }}), + (select max({{ column }}) from {{ source_table }}), + ( + select (max({{ column }}) - min({{ column }})) / 20 + from {{ source_table }} + ) + ) + ) as {{ column }}_{{ bin_count }}_ewb + {%- else %} + {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} + {%- endif %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/bin/bin.sql b/rasgotransforms/rasgotransforms/transforms/bin/bin.sql index e5b2915b..c2589883 100644 --- a/rasgotransforms/rasgotransforms/transforms/bin/bin.sql +++ b/rasgotransforms/rasgotransforms/transforms/bin/bin.sql @@ -1,12 +1,17 @@ -SELECT *, 
-{% if type == 'ntile' %} - ntile({{bin_count}}) OVER (ORDER BY {{column}}) AS {{column}}_{{bin_count}}_NTB -{% elif type == 'equalwidth' %} - width_bucket({{column}}, - (SELECT MIN({{column}}) FROM {{source_table}}), - (SELECT MAX({{column}}) FROM {{source_table}}), - {{bin_count}}) AS {{column}}_{{bin_count}}_EWB -{% else %} - {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} -{% endif %} -FROM {{ source_table }} +select + *, + {% if type == 'ntile' %} + ntile({{ bin_count }}) over ( + order by {{ column }} + ) as {{ column }}_{{ bin_count }}_ntb + {% elif type == 'equalwidth' %} + width_bucket( + {{ column }}, + (select min({{ column }}) from {{ source_table }}), + (select max({{ column }}) from {{ source_table }}), + {{ bin_count }} + ) as {{ column }}_{{ bin_count }}_ewb + {% else %} + {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} + {% endif %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/cast/bigquery/cast.sql b/rasgotransforms/rasgotransforms/transforms/cast/bigquery/cast.sql index f19de079..30deb827 100644 --- a/rasgotransforms/rasgotransforms/transforms/cast/bigquery/cast.sql +++ b/rasgotransforms/rasgotransforms/transforms/cast/bigquery/cast.sql @@ -1,32 +1,49 @@ {%- if overwrite_columns == true -%} - {%- set source_columns = get_columns(source_table) -%} - {%- set untouched_cols = source_columns | reject('in', casts) -%} +{%- set source_columns = get_columns(source_table) -%} +{%- set untouched_cols = source_columns | reject('in', casts) -%} - SELECT {% for col in untouched_cols %}{{ col }},{% endfor %} +select + {% for col in untouched_cols %} {{ col }},{% endfor %} {%- for target_col, type in casts.items() %} - {%- if type|lower == 'float' %} - CAST({{target_col}} AS FLOAT64) AS {{target_col}}{{", " if not loop.last else ""}}f - {%- elif type|lower == 'number' %} - CAST({{target_col}} AS NUMERIC) AS {{target_col}}{{", " if not 
loop.last else ""}} - {%- else %} - CAST({{target_col}} AS {{type}}) AS {{target_col}}{{", " if not loop.last else ""}} - {%- endif %} + {%- if type|lower == 'float' %} + cast( + {{ target_col }} as float64 + ) as {{ target_col }}{{ ", " if not loop.last else "" }} + {%- elif type|lower == 'number' %} + cast( + {{ target_col }} as numeric + ) as {{ target_col }}{{ ", " if not loop.last else "" }} + {%- else %} + cast( + {{ target_col }} as {{ type }} + ) as {{ target_col }}{{ ", " if not loop.last else "" }} + {%- endif %} {%- endfor %} - FROM {{ source_table }} +from {{ source_table }} {%- else -%} - SELECT * +select + * {%- for target_col, type in casts.items() %} - {%- if type|lower == 'float' %} - ,CAST({{target_col}} AS FLOAT64) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}} - {%- elif type|lower == 'number' %} - ,CAST({{target_col}} AS NUMERIC) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}} - {%- else %} - ,CAST({{target_col}} AS {{type}}) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}} - {%- endif %} + {%- if type|lower == 'float' %} + , + cast( + {{ target_col }} as float64 + ) as {{ cleanse_name(target_col)+'_'+cleanse_name(type) }} + {%- elif type|lower == 'number' %} + , + cast( + {{ target_col }} as numeric + ) as {{ cleanse_name(target_col)+'_'+cleanse_name(type) }} + {%- else %} + , + cast( + {{ target_col }} as {{ type }} + ) as {{ cleanse_name(target_col)+'_'+cleanse_name(type) }} + {%- endif %} {%- endfor %} - FROM {{ source_table }} +from {{ source_table }} {%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/cast/cast.sql b/rasgotransforms/rasgotransforms/transforms/cast/cast.sql index 0c228dd9..c6276be3 100644 --- a/rasgotransforms/rasgotransforms/transforms/cast/cast.sql +++ b/rasgotransforms/rasgotransforms/transforms/cast/cast.sql @@ -3,18 +3,25 @@ {%- set source_columns = get_columns(source_table) -%} {%- set untouched_cols = source_columns | reject('in', casts) -%} -SELECT {% for col in
untouched_cols %}{{ col }},{% endfor %} -{%- for target_col, type in casts.items() %} - CAST({{target_col}} AS {{type}}) AS {{target_col}}{{", " if not loop.last else ""}} -{%- endfor %} -FROM {{ source_table }} +select + {% for col in untouched_cols %} {{ col }},{% endfor %} + {%- for target_col, type in casts.items() %} + cast( + {{ target_col }} as {{ type }} + ) as {{ target_col }}{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} {%- else -%} -SELECT * -{%- for target_col, type in casts.items() %} - , CAST({{target_col}} AS {{type}}) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}} -{%- endfor %} -FROM {{ source_table }} +select + * + {%- for target_col, type in casts.items() %} + , + cast( + {{ target_col }} as {{ type }} + ) as {{ cleanse_name(target_col)+'_'+cleanse_name(type) }} + {%- endfor %} +from {{ source_table }} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql b/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql index 613ecacc..6556bce1 100644 --- a/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql +++ b/rasgotransforms/rasgotransforms/transforms/clean/bigquery/clean.sql @@ -1,89 +1,84 @@ {%- set source_col_names = get_columns(source_table) -%} {%- for column_name in source_col_names.keys() -%} - {%- if column_name not in columns.keys() -%} - {%- do columns.__setitem__(column_name, {}) -%} - {%- endif -%} +{%- if column_name not in columns.keys() -%} +{%- do columns.__setitem__(column_name, {}) -%} +{%- endif -%} {%- endfor -%} {%- set drop_cols = [] -%} {%- for column in columns.keys() -%} - {%- if columns[column].drop -%} - {{drop_cols.append(column) or ""}} - {%- endif -%} -{%- endfor -%} -{%- for col in drop_cols -%} - {%- set _x = columns.pop(col) -%} +{%- if columns[column].drop -%} {{ drop_cols.append(column) or "" }} {%- endif -%} {%- endfor -%} +{%- for col in drop_cols -%} {%- set _x = 
columns.pop(col) -%} {%- endfor -%} {%- macro get_select_column(name, col) -%} - {%- if col.type is defined -%} - {%- if col.type|lower == 'float' -%} - {%- set source_col = 'cast(' + name + ' as FLOAT64)' -%} - {%- elif col.type|lower == 'number' -%} - {%- set source_col = 'cast(' + name + ' as NUMERIC)' -%} - {%- else -%} - {%- set source_col = 'cast(' + name + ' as ' + col.type + ')' -%} - {%- endif -%} - {%- else -%} - {%- set source_col = name -%} - {%- endif -%} - {% set output_col_name = name if col.name is not defined else cleanse_name(col.name) %} - {%- set impute_expression = 'NULL' -%} - {%- if col.impute is not defined -%} - {%- set output = source_col -%} - {%- else -%} - {%- if col.impute | lower in ['mean', 'mode', 'max', 'min', 'sum', 'avg', 'count'] -%} - {%- set impute = 'avg' if col.impute|lower == 'mean' else col.impute -%} - {%- set impute_expression = impute + '(' + source_col + ') over ()' -%} - {%- elif col.impute|lower == 'count_distinct' -%} - {%- set impute_expression = 'count(distinct ' + source_col + ') over ()' -%} - {%- elif col.impute|lower == "median" -%} - {%- if col.type is defined -%} - {%- if col.type|lower == 'float' -%} - {%- set impute_expression = 'cast(PERCENTILE_CONT(' + source_col + ', 0.5) over () as FLOAT64)' -%} - {%- elif col.type|lower == 'number' -%} - {%- set impute_expression = 'cast(PERCENTILE_CONT(' + source_col + ', 0.5) over () as NUMERIC)' -%} - {%- else -%} - {%- set impute_expression = 'cast(PERCENTILE_CONT(' + source_col + ', 0.5) over () as ' + col.type + ')' -%} - {%- endif -%} - {%- else -%} - {%- set impute_expression = 'PERCENTILE_CONT(' + source_col + ', 0.5) over ()' -%} - {%- endif -%} - {%- else -%} - {%- set impute_expression = "'" + col.impute + "'" if col.impute is string else col.impute|string -%} - {%- endif -%} - {%- set output = 'coalesce(' + source_col + ', ' + impute_expression + ')' -%} - {%- endif -%} - {{ output }} as {{ output_col_name }} +{%- if col.type is defined -%} +{%- if 
col.type|lower == 'float' -%} +{%- set source_col = 'cast(' + name + ' as FLOAT64)' -%} +{%- elif col.type|lower == 'number' -%} +{%- set source_col = 'cast(' + name + ' as NUMERIC)' -%} +{%- else -%} {%- set source_col = 'cast(' + name + ' as ' + col.type + ')' -%} +{%- endif -%} +{%- else -%} {%- set source_col = name -%} +{%- endif -%} +{% set output_col_name = name if col.name is not defined else cleanse_name(col.name) %} +{%- set impute_expression = 'NULL' -%} +{%- if col.impute is not defined -%} {%- set output = source_col -%} +{%- else -%} +{%- if col.impute | lower in ['mean', 'mode', 'max', 'min', 'sum', 'avg', 'count'] -%} +{%- set impute = 'avg' if col.impute|lower == 'mean' else col.impute -%} +{%- set impute_expression = impute + '(' + source_col + ') over ()' -%} +{%- elif col.impute|lower == 'count_distinct' -%} +{%- set impute_expression = 'count(distinct ' + source_col + ') over ()' -%} +{%- elif col.impute|lower == "median" -%} +{%- if col.type is defined -%} +{%- if col.type|lower == 'float' -%} +{%- set impute_expression = 'cast(PERCENTILE_CONT(' + source_col + ', 0.5) over () as FLOAT64)' -%} +{%- elif col.type|lower == 'number' -%} +{%- set impute_expression = 'cast(PERCENTILE_CONT(' + source_col + ', 0.5) over () as NUMERIC)' -%} +{%- else -%} +{%- set impute_expression = 'cast(PERCENTILE_CONT(' + source_col + ', 0.5) over () as ' + col.type + ')' -%} +{%- endif -%} +{%- else -%} +{%- set impute_expression = 'PERCENTILE_CONT(' + source_col + ', 0.5) over ()' -%} +{%- endif -%} +{%- else -%} +{%- set impute_expression = "'" + col.impute + "'" if col.impute is string else col.impute|string -%} +{%- endif -%} +{%- set output = 'coalesce(' + source_col + ', ' + impute_expression + ')' -%} +{%- endif -%} +{{ output }} as {{ output_col_name }} {%- endmacro -%} {%- macro get_filter_statement(columns) -%} - {%- set filter_statements = [] -%} - {%- for column in columns.keys() %} - {%- set output_col_name = column if columns[column].name is not 
defined else cleanse_name(columns[column].name) -%} - {%- if columns[column].filter is defined -%} - {{ filter_statements.append(output_col_name + ' ' + columns[column].filter) or ""}} - {%- endif -%} - {%- endfor -%} - {%- for filter_statement in filter_statements -%} - {{ "where " if loop.first else "" }}{{ filter_statement }}{{ " \n and " if not loop.last else "" }} - {%- endfor %} +{%- set filter_statements = [] -%} +{%- for column in columns.keys() %} +{%- set output_col_name = column if columns[column].name is not defined else cleanse_name(columns[column].name) -%} +{%- if columns[column].filter is defined -%} +{{ filter_statements.append(output_col_name + ' ' + columns[column].filter) or "" }} +{%- endif -%} +{%- endfor -%} +{%- for filter_statement in filter_statements -%} +{{ "where " if loop.first else "" }}{{ filter_statement }}{{ " \n and " if not loop.last else "" }} +{%- endfor %} {%- endmacro -%} {%- set filter_statement = get_filter_statement(columns) -%} {%- if filter_statement == "" -%} select -{%- for column in columns.keys() %} - {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} -{%- endfor %} + {%- for column in columns.keys() %} + {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} + {%- endfor %} from {{ source_table }} {%- else -%} -with cleaned as ( - select - {%- for column in columns.keys() %} - {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} - {%- endfor %} - from {{ source_table }} -) select * from cleaned -{{ filter_statement }} +with + cleaned as ( + select + {%- for column in columns.keys() %} + {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} + {%- endfor %} + from {{ source_table }} + ) +select * +from cleaned {{ filter_statement }} {%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/clean/clean.sql b/rasgotransforms/rasgotransforms/transforms/clean/clean.sql index 
346f971a..c1c87afa 100644 --- a/rasgotransforms/rasgotransforms/transforms/clean/clean.sql +++ b/rasgotransforms/rasgotransforms/transforms/clean/clean.sql @@ -1,67 +1,64 @@ {%- set source_col_names = get_columns(source_table) -%} {%- for column_name in source_col_names.keys() -%} - {%- if column_name not in columns.keys() -%} - {%- do columns.__setitem__(column_name, {}) -%} - {%- endif -%} +{%- if column_name not in columns.keys() -%} +{%- do columns.__setitem__(column_name, {}) -%} +{%- endif -%} {%- endfor -%} {%- set drop_cols = [] -%} {%- for column in columns.keys() -%} - {%- if columns[column].drop -%} - {{drop_cols.append(column) or ""}} - {%- endif -%} -{%- endfor -%} -{%- for col in drop_cols -%} - {%- set _x = columns.pop(col) -%} +{%- if columns[column].drop -%} {{ drop_cols.append(column) or "" }} {%- endif -%} {%- endfor -%} +{%- for col in drop_cols -%} {%- set _x = columns.pop(col) -%} {%- endfor -%} {%- macro get_select_column(name, col) -%} - {%- set source_col = 'cast(' + name + ' as ' + col.type + ')' if col.type is defined else name-%} - {% set output_col_name = name if col.name is not defined else cleanse_name(col.name) %} - {%- set impute_expression = 'NULL' -%} - {%- if col.impute is not defined -%} - {%- set output = source_col -%} - {%- else -%} - {%- if col.impute | lower in ['mean', 'median', 'mode', 'max', 'min', 'sum', 'avg', 'count'] -%} - {%- set impute = 'avg' if col.impute == 'mean' else col.impute -%} - {%- set impute_expression = impute + '(' + source_col + ') over ()' -%} - {%- elif col.impute|lower == 'count_distinct' -%} - {%- set impute_expression = 'count(distinct ' + source_col + ') over ()' -%} - {%- else -%} - {%- set impute_expression = "'" + col.impute + "'" if col.impute is string else col.impute|string -%} - {%- endif -%} - {%- set output = 'coalesce(' + source_col + ', ' + impute_expression + ')' -%} - {%- endif -%} - {{ output }} as {{ output_col_name }} +{%- set source_col = 'cast(' + name + ' as ' + col.type + 
')' if col.type is defined else name -%} +{% set output_col_name = name if col.name is not defined else cleanse_name(col.name) %} +{%- set impute_expression = 'NULL' -%} +{%- if col.impute is not defined -%} {%- set output = source_col -%} +{%- else -%} +{%- if col.impute | lower in ['mean', 'median', 'mode', 'max', 'min', 'sum', 'avg', 'count'] -%} +{%- set impute = 'avg' if col.impute == 'mean' else col.impute -%} +{%- set impute_expression = impute + '(' + source_col + ') over ()' -%} +{%- elif col.impute|lower == 'count_distinct' -%} +{%- set impute_expression = 'count(distinct ' + source_col + ') over ()' -%} +{%- else -%} +{%- set impute_expression = "'" + col.impute + "'" if col.impute is string else col.impute|string -%} +{%- endif -%} +{%- set output = 'coalesce(' + source_col + ', ' + impute_expression + ')' -%} +{%- endif -%} +{{ output }} as {{ output_col_name }} {%- endmacro -%} {%- macro get_filter_statement(columns) -%} - {%- set filter_statements = [] -%} - {%- for column in columns.keys() %} - {%- set output_col_name = column if columns[column].name is not defined else cleanse_name(columns[column].name) -%} - {%- if columns[column].filter is defined -%} - {{ filter_statements.append(output_col_name + ' ' + columns[column].filter) or ""}} - {%- endif -%} - {%- endfor -%} - {%- for filter_statement in filter_statements -%} - {{ "where " if loop.first else "" }}{{ filter_statement }}{{ " \n and " if not loop.last else "" }} - {%- endfor %} +{%- set filter_statements = [] -%} +{%- for column in columns.keys() %} +{%- set output_col_name = column if columns[column].name is not defined else cleanse_name(columns[column].name) -%} +{%- if columns[column].filter is defined -%} +{{ filter_statements.append(output_col_name + ' ' + columns[column].filter) or "" }} +{%- endif -%} +{%- endfor -%} +{%- for filter_statement in filter_statements -%} +{{ "where " if loop.first else "" }}{{ filter_statement }}{{ " \n and " if not loop.last else "" }} +{%- endfor %} 
{%- endmacro -%} {%- set filter_statement = get_filter_statement(columns) -%} {%- if filter_statement == "" -%} select -{%- for column in columns.keys() %} - {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} -{%- endfor %} + {%- for column in columns.keys() %} + {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} + {%- endfor %} from {{ source_table }} {%- else -%} -with cleaned as ( - select - {%- for column in columns.keys() %} - {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} - {%- endfor %} - from {{ source_table }} -) select * from cleaned -{{ filter_statement }} +with + cleaned as ( + select + {%- for column in columns.keys() %} + {{ get_select_column(column, columns[column]) }}{{ ", " if not loop.last else "" }} + {%- endfor %} + from {{ source_table }} + ) +select * +from cleaned {{ filter_statement }} {%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/concat/concat.sql b/rasgotransforms/rasgotransforms/transforms/concat/concat.sql index 11764e11..24d6e527 100644 --- a/rasgotransforms/rasgotransforms/transforms/concat/concat.sql +++ b/rasgotransforms/rasgotransforms/transforms/concat/concat.sql @@ -1,12 +1,11 @@ {%- set untouched_cols = get_columns(source_table)|list|reject('in', concat_list)|join(',') if overwrite_columns else "*" -%} {%- set alias = cleanse_name(alias) if alias is defined else 'CONCAT_' ~ cleanse_name(concat_list | join('_')) -%} - - -SELECT {{ untouched_cols }}, - CONCAT( +select + {{ untouched_cols }}, + concat( {%- for obj in concat_list %} - {{obj}}{{ ", " if not loop.last else "" }} + {{ obj }}{{ ", " if not loop.last else "" }} {%- endfor %} - ) AS {{ alias }} -FROM {{ source_table }} \ No newline at end of file + ) as {{ alias }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql 
b/rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql index 814ddc65..eaf09a9d 100644 --- a/rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql +++ b/rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql @@ -1,22 +1,19 @@ -{%- if distinct -%} - {%- set agg_thing = 'DISTINCT '~agg_column -%} -{%- else -%} - {%- set agg_thing = agg_column -%} -{%- endif -%} -{%- set rule_combos = [] -%} -{%- for r in rules -%} - {%- if loop.first -%} - {%- set rule_combos = rule_combos.append(r) -%} - {%- else -%} - {%- set new_rule = rule_combos[loop.index-2] ~ ' AND ' ~ r -%} - {%- set rule_combos = rule_combos.append(new_rule) -%} - {%- endif -%} -{%- endfor -%} -{%- for rule in rule_combos -%} -SELECT '{{ rule|replace("'","") }}' AS rule_desc, {{ agg }}({{ agg_thing }}) as QTY -FROM {{ source_table }} -WHERE {{ rule }} +{% if distinct %} {% set agg_thing = 'DISTINCT '~agg_column %} +{% else %} {% set agg_thing = agg_column %} +{% endif %} +{% set rule_combos = [] %} +{% for r in rules %} +{% if loop.first %} {% set rule_combos = rule_combos.append(r) %} +{% else %} +{% set new_rule = rule_combos[loop.index - 2] ~ ' AND ' ~ r %} +{% set rule_combos = rule_combos.append(new_rule) %} +{% endif %} +{% endfor %} +{% for rule in rule_combos %} +select '{{ rule|replace("\'","") }}' as rule_desc, {{ agg }} ({{ agg_thing }}) as qty +from {{ source_table }} +where {{ rule }} {% if not loop.last %} -UNION ALL +union all {% endif %} -{%- endfor -%} \ No newline at end of file +{% endfor %} diff --git a/rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql b/rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql index 6be9ba4e..788e1c3a 100644 --- a/rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql +++ b/rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql @@ -3,23 +3,33 @@ {%- set column_list = [] -%} {%- for key, value in 
names_types_list.items() -%} - {% if (value|upper == 'NUMBER' or 'FLOAT' in value|upper or 'INT' in value|upper) %} - {%- do column_list.append(key) -%} - {%- endif -%} +{% if (value|upper == 'NUMBER' or 'FLOAT' in value|upper or 'INT' in value|upper) %} +{%- do column_list.append(key) -%} +{%- endif -%} {%- endfor -%} -WITH source_sampled as ( - SELECT * from {{ source_table }} - {% if rows_to_sample is defined %} SAMPLE ({{ rows_to_sample }} ROWS) {% endif -%} -) +with + source_sampled as ( + select * + from + {{ source_table }} + {% if rows_to_sample is defined %} + sample({{ rows_to_sample }} rows) + {% endif -%} + ) -SELECT * FROM ( -{%- for combo in itertools.product(column_list, repeat=2) -%} - SELECT '{{ combo[0] }}' as COLUMN_A, - '{{ combo[1] }}' as COLUMN_B, - CORR({{ combo[0] }}, {{ combo[1] }}) as Correlation - FROM source_sampled - {% if not loop.last %} UNION {% endif -%} -{%- endfor -%} -) -ORDER BY COLUMN_A, COLUMN_B \ No newline at end of file +select * +from + ( + {%- for combo in itertools.product(column_list, repeat=2) -%} + select + '{{ combo[0] }}' as column_a, + '{{ combo[1] }}' as column_b, + corr({{ combo[0] }}, {{ combo[1] }}) as correlation + from source_sampled + {% if not loop.last %} + union + {% endif -%} + {%- endfor -%} + ) +order by column_a, column_b diff --git a/rasgotransforms/rasgotransforms/transforms/cumulative_agg/cumulative_agg.sql b/rasgotransforms/rasgotransforms/transforms/cumulative_agg/cumulative_agg.sql index ef005c4d..36775522 100644 --- a/rasgotransforms/rasgotransforms/transforms/cumulative_agg/cumulative_agg.sql +++ b/rasgotransforms/rasgotransforms/transforms/cumulative_agg/cumulative_agg.sql @@ -1,17 +1,16 @@ -SELECT * -{% for col, aggs in aggregations.items() -%} - {%- for agg in aggs %} - , {{ agg }}({{ col }}) OVER( - {%- if group_by %} - PARTITION BY {{ group_by | join(", ") }} - {% endif -%} - ORDER BY {{ order_by | join(", ") }} - {% if direction and direction.lower() == 'forward' -%} - ROWS BETWEEN 
CURRENT ROW AND UNBOUNDED FOLLOWING - {% else -%} - ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW - {%- endif -%} +select + * + {% for col, aggs in aggregations.items() -%} + {%- for agg in aggs %} + , + {{ agg }} ({{ col }}) over ( + {%- if group_by %}partition by {{ group_by | join(", ") }} {% endif -%} + order by {{ order_by | join(", ") }} + {% if direction and direction.lower() == 'forward' -%} + rows between current row and unbounded following + {% else -%} rows between unbounded preceding and current row + {%- endif -%} ) as {{ cleanse_name(agg + '_' + col) }} - {%- endfor -%} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file + {%- endfor -%} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datarobot_score/datarobot_score.sql b/rasgotransforms/rasgotransforms/transforms/datarobot_score/datarobot_score.sql index ce392e49..4c15f803 100644 --- a/rasgotransforms/rasgotransforms/transforms/datarobot_score/datarobot_score.sql +++ b/rasgotransforms/rasgotransforms/transforms/datarobot_score/datarobot_score.sql @@ -1,16 +1,21 @@ -SELECT {{ include_cols|join(',') }}, +select + {{ include_cols|join(',') }}, -{%- if num_explains is defined and threshold_low is defined and threshold_high is defined -%} - S:score AS PREDICTION + {%- if num_explains is defined and threshold_low is defined and threshold_high is defined -%} + s:score as prediction {%- set function_call = '(OBJECT_CONSTRUCT_KEEP_NULL(*),' ~ num_explains ~ ',' ~ threshold_low ~ ',' ~ threshold_high ~ ')' %} {% for i in range(num_explains) -%} - ,CONCAT(S:explanations[{{ i }}].featureName, '=', S:explanations[{{ i }}].featureValue, ' (', S:explanations[{{ i }}].strength, ')') AS TOP{{ i+1 }}_INFLUENCING_FACTOR + , + concat( + s:explanations[{{ i }}].featurename, + '=', + s:explanations[{{ i }}].featurevalue, + ' (', + s:explanations[{{ i }}].strength, + ')' + ) as top{{ i+1 }}_influencing_factor {% endfor -%} -{%- else -%} - S AS PREDICTION - 
{% set function_call = '(OBJECT_CONSTRUCT_KEEP_NULL(*))' %} -{%- endif %} -FROM ( - SELECT *, - {{ function_name }}{{ function_call }} AS S - FROM {{ source_table }} ) \ No newline at end of file + {%- else -%} + s as prediction {% set function_call = '(OBJECT_CONSTRUCT_KEEP_NULL(*))' %} + {%- endif %} +from (select *, {{ function_name }}{{ function_call }} as s from {{ source_table }}) diff --git a/rasgotransforms/rasgotransforms/transforms/dateadd/bigquery/dateadd.sql b/rasgotransforms/rasgotransforms/transforms/dateadd/bigquery/dateadd.sql index 2a7a6af7..85963907 100644 --- a/rasgotransforms/rasgotransforms/transforms/dateadd/bigquery/dateadd.sql +++ b/rasgotransforms/rasgotransforms/transforms/dateadd/bigquery/dateadd.sql @@ -1,12 +1,14 @@ {%- if overwrite_columns -%} {%- set alias = date -%} {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} -{%- else -%} -{%- set untouched_cols = "*" -%} +{%- else -%} {%- set untouched_cols = "*" -%} {%- endif -%} {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} -SELECT {{ untouched_cols }}, - DATE_ADD({{ date }}, INTERVAL {{ offset }} {{ date_part }}) AS {{ cleanse_name(alias) }} -FROM {{ source_table }} \ No newline at end of file +select + {{ untouched_cols }}, + date_add( + {{ date }}, interval {{ offset }} {{ date_part }} + ) as {{ cleanse_name(alias) }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.sql b/rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.sql index 8cdc8aec..40757b35 100644 --- a/rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.sql +++ b/rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.sql @@ -1,12 +1,12 @@ {%- if overwrite_columns -%} {%- set alias = date -%} {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} -{%- else -%} -{%- set untouched_cols = "*" -%} +{%- else -%} {%- set untouched_cols = "*" 
-%} {%- endif -%} {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} -SELECT {{ untouched_cols }}, - {{ date }} + INTERVAL {{ offset }} {{ date_part }} AS {{ cleanse_name(alias) }} -FROM {{ source_table }} \ No newline at end of file +select + {{ untouched_cols }}, + {{ date }} + interval {{ offset }} {{ date_part }} as {{ cleanse_name(alias) }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/dateadd/postgresql/dateadd.sql b/rasgotransforms/rasgotransforms/transforms/dateadd/postgresql/dateadd.sql index 1b191667..00285320 100644 --- a/rasgotransforms/rasgotransforms/transforms/dateadd/postgresql/dateadd.sql +++ b/rasgotransforms/rasgotransforms/transforms/dateadd/postgresql/dateadd.sql @@ -1,12 +1,12 @@ {%- if overwrite_columns -%} {%- set alias = date -%} {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} -{%- else -%} -{%- set untouched_cols = "*" -%} +{%- else -%} {%- set untouched_cols = "*" -%} {%- endif -%} {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} -SELECT {{ untouched_cols }}, - {{ date }} + INTERVAL '{{ offset }} {{ date_part }}' AS {{ cleanse_name(alias) }} -FROM {{ source_table }} \ No newline at end of file +select + {{ untouched_cols }}, + {{ date }} + interval '{{ offset }} {{ date_part }}' as {{ cleanse_name(alias) }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/dateadd/redshift/dateadd.sql b/rasgotransforms/rasgotransforms/transforms/dateadd/redshift/dateadd.sql index 1b191667..00285320 100644 --- a/rasgotransforms/rasgotransforms/transforms/dateadd/redshift/dateadd.sql +++ b/rasgotransforms/rasgotransforms/transforms/dateadd/redshift/dateadd.sql @@ -1,12 +1,12 @@ {%- if overwrite_columns -%} {%- set alias = date -%} {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} -{%- else -%} -{%- set untouched_cols = "*" 
-%} +{%- else -%} {%- set untouched_cols = "*" -%} {%- endif -%} {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} -SELECT {{ untouched_cols }}, - {{ date }} + INTERVAL '{{ offset }} {{ date_part }}' AS {{ cleanse_name(alias) }} -FROM {{ source_table }} \ No newline at end of file +select + {{ untouched_cols }}, + {{ date }} + interval '{{ offset }} {{ date_part }}' as {{ cleanse_name(alias) }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/dateadd/snowflake/dateadd.sql b/rasgotransforms/rasgotransforms/transforms/dateadd/snowflake/dateadd.sql index 063030f6..02db86e7 100644 --- a/rasgotransforms/rasgotransforms/transforms/dateadd/snowflake/dateadd.sql +++ b/rasgotransforms/rasgotransforms/transforms/dateadd/snowflake/dateadd.sql @@ -1,12 +1,12 @@ {%- if overwrite_columns -%} {%- set alias = date -%} {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} -{%- else -%} -{%- set untouched_cols = "*" -%} +{%- else -%} {%- set untouched_cols = "*" -%} {%- endif -%} {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} -SELECT {{ untouched_cols }}, - DATEADD({{ date_part }}, {{ offset }}, {{ date }}) AS {{ cleanse_name(alias) }} -FROM {{ source_table }} \ No newline at end of file +select + {{ untouched_cols }}, + dateadd({{ date_part }}, {{ offset }}, {{ date }}) as {{ cleanse_name(alias) }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datediff/bigquery/datediff.sql b/rasgotransforms/rasgotransforms/transforms/datediff/bigquery/datediff.sql index 5b244c40..2154ce11 100644 --- a/rasgotransforms/rasgotransforms/transforms/datediff/bigquery/datediff.sql +++ b/rasgotransforms/rasgotransforms/transforms/datediff/bigquery/datediff.sql @@ -1,9 +1,6 @@ -{%- if alias is defined -%} - {%- set alias = cleanse_name(alias) -%} -{%- else -%} - {%- set alias = 'DIFF_'~ 
cleanse_name(date_1~'_'~date_2) -%} +{%- if alias is defined -%} {%- set alias = cleanse_name(alias) -%} +{%- else -%} {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} {%- endif -%} -SELECT *, - DATE_DIFF({{ date_1 }}, {{ date_2 }}, {{ date_part }}) as {{ alias }} -FROM {{ source_table }} \ No newline at end of file +select *, date_diff({{ date_1 }}, {{ date_2 }}, {{ date_part }}) as {{ alias }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datediff/datediff.sql b/rasgotransforms/rasgotransforms/transforms/datediff/datediff.sql index c206f0c1..01aac216 100644 --- a/rasgotransforms/rasgotransforms/transforms/datediff/datediff.sql +++ b/rasgotransforms/rasgotransforms/transforms/datediff/datediff.sql @@ -1,9 +1,8 @@ -{%- if alias is defined -%} - {%- set alias = cleanse_name(alias) -%} -{%- else -%} - {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} +{%- if alias is defined -%} {%- set alias = cleanse_name(alias) -%} +{%- else -%} {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} {%- endif -%} -SELECT *, - EXTRACT({{ date_part }} FROM DATE {{ date_1 }} - DATE {{ date_2 }}) AS {{ alias }} -FROM {{ source_table }} \ No newline at end of file +select + *, + extract({{ date_part }} from date {{ date_1 }} - date {{ date_2 }}) as {{ alias }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datediff/snowflake/datediff.sql b/rasgotransforms/rasgotransforms/transforms/datediff/snowflake/datediff.sql index f365c0ec..15e452d3 100644 --- a/rasgotransforms/rasgotransforms/transforms/datediff/snowflake/datediff.sql +++ b/rasgotransforms/rasgotransforms/transforms/datediff/snowflake/datediff.sql @@ -1,9 +1,6 @@ -{%- if alias is defined -%} - {%- set alias = cleanse_name(alias) -%} -{%- else -%} - {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} +{%- if alias is defined -%} {%- set alias = cleanse_name(alias) -%} +{%- else -%} {%- set alias = 'DIFF_'~ 
cleanse_name(date_1~'_'~date_2) -%} {%- endif -%} -SELECT *, - DATEDIFF({{ date_part }}, {{ date_1 }}, {{ date_2 }}) as {{ alias }} -FROM {{ source_table }} +select *, datediff({{ date_part }}, {{ date_1 }}, {{ date_2 }}) as {{ alias }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datepart/bigquery/datepart.sql b/rasgotransforms/rasgotransforms/transforms/datepart/bigquery/datepart.sql index 3e6f7fee..1069f1fd 100644 --- a/rasgotransforms/rasgotransforms/transforms/datepart/bigquery/datepart.sql +++ b/rasgotransforms/rasgotransforms/transforms/datepart/bigquery/datepart.sql @@ -1,15 +1,25 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - {%- if date_part|lower == 'weekiso' %} - EXTRACT(ISOWEEK FROM {{ target_col }}) AS {{ target_col }}_ISOWEEK {{ ", " if not loop.last else "" }} - {%- elif date_part|lower == 'dayofweekiso' %} - MOD(EXTRACT(DAYOFWEEK FROM {{ target_col }}) + 5, 7) + 1 AS {{ target_col }}_ISODAYOFWEEK {{ ", " if not loop.last else "" }} - {%- elif date_part|lower == 'yearofweekiso' %} - EXTRACT(ISOYEAR FROM {{ target_col }}) AS {{ target_col }}_ISOYEAR {{ ", " if not loop.last else "" }} - {%- elif date_part|lower == 'yearofweek' %} - EXTRACT(YEAR FROM {{ target_col }}) AS {{ target_col }}_YEAR {{ ", " if not loop.last else "" }} - {%- else %} - EXTRACT({{ date_part }} FROM {{ target_col }}) AS {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} - {%- endif %} -{%- endfor %} -FROM {{ source_table }} +select + *, + {%- for target_col, date_part in dates.items() %} + {%- if date_part|lower == 'weekiso' %} + extract( + isoweek from {{ target_col }} + ) as {{ target_col }}_isoweek {{ ", " if not loop.last else "" }} + {%- elif date_part|lower == 'dayofweekiso' %} + mod(extract(dayofweek from {{ target_col }}) + 5, 7) + + 1 as {{ target_col }}_isodayofweek {{ ", " if not loop.last else "" }} + {%- elif date_part|lower == 'yearofweekiso' %} + extract( + isoyear from {{ target_col }} + ) 
as {{ target_col }}_isoyear {{ ", " if not loop.last else "" }} + {%- elif date_part|lower == 'yearofweek' %} + extract( + year from {{ target_col }} + ) as {{ target_col }}_year {{ ", " if not loop.last else "" }} + {%- else %} + extract( + {{ date_part }} from {{ target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endif %} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datepart/datepart.sql b/rasgotransforms/rasgotransforms/transforms/datepart/datepart.sql index 888ef77f..29ea9c62 100644 --- a/rasgotransforms/rasgotransforms/transforms/datepart/datepart.sql +++ b/rasgotransforms/rasgotransforms/transforms/datepart/datepart.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - EXTRACT({{date_part}} FROM {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + extract( + {{ date_part }} from {{ target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datepart/postgresql/datepart.sql b/rasgotransforms/rasgotransforms/transforms/datepart/postgresql/datepart.sql index 2474958d..1ad88e27 100644 --- a/rasgotransforms/rasgotransforms/transforms/datepart/postgresql/datepart.sql +++ b/rasgotransforms/rasgotransforms/transforms/datepart/postgresql/datepart.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_part( + '{{date_part}}', {{ target_col }} + ) as {{ target_col }}_{{ date_part }} 
{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datepart/redshift/datepart.sql b/rasgotransforms/rasgotransforms/transforms/datepart/redshift/datepart.sql index 2474958d..1ad88e27 100644 --- a/rasgotransforms/rasgotransforms/transforms/datepart/redshift/datepart.sql +++ b/rasgotransforms/rasgotransforms/transforms/datepart/redshift/datepart.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_part( + '{{date_part}}', {{ target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql b/rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql index 2474958d..1ad88e27 100644 --- a/rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql +++ b/rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_part( + '{{date_part}}', {{ target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datespine/bigquery/datespine.sql b/rasgotransforms/rasgotransforms/transforms/datespine/bigquery/datespine.sql index 89e6f770..0bc94d1e 100644 --- 
a/rasgotransforms/rasgotransforms/transforms/datespine/bigquery/datespine.sql +++ b/rasgotransforms/rasgotransforms/transforms/datespine/bigquery/datespine.sql @@ -4,36 +4,39 @@ select min(cast({{ date_col }} as date)) min_date, max(cast({{ date_col }} as da {% endset -%} {% set min_max_query_result = run_query(min_max_query) -%} {% if min_max_query_result is none -%} -{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available')}} +{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available') }} {% endif -%} {% endif -%} {% if start_timestamp is defined -%} - {% set min_date = (start_timestamp|todatetime).date() -%} +{% set min_date = (start_timestamp|todatetime).date() -%} {% else -%} - {% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} +{% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} {% endif -%} -{% if end_timestamp is defined -%} - {% set max_date = (end_timestamp|todatetime).date() -%} +{% if end_timestamp is defined -%} +{% set max_date = (end_timestamp|todatetime).date() -%} {% else -%} - {% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} +{% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} {% endif -%} -with calendar as ( - select - date_day, - date_trunc(date_day, week) as date_week, - date_trunc(date_day, month) as date_month, - date_trunc(date_day, quarter) as date_quarter, - date_trunc(date_day, year) as date_year - from unnest(generate_date_array('{{ min_date }}', '{{ max_date }}')) as date_day -), -spine as ( - select distinct date_{{ interval_type }} as period - from calendar -) +with + calendar as ( + select + date_day, + date_trunc(date_day, week) as date_week, + date_trunc(date_day, month) as date_month, + date_trunc(date_day, quarter) as date_quarter, + date_trunc(date_day, year) as date_year + from 
unnest(generate_date_array('{{ min_date }}', '{{ max_date }}')) as date_day + ), + spine as (select distinct date_{{ interval_type }} as period from calendar) select - cast(spine.period as timestamp) as {{ date_col }}_SPINE_START, - timestamp_add(cast(date_add(spine.period, INTERVAL 1 {{ interval_type }}) as timestamp), INTERVAL -1 second) as {{ date_col }}_SPINE_END, + cast(spine.period as timestamp) as {{ date_col }}_spine_start, + timestamp_add( + cast(date_add(spine.period, interval 1 {{ interval_type }}) as timestamp), + interval -1 second + ) as {{ date_col }}_spine_end, st.* from spine -left outer join {{ source_table }} st on - cast(date_trunc(cast(st.{{ date_col }} as date), {{ interval_type }}) as date) = spine.period +left outer join + {{ source_table }} st + on cast(date_trunc(cast(st.{{ date_col }} as date), {{ interval_type }}) as date) + = spine.period diff --git a/rasgotransforms/rasgotransforms/transforms/datespine/snowflake/datespine.sql b/rasgotransforms/rasgotransforms/transforms/datespine/snowflake/datespine.sql index 14abbc3c..93c6ed7e 100644 --- a/rasgotransforms/rasgotransforms/transforms/datespine/snowflake/datespine.sql +++ b/rasgotransforms/rasgotransforms/transforms/datespine/snowflake/datespine.sql @@ -4,50 +4,59 @@ select min(cast({{ date_col }} as date)) min_date, max(cast({{ date_col }} as da {% endset -%} {% set min_max_query_result = run_query(min_max_query) -%} {% if min_max_query_result is none -%} -{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available')}} +{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available') }} {% endif -%} {% endif -%} -{% if start_timestamp is defined -%} - {% set min_date = start_timestamp -%} +{% if start_timestamp is defined -%} {% set min_date = start_timestamp -%} {% else -%} - {% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} +{% set min_date = 
min_max_query_result[min_max_query_result.columns[0]][0] -%} {% endif -%} -{% if end_timestamp is defined -%} - {% set max_date = end_timestamp -%} +{% if end_timestamp is defined -%} {% set max_date = end_timestamp -%} {% else -%} - {% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} +{% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} {% endif -%} {% set num_days = (max_date|string|todatetime - min_date|string|todatetime).days + 1 -%} -with calendar as ( - select - row_number() over (order by null) as interval_id, - cast(dateadd( - 'day', - interval_id-1, - '{{ min_date }}'::timestamp_ntz) as date) as date_day, - cast(date_trunc('week', date_day) as date) as date_week, - cast(date_trunc('month', date_day) as date) as date_month, - case - when month(date_day) in (1, 2, 3) then date_from_parts(year(date_day), 1, 1) - when month(date_day) in (4, 5, 6) then date_from_parts(year(date_day), 4, 1) - when month(date_day) in (7, 8, 9) then date_from_parts(year(date_day), 7, 1) - when month(date_day) in (10, 11, 12) then date_from_parts(year(date_day), 10, 1) - end as date_quarter, - cast(date_trunc('year', date_day) as date) as date_year - from table (generator(rowcount => {{ num_days }})) -), -spine as ( - select distinct date_{{ interval_type }} as period - from calendar -) +with + calendar as ( + select + row_number() over (order by null) as interval_id, + cast( + dateadd('day', interval_id -1, '{{ min_date }}'::timestamp_ntz) as date + ) as date_day, + cast(date_trunc('week', date_day) as date) as date_week, + cast(date_trunc('month', date_day) as date) as date_month, + case + when month(date_day) in (1, 2, 3) + then date_from_parts(year(date_day), 1, 1) + when month(date_day) in (4, 5, 6) + then date_from_parts(year(date_day), 4, 1) + when month(date_day) in (7, 8, 9) + then date_from_parts(year(date_day), 7, 1) + when month(date_day) in (10, 11, 12) + then date_from_parts(year(date_day), 10, 1) + end as 
date_quarter, + cast(date_trunc('year', date_day) as date) as date_year + from table(generator(rowcount => {{ num_days }})) + ), + spine as (select distinct date_{{ interval_type }} as period from calendar) select - cast(spine.period as timestamp) as {{ date_col }}_SPINE_START, + cast(spine.period as timestamp) as {{ date_col }}_spine_start, {%- if interval_type|lower == 'quarter' %} - dateadd('second', -1, dateadd('month',3, {{ date_col }}_SPINE_START)) as {{ date_col }}_SPINE_END, + dateadd( + 'second', -1, dateadd('month', 3, {{ date_col }}_spine_start) + ) as {{ date_col }}_spine_end, {%- else %} - dateadd('second', -1, dateadd('{{ interval_type }}',1, {{ date_col }}_SPINE_START)) as {{ date_col }}_SPINE_END, + dateadd( + 'second', -1, dateadd('{{ interval_type }}', 1, {{ date_col }}_spine_start) + ) as {{ date_col }}_spine_end, {%- endif %} {{ source_table }}.* from spine -left outer join {{ source_table }} on - cast(date_trunc('{{ interval_type }}', cast({{ source_table}}.{{ date_col }} as date)) as date) = spine.period \ No newline at end of file +left outer join + {{ source_table }} + on cast( + date_trunc( + '{{ interval_type }}', cast({{ source_table }}.{{ date_col }} as date) + ) as date + ) + = spine.period diff --git a/rasgotransforms/rasgotransforms/transforms/datespine_groups/bigquery/datespine_groups.sql b/rasgotransforms/rasgotransforms/transforms/datespine_groups/bigquery/datespine_groups.sql index 7b52db71..7c6471de 100644 --- a/rasgotransforms/rasgotransforms/transforms/datespine_groups/bigquery/datespine_groups.sql +++ b/rasgotransforms/rasgotransforms/transforms/datespine_groups/bigquery/datespine_groups.sql @@ -4,78 +4,72 @@ select min(cast({{ date_col }} as date)) min_date, max(cast({{ date_col }} as da {% endset -%} {% set min_max_query_result = run_query(min_max_query) -%} {% if min_max_query_result is none -%} -{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available')}} +{{ 
raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available') }} {% endif -%} {% endif -%} -{% if start_timestamp is defined -%} - {% set min_date = start_timestamp -%} +{% if start_timestamp is defined -%} {% set min_date = start_timestamp -%} {% else -%} - {% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} +{% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} {% endif -%} -{% if end_timestamp is defined -%} - {% set max_date = end_timestamp -%} +{% if end_timestamp is defined -%} {% set max_date = end_timestamp -%} {% else -%} - {% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} +{% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} {% endif -%} {% set row_count = (max_date|string|todatetime - min_date|string|todatetime).days + 1 -%} -with calendar as ( - select - date_day, - date_trunc(date_day, week) as date_week, - date_trunc(date_day, month) as date_month, - date_trunc(date_day, quarter) as date_quarter, - date_trunc(date_day, year) as date_year, - from unnest(generate_date_array('{{ min_date }}', '{{ max_date }}')) as date_day -), -GLOBAL_SPINE AS ( - select - distinct date_{{ interval_type }} as SPINE_START, - date_add(date_{{ interval_type }}, INTERVAL 1 {{ interval_type }}) SPINE_END, - from calendar -), -CATEGORIES AS ( - SELECT - {% for col in group_by -%} - {{ col }}, - {%- endfor %} - MIN({{ date_col }}) AS LOCAL_START, - MAX({{ date_col }}) AS LOCAL_END - FROM {{ source_table }} - GROUP BY - {% for col in group_by -%} - {{ col }}{{ ', ' if not loop.last else ' ' }} - {%- endfor %} -), -GROUP_SPINE AS ( - SELECT - {% for col in group_by -%} - {{ col }}, - {%- endfor %} - SPINE_START AS GROUP_START, - SPINE_END AS GROUP_END - FROM CATEGORIES G - CROSS JOIN ( - SELECT - SPINE_START, SPINE_END - FROM GLOBAL_SPINE S - {% if group_bounds == 'local' %} - WHERE S.SPINE_START BETWEEN G.LOCAL_START AND 
G.LOCAL_END - {% elif group_bounds == 'mixed' %} - WHERE S.SPINE_START >= G.LOCAL_START - {% endif %} - ) -) +with + calendar as ( + select + date_day, + date_trunc(date_day, week) as date_week, + date_trunc(date_day, month) as date_month, + date_trunc(date_day, quarter) as date_quarter, + date_trunc(date_day, year) as date_year, + from unnest(generate_date_array('{{ min_date }}', '{{ max_date }}')) as date_day + ), + global_spine as ( + select distinct + date_{{ interval_type }} as spine_start, + date_add( + date_{{ interval_type }}, interval 1 {{ interval_type }} + ) spine_end, + from calendar + ), + categories as ( + select + {% for col in group_by -%} {{ col }}, {%- endfor %} + min({{ date_col }}) as local_start, + max({{ date_col }}) as local_end + from {{ source_table }} + group by + {% for col in group_by -%} + {{ col }}{{ ', ' if not loop.last else ' ' }} + {%- endfor %} + ), + group_spine as ( + select + {% for col in group_by -%} {{ col }}, {%- endfor %} + spine_start as group_start, + spine_end as group_end + from categories g + cross join + ( + select spine_start, spine_end + from global_spine s + {% if group_bounds == 'local' %} + where s.spine_start between g.local_start and g.local_end + {% elif group_bounds == 'mixed' %} where s.spine_start >= g.local_start + {% endif %} + ) + ) -SELECT - {% for col in group_by -%} - G.{{ col }} AS GROUP_BY_{{ col }}, - {%- endfor %} - GROUP_START, - GROUP_END, - T.* -FROM GROUP_SPINE G -LEFT JOIN {{ source_table }} T - ON {{ date_col }} >= G.GROUP_START - AND {{ date_col }} < G.GROUP_END - {% for col in group_by %} AND G.{{ col }} = T.{{ col }} - {%- endfor %} +select + {% for col in group_by -%} g.{{ col }} as group_by_{{ col }}, {%- endfor %} + group_start, + group_end, + t.* +from group_spine g +left join + {{ source_table }} t + on {{ date_col }} >= g.group_start + and {{ date_col }} < g.group_end + {% for col in group_by %} and g.{{ col }} = t.{{ col }} {%- endfor %} diff --git 
a/rasgotransforms/rasgotransforms/transforms/datespine_groups/snowflake/datespine_groups.sql b/rasgotransforms/rasgotransforms/transforms/datespine_groups/snowflake/datespine_groups.sql index 2f707162..d437d48d 100644 --- a/rasgotransforms/rasgotransforms/transforms/datespine_groups/snowflake/datespine_groups.sql +++ b/rasgotransforms/rasgotransforms/transforms/datespine_groups/snowflake/datespine_groups.sql @@ -4,71 +4,69 @@ select min(cast({{ date_col }} as date)) min_date, max(cast({{ date_col }} as da {% endset -%} {% set min_max_query_result = run_query(min_max_query) -%} {% if min_max_query_result is none -%} -{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available')}} +{{ raise_exception('start_timstamp and end_timestamp must be provided when no Data Warehouse connection is available') }} {% endif -%} {% endif -%} -{% if start_timestamp is defined -%} - {% set min_date = start_timestamp -%} +{% if start_timestamp is defined -%} {% set min_date = start_timestamp -%} {% else -%} - {% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} +{% set min_date = min_max_query_result[min_max_query_result.columns[0]][0] -%} {% endif -%} -{% if end_timestamp is defined -%} - {% set max_date = end_timestamp -%} +{% if end_timestamp is defined -%} {% set max_date = end_timestamp -%} {% else -%} - {% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} +{% set max_date = min_max_query_result[min_max_query_result.columns[1]][0] -%} {% endif -%} {% set row_count = (max_date|string|todatetime - min_date|string|todatetime).days + 1 -%} -WITH GLOBAL_SPINE AS ( - SELECT - ROW_NUMBER() OVER (ORDER BY NULL) as INTERVAL_ID, - DATEADD('{{ interval_type }}', (INTERVAL_ID - 1), '{{ min_date }}'::timestamp_ntz) as SPINE_START, - DATEADD('{{ interval_type }}', INTERVAL_ID, '{{ min_date }}'::timestamp_ntz) as SPINE_END - FROM TABLE (GENERATOR(ROWCOUNT => {{ row_count }})) -), -GROUPS 
AS ( - SELECT - {% for col in group_by -%} - {{ col }}, - {%- endfor %} - MIN({{ date_col }}) AS LOCAL_START, - MAX({{ date_col }}) AS LOCAL_END - FROM {{ source_table }} - GROUP BY - {% for col in group_by -%} - {{ col }}{{ ', ' if not loop.last else ' ' }} - {%- endfor %} -), -GROUP_SPINE AS ( - SELECT - {% for col in group_by -%} - {{ col }}, - {%- endfor %} - SPINE_START AS GROUP_START, - SPINE_END AS GROUP_END - FROM GROUPS G - CROSS JOIN LATERAL ( - SELECT - SPINE_START, SPINE_END - FROM GLOBAL_SPINE S - {% if group_bounds == 'local' %} - WHERE S.SPINE_START BETWEEN G.LOCAL_START AND G.LOCAL_END - {% elif group_bounds == 'mixed' %} - WHERE S.SPINE_START >= G.LOCAL_START - {% endif %} - ) -) +with + global_spine as ( + select + row_number() over (order by null) as interval_id, + dateadd( + '{{ interval_type }}', + (interval_id - 1), + '{{ min_date }}'::timestamp_ntz + ) as spine_start, + dateadd( + '{{ interval_type }}', interval_id, '{{ min_date }}'::timestamp_ntz + ) as spine_end + from table(generator(rowcount => {{ row_count }})) + ), + groups as ( + select + {% for col in group_by -%} {{ col }}, {%- endfor %} + min({{ date_col }}) as local_start, + max({{ date_col }}) as local_end + from {{ source_table }} + group by + {% for col in group_by -%} + {{ col }}{{ ', ' if not loop.last else ' ' }} + {%- endfor %} + ), + group_spine as ( + select + {% for col in group_by -%} {{ col }}, {%- endfor %} + spine_start as group_start, + spine_end as group_end + from groups g + cross join + lateral( + select spine_start, spine_end + from global_spine s + {% if group_bounds == 'local' %} + where s.spine_start between g.local_start and g.local_end + {% elif group_bounds == 'mixed' %} where s.spine_start >= g.local_start + {% endif %} + ) + ) -SELECT - {% for col in group_by -%} - G.{{ col }} AS GROUP_BY_{{ col }}, - {%- endfor %} - GROUP_START, - GROUP_END, - T.* -FROM GROUP_SPINE G -LEFT JOIN {{ source_table }} T - ON {{ date_col }} >= G.GROUP_START - AND {{ date_col 
}} < G.GROUP_END - {% for col in group_by %} AND G.{{ col }} = T.{{ col }} - {%- endfor %} +select + {% for col in group_by -%} g.{{ col }} as group_by_{{ col }}, {%- endfor %} + group_start, + group_end, + t.* +from group_spine g +left join + {{ source_table }} t + on {{ date_col }} >= g.group_start + and {{ date_col }} < g.group_end + {% for col in group_by %} and g.{{ col }} = t.{{ col }} {%- endfor %} diff --git a/rasgotransforms/rasgotransforms/transforms/datetrunc/bigquery/datetrunc.sql b/rasgotransforms/rasgotransforms/transforms/datetrunc/bigquery/datetrunc.sql index 5cec7e37..83fd14ad 100644 --- a/rasgotransforms/rasgotransforms/transforms/datetrunc/bigquery/datetrunc.sql +++ b/rasgotransforms/rasgotransforms/transforms/datetrunc/bigquery/datetrunc.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_TRUNC({{target_col}}, {{date_part}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_trunc( + {{ target_col }}, {{ date_part }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datetrunc/postgresql/datetrunc.sql b/rasgotransforms/rasgotransforms/transforms/datetrunc/postgresql/datetrunc.sql index ad99ae2b..676f0a21 100644 --- a/rasgotransforms/rasgotransforms/transforms/datetrunc/postgresql/datetrunc.sql +++ b/rasgotransforms/rasgotransforms/transforms/datetrunc/postgresql/datetrunc.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_trunc( + {{ date_part }}, {{ 
target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datetrunc/redshift/datetrunc.sql b/rasgotransforms/rasgotransforms/transforms/datetrunc/redshift/datetrunc.sql index ad99ae2b..676f0a21 100644 --- a/rasgotransforms/rasgotransforms/transforms/datetrunc/redshift/datetrunc.sql +++ b/rasgotransforms/rasgotransforms/transforms/datetrunc/redshift/datetrunc.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_trunc( + {{ date_part }}, {{ target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/datetrunc/snowflake/datetrunc.sql b/rasgotransforms/rasgotransforms/transforms/datetrunc/snowflake/datetrunc.sql index ad99ae2b..676f0a21 100644 --- a/rasgotransforms/rasgotransforms/transforms/datetrunc/snowflake/datetrunc.sql +++ b/rasgotransforms/rasgotransforms/transforms/datetrunc/snowflake/datetrunc.sql @@ -1,5 +1,8 @@ -SELECT *, -{%- for target_col, date_part in dates.items() %} - DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- for target_col, date_part in dates.items() %} + date_trunc( + {{ date_part }}, {{ target_col }} + ) as {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/describe/bigquery/describe.sql 
b/rasgotransforms/rasgotransforms/transforms/describe/bigquery/describe.sql index be07e849..b7fb909c 100644 --- a/rasgotransforms/rasgotransforms/transforms/describe/bigquery/describe.sql +++ b/rasgotransforms/rasgotransforms/transforms/describe/bigquery/describe.sql @@ -2,41 +2,41 @@ {%- for key, value in names_types_list.items() -%} - {% if (value == 'NUMBER' or 'FLOAT' in value or 'INT' in value) %} - SELECT - '{{ key }}' AS FEATURE - ,'{{ value }}' AS DTYPE - ,COUNT(COL) as COUNT - ,SUM(CASE WHEN COL IS NULL THEN 1 ELSE 0 END) AS NULL_COUNT - ,COUNT(DISTINCT COL) AS UNIQUE_COUNT - ,NULL as MOST_FREQUENT - ,AVG(COL) AS MEAN - ,STDDEV(COL) as STD_DEV - ,CAST(MIN(COL) AS STRING) AS MIN - ,percentile_cont(0.25) within group (order by COL) as _25_PERCENTILE - ,percentile_cont(0.5) within group (order by COL) as _50_PERCENTILE - ,percentile_cont(0.75) within group (order by COL) as _75_PERCENTILE - ,CAST(MAX(COL) AS STRING) AS MAX - FROM - (SELECT {{key}} AS COL FROM {{ source_table }}) - {{"UNION ALL " if not loop.last else ""}} - {% else %} - SELECT - '{{ key }}' AS FEATURE - ,'{{ value }}' AS DTYPE - ,COUNT(COL) as COUNT - ,SUM(CASE WHEN COL IS NULL THEN 1 ELSE 0 END) AS NULL_COUNT - ,COUNT(DISTINCT COL) AS UNIQUE_COUNT - ,NULL as MOST_FREQUENT - ,NULL AS MEAN - ,NULL as STD_DEV - ,CAST(MIN(COL) AS STRING) AS MIN - ,NULL as _25_PERCENTILE - ,NULL as _50_PERCENTILE - ,NULL as _75_PERCENTILE - ,CAST(MAX(COL) AS STRING) AS MAX - FROM - (SELECT {{key}} AS COL FROM {{ source_table }}) - {{"UNION ALL " if not loop.last else ""}} - {% endif -%} -{%- endfor -%} \ No newline at end of file +{% if (value == 'NUMBER' or 'FLOAT' in value or 'INT' in value) %} +select + '{{ key }}' as feature, + '{{ value }}' as dtype, + count(col) as count, + sum(case when col is null then 1 else 0 end) as null_count, + count(distinct col) as unique_count, + null as most_frequent, + avg(col) as mean, + stddev(col) as std_dev, + cast(min(col) as string) as min, + percentile_cont(0.25) within 
group (order by col) as _25_percentile, + percentile_cont(0.5) within group (order by col) as _50_percentile, + percentile_cont(0.75) within group (order by col) as _75_percentile, + cast(max(col) as string) as max +from + (select {{ key }} as col from {{ source_table }}) + {{ "UNION ALL " if not loop.last else "" }} +{% else %} +select + '{{ key }}' as feature, + '{{ value }}' as dtype, + count(col) as count, + sum(case when col is null then 1 else 0 end) as null_count, + count(distinct col) as unique_count, + null as most_frequent, + null as mean, + null as std_dev, + cast(min(col) as string) as min, + null as _25_percentile, + null as _50_percentile, + null as _75_percentile, + cast(max(col) as string) as max +from + (select {{ key }} as col from {{ source_table }}) + {{ "UNION ALL " if not loop.last else "" }} +{% endif -%} +{%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/describe/snowflake/describe.sql b/rasgotransforms/rasgotransforms/transforms/describe/snowflake/describe.sql index 3672515e..37fccc4e 100644 --- a/rasgotransforms/rasgotransforms/transforms/describe/snowflake/describe.sql +++ b/rasgotransforms/rasgotransforms/transforms/describe/snowflake/describe.sql @@ -2,41 +2,41 @@ {%- for key, value in names_types_list.items() -%} - {% if (value == 'NUMBER' or 'FLOAT' in value or 'INT' in value) %} - SELECT - '{{ key }}' AS FEATURE - ,'{{ value }}' AS DTYPE - ,COUNT(COL) as COUNT - ,SUM(CASE WHEN COL IS NULL THEN 1 ELSE 0 END) AS NULL_COUNT - ,COUNT(DISTINCT COL) AS UNIQUE_COUNT - ,MODE(COL)::string as MOST_FREQUENT - ,AVG(COL) AS MEAN - ,STDDEV(COL) as STD_DEV - ,MIN(COL)::string AS MIN - ,percentile_cont(0.25) within group (order by COL) as _25_PERCENTILE - ,percentile_cont(0.5) within group (order by COL) as _50_PERCENTILE - ,percentile_cont(0.75) within group (order by COL) as _75_PERCENTILE - ,MAX(COL)::string AS MAX - FROM - (SELECT {{key}} AS COL FROM {{ source_table }}) - {{"UNION ALL " if not loop.last else ""}} - {% else 
%} - SELECT - '{{ key }}' AS FEATURE - ,'{{ value }}' AS DTYPE - ,COUNT(COL) as COUNT - ,SUM(CASE WHEN COL IS NULL THEN 1 ELSE 0 END) AS NULL_COUNT - ,COUNT(DISTINCT COL) AS UNIQUE_COUNT - ,MODE(COL)::string as MOST_FREQUENT - ,NULL AS MEAN - ,NULL as STD_DEV - ,MIN(COL)::string AS MIN - ,NULL as _25_PERCENTILE - ,NULL as _50_PERCENTILE - ,NULL as _75_PERCENTILE - ,MAX(COL)::string AS MAX - FROM - (SELECT {{key}} AS COL FROM {{ source_table }}) - {{"UNION ALL " if not loop.last else ""}} - {% endif -%} -{%- endfor -%} \ No newline at end of file +{% if (value == 'NUMBER' or 'FLOAT' in value or 'INT' in value) %} +select + '{{ key }}' as feature, + '{{ value }}' as dtype, + count(col) as count, + sum(case when col is null then 1 else 0 end) as null_count, + count(distinct col) as unique_count, + mode(col)::string as most_frequent, + avg(col) as mean, + stddev(col) as std_dev, + min(col)::string as min, + percentile_cont(0.25) within group (order by col) as _25_percentile, + percentile_cont(0.5) within group (order by col) as _50_percentile, + percentile_cont(0.75) within group (order by col) as _75_percentile, + max(col)::string as max +from + (select {{ key }} as col from {{ source_table }}) + {{ "UNION ALL " if not loop.last else "" }} +{% else %} +select + '{{ key }}' as feature, + '{{ value }}' as dtype, + count(col) as count, + sum(case when col is null then 1 else 0 end) as null_count, + count(distinct col) as unique_count, + mode(col)::string as most_frequent, + null as mean, + null as std_dev, + min(col)::string as min, + null as _25_percentile, + null as _50_percentile, + null as _75_percentile, + max(col)::string as max +from + (select {{ key }} as col from {{ source_table }}) + {{ "UNION ALL " if not loop.last else "" }} +{% endif -%} +{%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql b/rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql index faf18e19..3491a79b 100644 --- 
a/rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql +++ b/rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql @@ -9,18 +9,18 @@ {%- set exclude_cols = (exclude_cols|join(',')|upper).split(',') -%} {% set include_cols = [] -%} {% for column_name in source_col_names -%} - {% if column_name.upper() not in exclude_cols -%} - {% do include_cols.append(column_name) -%} - {% endif -%} +{% if column_name.upper() not in exclude_cols -%} +{% do include_cols.append(column_name) -%} +{% endif -%} {% endfor -%} {%- endif -%} {%- if include_cols is defined -%} -SELECT -{%- for col in include_cols %} - {{col}}{{ ", " if not loop.last else " " }} -{%- endfor %} -FROM {{source_table}} +select + {%- for col in include_cols %} + {{ col }}{{ ", " if not loop.last else " " }} + {%- endfor %} +from {{ source_table }} {%- endif -%} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql b/rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql index 90efe92d..b61851a9 100644 --- a/rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql +++ b/rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql @@ -1,36 +1,35 @@ {%- if subset is not defined -%} -{%- set subset = get_columns(source_table) -%} -{%- set source_col_names = subset -%} +{%- set subset = get_columns(source_table) -%} {%- set source_col_names = subset -%} {%- endif -%} -{%- if how is not defined -%} -{%- set how = "any" -%} -{%- endif -%} +{%- if how is not defined -%} {%- set how = "any" -%} {%- endif -%} {%- if how == "any" and thresh is not defined -%} -select * from {{ source_table }} -{%- for col in subset %} -{{ 'where' if loop.first else ' and' }} {{ col }} is not null -{%- endfor -%} +select * +from + {{ source_table }} + {%- for col in subset %} + {{ 'where' if loop.first else ' and' }} {{ col }} is not null + {%- endfor -%} {%- else -%} -{%- if thresh is not defined -%} -{%- set 
thresh = subset|length -%} +{%- if thresh is not defined -%} {%- set thresh = subset|length -%} {%- endif -%} {%- if source_col_names is not defined -%} {%- set source_col_names = get_columns(source_table) -%} {%- endif -%} -with not_null as ( - select *, - {%- for col in subset %} - cast({{ col }} is null as int) {{ "+ " if not loop.last else " " }} - {%- endfor %} - as NUM_IS_NA - from {{ source_table }} - where NUM_IS_NA < {{ thresh }} -) select - {% for col in source_col_names -%} - {{ col }}{{ ", " if not loop.last else " " }} +with + not_null as ( + select + *, + {%- for col in subset %} + cast({{ col }} is null as int) {{ "+ " if not loop.last else " " }} + {%- endfor %} as num_is_na + from {{ source_table }} + where num_is_na < {{ thresh }} + ) +select + {% for col in source_col_names -%} {{ col }}{{ ", " if not loop.last else " " }} {%- endfor %} from not_null -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/encode_values/bigquery/encode_values.sql b/rasgotransforms/rasgotransforms/transforms/encode_values/bigquery/encode_values.sql index 14aac1ac..af43caa6 100644 --- a/rasgotransforms/rasgotransforms/transforms/encode_values/bigquery/encode_values.sql +++ b/rasgotransforms/rasgotransforms/transforms/encode_values/bigquery/encode_values.sql @@ -3,23 +3,19 @@ {%- if method|lower == 'label' -%} -with distinct_values as ( - select - distinct rank() over(order by {{ column }} asc) as id, - {{ column }} - from {{ source_table }} - order by {{ column }} asc -) -select - {% if overwrite_columns -%} - {%- for col in untouched_cols -%} - t.{{col}}, - {%- endfor -%} - {% else -%} - t.*, +with + distinct_values as ( + select distinct rank() over (order by {{ column }} asc) as id, {{ column }} + from {{ source_table }} + order by {{ column }} asc + ) +select + {% if overwrite_columns -%} + {%- for col in untouched_cols -%} t.{{ col }}, {%- endfor -%} + {% else -%} t.*, {%- endif %} (v.id - 1) as {{ alias 
}} -FROM {{ source_table }} t +from {{ source_table }} t left join distinct_values v using ({{ column }}) {%- elif method|lower == 'target' -%} @@ -27,39 +23,40 @@ left join distinct_values v using ({{ column }}) {%- if target is not defined -%} {{ raise_exception("The 'target' parameter must be defined when using the target encoding method") }} {%- endif -%} -with means as ( - select - distinct {{column}} as value, - ROUND(AVG({{target}}), 3) as {{alias}} - from {{ source_table }} - group by value -) -select - {% if overwrite_columns -%} - {%- for col in untouched_cols -%} - t.{{col}}, - {%- endfor %} - {% else -%} - t.*, +with + means as ( + select distinct + {{ column }} as value, round(avg({{ target }}), 3) as {{ alias }} + from {{ source_table }} + group by value + ) +select + {% if overwrite_columns -%} + {%- for col in untouched_cols -%} t.{{ col }}, {%- endfor %} + {% else -%} t.*, {%- endif -%} - m.{{alias}} + m.{{ alias }} from {{ source_table }} t -left join - means m on t.{{column}} = m.value +left join means m on t.{{ column }} = m.value {%- elif method|lower == 'oh' -%} {%- set distinct_col_vals = run_query("SELECT DISTINCT " + column + " FROM " + source_table)[column].to_list() -%} -SELECT {{ untouched_cols|join(',') }}, -{%- for val in distinct_col_vals %} +select + {{ untouched_cols|join(',') }}, + {%- for val in distinct_col_vals %} {%- if val is not none %} - CASE WHEN {{ column }} = {{ "'" ~ val ~ "'"}} THEN 1 ELSE 0 END as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} + case + when {{ column }} = {{ "'" ~ val ~ "'" }} then 1 else 0 + end as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} {%- else %} - CASE WHEN {{ column }} IS NULL THEN 1 ELSE 0 END as {{ column }}_IS_NULL{{ ', ' if not loop.last else '' }} + case + when {{ column }} is null then 1 else 0 + end as {{ column }}_is_null{{ ', ' if not loop.last else '' }} {%- endif -%} -{% endfor %} -FROM {{ source_table }} + {% endfor %} +from 
{{ source_table }} {%- else -%} -{{ raise_exception("Method '" + method + "' is not recognized. Accepted encoding methods are 'label', 'target', and 'oh'")}} -{%- endif -%} \ No newline at end of file +{{ raise_exception("Method '" + method + "' is not recognized. Accepted encoding methods are 'label', 'target', and 'oh'") }} +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/encode_values/snowflake/encode_values.sql b/rasgotransforms/rasgotransforms/transforms/encode_values/snowflake/encode_values.sql index 917a9734..c3d5680f 100644 --- a/rasgotransforms/rasgotransforms/transforms/encode_values/snowflake/encode_values.sql +++ b/rasgotransforms/rasgotransforms/transforms/encode_values/snowflake/encode_values.sql @@ -3,14 +3,18 @@ {%- if method|lower == 'label' -%} -with distinct_values as ( - select - array_agg(distinct {{ column }}) within group (order by {{ column }} asc) as ALL_VALUES_ARRAY - from {{ source_table }} -) -select {{ untouched_cols|join(',') }}, - ALL_VALUES_ARRAY, - array_position({{ column }}::variant,ALL_VALUES_ARRAY) as {{ alias }} +with + distinct_values as ( + select + array_agg(distinct {{ column }}) within group ( + order by {{ column }} asc + ) as all_values_array + from {{ source_table }} + ) +select + {{ untouched_cols|join(',') }}, + all_values_array, + array_position({{ column }}::variant, all_values_array) as {{ alias }} from distinct_values, {{ source_table }} {%- elif method|lower == 'target' -%} @@ -18,39 +22,40 @@ from distinct_values, {{ source_table }} {%- if target is not defined -%} {{ raise_exception("The 'target' parameter must be defined when using the target encoding method") }} {%- endif -%} -with means as ( - select - distinct {{column}} as value, - ROUND(AVG({{target}}), 3) as {{alias}} - from {{ source_table }} - group by value -) -select - {% if overwrite_columns -%} - {%- for col in untouched_cols -%} - t.{{col}}, - {%- endfor %} - {% else -%} - t.*, +with + means as ( + select distinct + {{ column 
}} as value, round(avg({{ target }}), 3) as {{ alias }} + from {{ source_table }} + group by value + ) +select + {% if overwrite_columns -%} + {%- for col in untouched_cols -%} t.{{ col }}, {%- endfor %} + {% else -%} t.*, {%- endif -%} - m.{{alias}} + m.{{ alias }} from {{ source_table }} t -left join - means m on t.{{column}} = m.value +left join means m on t.{{ column }} = m.value {%- elif method|lower == 'oh' -%} {%- set distinct_col_vals = run_query("SELECT DISTINCT " + column + " FROM " + source_table)[column].to_list() -%} -SELECT {{ untouched_cols|join(',') }}, -{%- for val in distinct_col_vals %} +select + {{ untouched_cols|join(',') }}, + {%- for val in distinct_col_vals %} {%- if val is not none %} - CASE WHEN {{ column }} = {{ "'" ~ val ~ "'"}} THEN 1 ELSE 0 END as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} + case + when {{ column }} = {{ "'" ~ val ~ "'" }} then 1 else 0 + end as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} {%- else %} - CASE WHEN {{ column }} IS NULL THEN 1 ELSE 0 END as {{ column }}_IS_NULL{{ ', ' if not loop.last else '' }} + case + when {{ column }} is null then 1 else 0 + end as {{ column }}_is_null{{ ', ' if not loop.last else '' }} {%- endif -%} -{% endfor %} -FROM {{ source_table }} + {% endfor %} +from {{ source_table }} {%- else -%} -{{ raise_exception("Method '" + method + "' is not recognized. Accepted encoding methods are 'label', 'target', and 'oh'")}} -{%- endif -%} \ No newline at end of file +{{ raise_exception("Method '" + method + "' is not recognized. 
Accepted encoding methods are 'label', 'target', and 'oh'") }} +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/entropy/entropy.sql b/rasgotransforms/rasgotransforms/transforms/entropy/entropy.sql index 31ada022..6a92b900 100644 --- a/rasgotransforms/rasgotransforms/transforms/entropy/entropy.sql +++ b/rasgotransforms/rasgotransforms/transforms/entropy/entropy.sql @@ -4,49 +4,43 @@ {%- set final_col_list=[] -%} -WITH -{% for col in columns %} +with + {% for col in columns %} {%- if loop.index == 1 -%} {%- set ns.base_cte = 'CTE_' ~ col ~ '_ENTROPY' -%} {%- endif -%} -CTE_{{ col }} AS ( -SELECT - {{ group_by | join(', ') }}, - {{ col }}, - COUNT(1) AS C -FROM {{ source_table }} -GROUP BY {{ group_by | join(', ') }},{{ col }} -), -CTE_{{ col }}_RATIO AS ( -SELECT - {{ group_by | join(', ') }}, - {{ col }}, - C / SUM(C) OVER (PARTITION BY {{ group_by | join(', ') }}) AS P -FROM CTE_{{ col }} -), -CTE_{{ col }}_ENTROPY AS ( -SELECT - {{ group_by | join(', ') }}, - -SUM(P*LOG(2,P)) AS {{ col }}_ENTROPY -FROM CTE_{{ col }}_RATIO -GROUP BY {{ group_by | join(', ') }} -){{ '' if loop.last else ', ' }} -{%- do final_col_list.append('CTE_' ~ col ~ '_ENTROPY.' ~ col ~ '_ENTROPY') -%} -{%- endfor %} + cte_{{ col }} as ( + select {{ group_by | join(', ') }}, {{ col }}, count(1) as c + from {{ source_table }} + group by {{ group_by | join(', ') }},{{ col }} + ), + cte_{{ col }}_ratio as ( + select + {{ group_by | join(', ') }}, + {{ col }}, + c / sum(c) over (partition by {{ group_by | join(', ') }}) as p + from cte_{{ col }} + ), + cte_{{ col }}_entropy as ( + select {{ group_by | join(', ') }}, - sum(p * log(2, p)) as {{ col }}_entropy + from cte_{{ col }}_ratio + group by {{ group_by | join(', ') }} + ){{ '' if loop.last else ', ' }} + {%- do final_col_list.append('CTE_' ~ col ~ '_ENTROPY.' 
~ col ~ '_ENTROPY') -%} + {%- endfor %} -SELECT -{%- for group_item in group_by %} - {{ ns.base_cte }}.{{ group_item}}, -{%- endfor -%} -{{ final_col_list|join(', ') }} -FROM +select + {%- for group_item in group_by %} {{ ns.base_cte }}.{{ group_item }}, {%- endfor -%} + {{ final_col_list|join(', ') }} +from {% for col in columns %} - {%- if loop.index == 1 -%} -CTE_{{ col }}_ENTROPY - {%- else %} - LEFT OUTER JOIN CTE_{{ col }}_ENTROPY ON - {%- for group_item in group_by %} - {{ ns.base_cte }}.{{ group_item }} = CTE_{{ col }}_ENTROPY.{{ group_item }}{{ '' if loop.last else ' AND ' }} - {%- endfor -%} - {%- endif -%} -{%- endfor -%} \ No newline at end of file + {%- if loop.index == 1 -%} cte_{{ col }}_entropy +{%- else %} +left outer join + cte_{{ col }}_entropy + on {%- for group_item in group_by %} + {{ ns.base_cte }}.{{ group_item }} + = cte_{{ col }}_entropy.{{ group_item }}{{ '' if loop.last else ' AND ' }} + {%- endfor -%} +{%- endif -%} +{%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/extract_sequences/snowflake/extract_sequences.sql b/rasgotransforms/rasgotransforms/transforms/extract_sequences/snowflake/extract_sequences.sql index 2c5cc75e..84a13ff0 100644 --- a/rasgotransforms/rasgotransforms/transforms/extract_sequences/snowflake/extract_sequences.sql +++ b/rasgotransforms/rasgotransforms/transforms/extract_sequences/snowflake/extract_sequences.sql @@ -1,21 +1,27 @@ -WITH CTE_{{ column }} AS ( -select * from {{ source_table }} - match_recognize( - partition by {{ group_by | join(', ') }} - order by {{ order_by }} - measures - match_number() as SEQUENCE_NUMBER, - first({{ order_by }}) as SEQUENCE_START_DATE, - last({{ order_by }}) as SEQUENCE_END_DATE, - count(*) as SEQUENCE_LEN, - count(row_decrease.*) as SEQUENCE_DECREASE_CNT, - count(row_increase.*) as SEQUENCE_INCREASE_CNT - one row per match - after match skip to last row_increase - pattern(FOO row_decrease+ row_increase+) - define - row_decrease AS {{ column }} < lag({{ 
column }}), - row_increase AS {{ column }} > lag({{ column }}) - ) -) -SELECT * FROM CTE_{{ column }} ORDER BY {{ group_by | join(', ') }}, SEQUENCE_NUMBER \ No newline at end of file +with + cte_{{ column }} as ( + select * + from + {{ source_table }} + match_recognize( + partition by {{ group_by | join(', ') }} + order by + {{ order_by }} + measures + match_number() as sequence_number, + first({{ order_by }}) as sequence_start_date, + last({{ order_by }}) as sequence_end_date, + count(*) as sequence_len, + count(row_decrease.*) as sequence_decrease_cnt, + count(row_increase.*) as sequence_increase_cnt + one row per match + after match skip to last row_increase + pattern(foo row_decrease + row_increase +) + define + row_decrease as {{ column }} < lag({{ column }}), + row_increase as {{ column }} > lag({{ column }}) + ) + ) +select * +from cte_{{ column }} +order by {{ group_by | join(', ') }}, sequence_number diff --git a/rasgotransforms/rasgotransforms/transforms/filter/filter.sql b/rasgotransforms/rasgotransforms/transforms/filter/filter.sql index d19834f0..ee6ec86d 100644 --- a/rasgotransforms/rasgotransforms/transforms/filter/filter.sql +++ b/rasgotransforms/rasgotransforms/transforms/filter/filter.sql @@ -1,23 +1,24 @@ {%- if items is not defined -%} - {%- if filter_statements is not defined -%} - {{ raise_exception('items is empty: there are no filters to apply') }} - {%- else -%} - {%- set items = filter_statements -%} - {%- endif -%} +{%- if filter_statements is not defined -%} +{{ raise_exception('items is empty: there are no filters to apply') }} +{%- else -%} {%- set items = filter_statements -%} +{%- endif -%} {%- endif -%} -SELECT * -FROM {{ source_table }} -{% for filter_block in items %} -{%- set oloop = loop -%} -{{ 'WHERE ' if oloop.first else ' AND ' }} -{%- if filter_block is not mapping -%} -{{ filter_block }} -{%- else -%} +select * +from + {{ source_table }} + {% for filter_block in items %} + {%- set oloop = loop -%} + {{ 'WHERE ' if 
oloop.first else ' AND ' }} + {%- if filter_block is not mapping -%} {{ filter_block }} + {%- else -%} {%- if filter_block['operator'] == 'CONTAINS' -%} -{{ filter_block['operator'] }}({{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }}) + {{ filter_block['operator'] }} ( + {{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }} + ) {%- else -%} -{{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} + {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} {%- endif -%} -{%- endif -%} -{%- endfor -%} \ No newline at end of file + {%- endif -%} + {%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql b/rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql index a715cddb..2b72dc2b 100644 --- a/rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql +++ b/rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql @@ -1,7 +1,4 @@ {%- for col_name in stage_columns -%} - SELECT - '{{ col_name }}' AS LABEL - ,SUM({{ col_name }}) AS LABEL_COUNT -FROM {{ source_table }} -{{ "UNION ALL" if not loop.last else "" }} -{% endfor %} \ No newline at end of file +select '{{ col_name }}' as label, sum({{ col_name }}) as label_count +from {{ source_table }} {{ "UNION ALL" if not loop.last else "" }} +{% endfor %} diff --git a/rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.sql b/rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.sql index 63a84f02..ebd776d6 100644 --- a/rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.sql +++ b/rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.sql @@ -1,66 +1,77 @@ -{%- if num_buckets is not defined -%} - {%- set bucket_count = 100 -%} -{%- else -%} - {%- set bucket_count = num_buckets -%} +{%- if num_buckets is not defined -%} {%- set bucket_count = 100 -%} +{%- else -%} {%- set bucket_count = num_buckets -%} {%- endif -%} -WITH 
AXIS_RANGE AS ( - -- Use a user-defined axis column to calculate the min & max of the axis (and buckets on the axis) - SELECT - MIN({{ x_axis }})-1 AS MIN_X_VAL - ,MAX({{ x_axis }})+1 AS MAX_X_VAL - ,MIN({{ y_axis }})-1 AS MIN_Y_VAL - ,MAX({{ y_axis }})+1 AS MAX_Y_VAL - FROM - {{ source_table }} - WHERE - {{ x_axis }} IS NOT NULL -), EDGES AS ( -SELECT MIN_X_VAL, MAX_X_VAL, (MIN_X_VAL-MAX_X_VAL) X_VAL_RANGE, ((MAX_X_VAL-MIN_X_VAL)/{{ bucket_count }}) X_BUCKET_SIZE, - MIN_Y_VAL, MAX_Y_VAL, (MIN_Y_VAL-MAX_Y_VAL) Y_VAL_RANGE, ((MAX_Y_VAL-MIN_Y_VAL)/{{ bucket_count }}) Y_BUCKET_SIZE - FROM AXIS_RANGE -), -BUCKETS AS ( - SELECT - -- Assigns a bucket to each value of each column in user's column list - -- Row count of result set should match the row count of the raw table - MIN_X_VAL - ,MAX_X_VAL - ,X_BUCKET_SIZE - ,MIN_Y_VAL - ,MAX_Y_VAL - ,Y_BUCKET_SIZE - ,CAST({{ x_axis }} AS FLOAT) AS COL_X_VAL - ,WIDTH_BUCKET(COL_X_VAL, MIN_X_VAL, MAX_X_VAL, {{ bucket_count }}) AS COL_X_BUCKET - ,CAST({{ y_axis }} AS FLOAT) AS COL_Y_VAL - ,WIDTH_BUCKET(COL_Y_VAL, MIN_Y_VAL, MAX_Y_VAL, {{ bucket_count }}) AS COL_Y_BUCKET - FROM - {{ source_table }} - CROSS JOIN EDGES - {%- if filters is defined and filters %} - {% for filter_block in filters %} - {%- set oloop = loop -%} - {{ 'WHERE ' if oloop.first else ' AND ' }} - {%- if filter_block is not mapping -%} - {{ filter_block }} - {%- else -%} - {%- if filter_block['operator'] == 'CONTAINS' -%} - {{ filter_block['operator'] }}({{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }}) - {%- else -%} - {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} - {%- endif -%} - {%- endif -%} - {%- endfor -%} - {%- endif -%} -) +with + axis_range as ( + -- Use a user-defined axis column to calculate the min & max of the axis (and + -- buckets on the axis) + select + min({{ x_axis }}) -1 as min_x_val, + max({{ x_axis }}) + 1 as max_x_val, + min({{ y_axis }}) -1 as min_y_val, + max({{ 
y_axis }}) + 1 as max_y_val + from {{ source_table }} + where {{ x_axis }} is not null + ), + edges as ( + select + min_x_val, + max_x_val, + (min_x_val - max_x_val) x_val_range, + ((max_x_val - min_x_val) /{{ bucket_count }}) x_bucket_size, + min_y_val, + max_y_val, + (min_y_val - max_y_val) y_val_range, + ((max_y_val - min_y_val) /{{ bucket_count }}) y_bucket_size + from axis_range + ), + buckets as ( + select + -- Assigns a bucket to each value of each column in user's column list + -- Row count of result set should match the row count of the raw table + min_x_val, + max_x_val, + x_bucket_size, + min_y_val, + max_y_val, + y_bucket_size, + cast({{ x_axis }} as float) as col_x_val, + width_bucket( + col_x_val, min_x_val, max_x_val, {{ bucket_count }} + ) as col_x_bucket, + cast({{ y_axis }} as float) as col_y_val, + width_bucket( + col_y_val, min_y_val, max_y_val, {{ bucket_count }} + ) as col_y_bucket + from {{ source_table }} + cross join + edges + {%- if filters is defined and filters %} + {% for filter_block in filters %} + {%- set oloop = loop -%} + {{ 'WHERE ' if oloop.first else ' AND ' }} + {%- if filter_block is not mapping -%} {{ filter_block }} + {%- else -%} + {%- if filter_block['operator'] == 'CONTAINS' -%} + {{ filter_block['operator'] }} ( + {{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }} + ) + {%- else -%} + {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + ) -- Run final aggregates on the buckets -SELECT - MIN_X_VAL+((COL_X_BUCKET-1)*X_BUCKET_SIZE) AS {{ x_axis }}_MIN - ,MIN_X_VAL+(COL_X_BUCKET*X_BUCKET_SIZE) AS {{ x_axis }}_MAX - ,MIN_Y_VAL+((COL_Y_BUCKET-1)*Y_BUCKET_SIZE) AS {{ y_axis }}_MIN - ,MIN_Y_VAL+(COL_Y_BUCKET*Y_BUCKET_SIZE) AS {{ y_axis }}_MAX - ,COUNT(COL_Y_VAL)+COUNT(COL_X_VAL) as DENSITY +select + min_x_val + ((col_x_bucket -1) * x_bucket_size) as {{ x_axis }}_min, + min_x_val + (col_x_bucket 
* x_bucket_size) as {{ x_axis }}_max, + min_y_val + ((col_y_bucket -1) * y_bucket_size) as {{ y_axis }}_min, + min_y_val + (col_y_bucket * y_bucket_size) as {{ y_axis }}_max, + count(col_y_val) + count(col_x_val) as density -FROM BUCKETS -WHERE {{ x_axis }}_MIN is not NULL and {{ y_axis }}_MIN is not NULL -GROUP BY 1, 2, 3, 4 -ORDER BY 1, 3 \ No newline at end of file +from buckets +where {{ x_axis }}_min is not null and {{ y_axis }}_min is not null +group by 1, 2, 3, 4 +order by 1, 3 diff --git a/rasgotransforms/rasgotransforms/transforms/histogram/histogram.sql b/rasgotransforms/rasgotransforms/transforms/histogram/histogram.sql index ef225060..9f6f86de 100644 --- a/rasgotransforms/rasgotransforms/transforms/histogram/histogram.sql +++ b/rasgotransforms/rasgotransforms/transforms/histogram/histogram.sql @@ -1,54 +1,61 @@ -{%- if num_buckets is not defined -%} - {%- set bucket_count = 200 -%} -{%- else -%} - {%- set bucket_count = num_buckets -%} +{%- if num_buckets is not defined -%} {%- set bucket_count = 200 -%} +{%- else -%} {%- set bucket_count = num_buckets -%} {%- endif -%} -WITH COUNTS AS ( -SELECT - REPLACE('{{ column }}','"') AS FEATURE - ,COL AS VAL - ,COUNT(1) AS REC_CT -FROM - (SELECT CAST({{ column }} AS FLOAT) AS COL FROM {{ source_table }} - {%- if filters is defined and filters %} - {% for filter_block in filters %} - {%- set oloop = loop -%} - {{ 'WHERE ' if oloop.first else ' AND ' }} - {%- if filter_block is not mapping -%} - {{ filter_block }} - {%- else -%} - {%- if filter_block['operator'] == 'CONTAINS' -%} - {{ filter_block['operator'] }}({{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }}) - {%- else -%} +with + counts as ( + select replace('{{ column }}', '"') as feature, col as val, count(1) as rec_ct + from + ( + select cast({{ column }} as float) as col + from + {{ source_table }} + {%- if filters is defined and filters %} + {% for filter_block in filters %} + {%- set oloop = loop -%} + {{ 'WHERE ' if oloop.first 
else ' AND ' }} + {%- if filter_block is not mapping -%} {{ filter_block }} + {%- else -%} + {%- if filter_block['operator'] == 'CONTAINS' -%} + {{ filter_block['operator'] }} ( + {{ filter_block['columnName'] }}, + {{ filter_block['comparisonValue'] }} + ) + {%- else -%} {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} - {%- endif -%} - {%- endif -%} - {%- endfor -%} - {%- endif -%} - ) -WHERE - COL IS NOT NULL -GROUP BY 2), -CALCS AS (SELECT MIN(VAL)-1 MIN_VAL, MAX(VAL)+1 MAX_VAL FROM COUNTS), -EDGES AS (SELECT MIN_VAL, MAX_VAL, (MIN_VAL-MAX_VAL) VAL_RANGE, ((MAX_VAL-MIN_VAL)/{{ bucket_count }}) BUCKET_SIZE FROM CALCS), -FREQS AS ( -SELECT - FEATURE - ,VAL - ,REC_CT - ,WIDTH_BUCKET(VAL, MIN_VAL, MAX_VAL, {{ bucket_count }}) AS HIST_BUCKET - ,MIN_VAL - ,MAX_VAL - ,BUCKET_SIZE -FROM - COUNTS -CROSS JOIN EDGES) -SELECT - MIN_VAL+((HIST_BUCKET-1)*BUCKET_SIZE) AS {{ column }}_MIN - ,MIN_VAL+(HIST_BUCKET*BUCKET_SIZE) AS {{ column }}_MAX - ,SUM(REC_CT) AS RECORD_COUNT -FROM - FREQS -GROUP BY 1,2 -order by 1 \ No newline at end of file + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + ) + where col is not null + group by 2 + ), + calcs as (select min(val) -1 min_val, max(val) + 1 max_val from counts), + edges as ( + select + min_val, + max_val, + (min_val - max_val) val_range, + ((max_val - min_val) /{{ bucket_count }}) bucket_size + from calcs + ), + freqs as ( + select + feature, + val, + rec_ct, + width_bucket(val, min_val, max_val, {{ bucket_count }}) as hist_bucket, + min_val, + max_val, + bucket_size + from counts + cross join edges + ) +select + min_val + ((hist_bucket -1) * bucket_size) as {{ column }}_min, + min_val + (hist_bucket * bucket_size) as {{ column }}_max, + sum(rec_ct) as record_count +from freqs +group by 1, 2 +order by 1 diff --git a/rasgotransforms/rasgotransforms/transforms/if_then/if_then.sql b/rasgotransforms/rasgotransforms/transforms/if_then/if_then.sql index 
aae532de..b916ff55 100644 --- a/rasgotransforms/rasgotransforms/transforms/if_then/if_then.sql +++ b/rasgotransforms/rasgotransforms/transforms/if_then/if_then.sql @@ -1,8 +1,9 @@ -SELECT -*, -CASE -{%- for condition in conditions %} - {{"WHEN " + condition[0] }} THEN {{ condition[1] }} {% endfor %} - ELSE {{ default }} -END AS {{ cleanse_name(alias) }} -FROM {{ source_table }} +select + *, + case + {%- for condition in conditions %} + {{ "WHEN " + condition[0] }} then {{ condition[1] }} + {% endfor %} + else {{ default }} + end as {{ cleanse_name(alias) }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/join/join.sql b/rasgotransforms/rasgotransforms/transforms/join/join.sql index f98b8371..ede48c83 100644 --- a/rasgotransforms/rasgotransforms/transforms/join/join.sql +++ b/rasgotransforms/rasgotransforms/transforms/join/join.sql @@ -1,7 +1,6 @@ {# Jinja Macro to get the table name from source_id #} {%- macro get_table_name(join_table) -%} - {%- set table = join_table.split('.')[-1] -%} - {{ table }} +{%- set table = join_table.split('.')[-1] -%} {{ table }} {%- endmacro -%} {# Get all Columns in Source Table #} @@ -12,34 +11,35 @@ {%- set join_table_name = get_table_name(join_table) -%} -SELECT -{%- for source_col in source_col_names %} - t1.{{ source_col }}{{ ', ' if not loop.last else '' }} -{%- endfor -%} -{%- for join_col in join_col_names %} +select + {%- for source_col in source_col_names %} + t1.{{ source_col }}{{ ', ' if not loop.last else '' }} + {%- endfor -%} + {%- for join_col in join_col_names %} {%- if join_prefix -%} - , t2.{{ join_col }} as {{ cleanse_name(join_prefix)~'_'~join_col }} - {%- elif join_col not in source_col_names -%} - , t2.{{ join_col }} + , t2.{{ join_col }} as {{ cleanse_name(join_prefix)~'_'~join_col }} + {%- elif join_col not in source_col_names -%}, t2.{{ join_col }} {% endif %} -{%- endfor %} -FROM {{ source_table }} as t1 -{{ join_type + ' ' if join_type else '' | upper }}JOIN {{ join_table 
}} as t2 -{%- for t1_join_col, t2_join_col in join_columns.items() %} -{{ ' AND' if not loop.first else 'ON'}} t1.{{ t1_join_col }} = t2.{{ t2_join_col }} -{%- endfor -%} -{%- if filters is defined and filters %} + {%- endfor %} +from {{ source_table }} as t1 {{ join_type + ' ' if join_type else '' | upper }} +join + {{ join_table }} as t2 + {%- for t1_join_col, t2_join_col in join_columns.items() %} + {{ ' AND' if not loop.first else 'ON' }} t1.{{ t1_join_col }} = t2.{{ t2_join_col }} + {%- endfor -%} + {%- if filters is defined and filters %} {% for filter_block in filters %} - {%- set oloop = loop -%} - {{ 'WHERE ' if oloop.first else ' AND ' }} - {%- if filter_block is not mapping -%} - {{ filter_block }} - {%- else -%} - {%- if filter_block['operator'] == 'CONTAINS' -%} - {{ filter_block['operator'] }}({{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }}) - {%- else -%} - {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} - {%- endif -%} - {%- endif -%} + {%- set oloop = loop -%} + {{ 'WHERE ' if oloop.first else ' AND ' }} + {%- if filter_block is not mapping -%} {{ filter_block }} + {%- else -%} + {%- if filter_block['operator'] == 'CONTAINS' -%} + {{ filter_block['operator'] }} ( + {{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }} + ) + {%- else -%} + {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} + {%- endif -%} + {%- endif -%} {%- endfor -%} -{%- endif -%} \ No newline at end of file + {%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/joins/joins.sql b/rasgotransforms/rasgotransforms/transforms/joins/joins.sql index 3f9dcf94..82629215 100644 --- a/rasgotransforms/rasgotransforms/transforms/joins/joins.sql +++ b/rasgotransforms/rasgotransforms/transforms/joins/joins.sql @@ -1,6 +1,4 @@ -{%- macro table_from_fqtn(fqtn) -%} - {{ fqtn.split('.')[-1] }} -{%- endmacro -%} +{%- macro 
table_from_fqtn(fqtn) -%} {{ fqtn.split('.')[-1] }} {%- endmacro -%} {# Create global variables to track query components #} {%- set ns = namespace() -%} @@ -12,63 +10,62 @@ {# assemble lists of all columns and all tables in the query using a loop #} {%- for join_dict in join_dicts -%} - {%- if loop.index == 1 -%} - {%- set base_table = source_table -%} - {%- else -%} - {%- set base_table = join_dict['table_a'] -%} - {%- endif -%} - {%- set jtable = join_dict['table_b'] -%} - {# Check if base_table in the running list of all tables yet or not. If not, we need to add it and add its columns to all_columns. #} - {%- if base_table not in ns.all_tables -%} - {%- set base_cols = get_columns(base_table) -%} - {%- set x=ns.all_columns.__setitem__(base_table, base_cols.keys()|list) -%} - {%- endif -%} - {%- if jtable not in ns.all_tables -%} - {%- set jtable_cols = get_columns(jtable) -%} - {%- set x=ns.all_columns.__setitem__(jtable, jtable_cols.keys()|list) -%} - {%- endif -%} - {%- set ns.all_tables = ns.all_tables + [base_table, jtable] -%} +{%- if loop.index == 1 -%} {%- set base_table = source_table -%} +{%- else -%} {%- set base_table = join_dict['table_a'] -%} +{%- endif -%} +{%- set jtable = join_dict['table_b'] -%} +{# Check if base_table in the running list of all tables yet or not. If not, we need to add it and add its columns to all_columns. #} +{%- if base_table not in ns.all_tables -%} +{%- set base_cols = get_columns(base_table) -%} +{%- set x=ns.all_columns.__setitem__(base_table, base_cols.keys()|list) -%} +{%- endif -%} +{%- if jtable not in ns.all_tables -%} +{%- set jtable_cols = get_columns(jtable) -%} +{%- set x=ns.all_columns.__setitem__(jtable, jtable_cols.keys()|list) -%} +{%- endif -%} +{%- set ns.all_tables = ns.all_tables + [base_table, jtable] -%} {%- endfor -%} {# loop through all columns in all tables to check for column names that are repeated between tables. These need to be aliased. 
#} {%- for fqtn, columns in ns.all_columns.items() -%} - {%- set ns.columns_to_check = [] -%} - {%- for check_fqtn in ns.all_tables -%} - {%- if fqtn != check_fqtn -%} - {%- set ns.columns_to_check = ns.columns_to_check + ns.all_columns[check_fqtn] -%} - {%- endif -%} - {%- endfor -%} - {%- set columns_to_alias = columns|select("in", ns.columns_to_check)|list -%} - {%- set ns.alias_columns = ns.alias_columns + columns_to_alias -%} +{%- set ns.columns_to_check = [] -%} +{%- for check_fqtn in ns.all_tables -%} +{%- if fqtn != check_fqtn -%} +{%- set ns.columns_to_check = ns.columns_to_check + ns.all_columns[check_fqtn] -%} +{%- endif -%} +{%- endfor -%} +{%- set columns_to_alias = columns|select("in", ns.columns_to_check)|list -%} +{%- set ns.alias_columns = ns.alias_columns + columns_to_alias -%} {%- endfor -%} {# assemble the SELECT clause by aliasing columns that need to be aliased and just using column name otherwise. #} {%- for fqtn, columns in ns.all_columns.items() -%} - {%- set o_loop = loop -%} - {%- for column in columns -%} - {%- set table_prefix = table_from_fqtn(fqtn) -%} - {%- if column in ns.alias_columns -%} - {%- set ns.select_columns = ns.select_columns ~ table_prefix ~ '.' ~ column ~ ' AS ' ~ table_prefix ~ '_' ~ column -%} - {%- else -%} - {%- set ns.select_columns = ns.select_columns ~ ' ' ~ column -%} - {%- endif -%} - {%- if not (loop.last and o_loop.last) -%} - {%- set ns.select_columns = ns.select_columns ~ ', ' -%} - {%- endif -%} - {%- endfor -%} +{%- set o_loop = loop -%} +{%- for column in columns -%} +{%- set table_prefix = table_from_fqtn(fqtn) -%} +{%- if column in ns.alias_columns -%} +{%- set ns.select_columns = ns.select_columns ~ table_prefix ~ '.' 
~ column ~ ' AS ' ~ table_prefix ~ '_' ~ column -%} +{%- else -%} {%- set ns.select_columns = ns.select_columns ~ ' ' ~ column -%} +{%- endif -%} +{%- if not (loop.last and o_loop.last) -%} +{%- set ns.select_columns = ns.select_columns ~ ', ' -%} +{%- endif -%} +{%- endfor -%} {%- endfor -%} {# assemble the full query #} -SELECT {{ ns.select_columns }} -FROM {{ source_table }} +select {{ ns.select_columns }} +from {{ source_table }} {% for join_dict in join_dicts %} {%- set outer_loop = loop -%} - {{ join_dict["join_type"] }} JOIN {{ join_dict["table_b"] }} + {{ join_dict["join_type"] }} +join + {{ join_dict["table_b"] }} {%- if join_dict["join_type"]|upper != 'CROSS' %} - ON - {%- for join_col1, join_col2 in join_dict["join_on"].items() -%} - {{ " AND " if loop.index != 1 else "" }} - {{ table_from_fqtn(source_table) if outer_loop.index == 1 else table_from_fqtn(join_dict["table_a"]) }}.{{ join_col1 }} = {{ table_from_fqtn(join_dict["table_b"]) }}.{{ join_col2 }} + on {%- for join_col1, join_col2 in join_dict["join_on"].items() -%} + {{ " AND " if loop.index != 1 else "" }} + {{ table_from_fqtn(source_table) if outer_loop.index == 1 else table_from_fqtn(join_dict["table_a"]) }}.{{ join_col1 }} + = {{ table_from_fqtn(join_dict["table_b"]) }}.{{ join_col2 }} {% endfor %} {%- endif -%} -{%- endfor -%} \ No newline at end of file +{%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/label_encode/bigquery/label_encode.sql b/rasgotransforms/rasgotransforms/transforms/label_encode/bigquery/label_encode.sql index 4ff55975..a23aa1b6 100644 --- a/rasgotransforms/rasgotransforms/transforms/label_encode/bigquery/label_encode.sql +++ b/rasgotransforms/rasgotransforms/transforms/label_encode/bigquery/label_encode.sql @@ -1,11 +1,9 @@ -with distinct_values as ( - select distinct - rank() over(order by {{ column }} asc) as id, - {{ column }} - from {{ source_table }} - order by {{ column }} asc -) -select *, - (v.id - 1) as {{ column }}_encoded -FROM {{ 
source_table }} t -left join distinct_values v using ({{ column }}) \ No newline at end of file +with + distinct_values as ( + select distinct rank() over (order by {{ column }} asc) as id, {{ column }} + from {{ source_table }} + order by {{ column }} asc + ) +select *, (v.id - 1) as {{ column }}_encoded +from {{ source_table }} t +left join distinct_values v using ({{ column }}) diff --git a/rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql b/rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql index 1e459a77..85734001 100644 --- a/rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql +++ b/rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql @@ -1,6 +1,11 @@ -with distinct_values as ( - select array_agg(distinct {{ column }}) within group (order by {{ column }} asc) as all_values_array from {{ source_table }} -) -select *, -array_position({{ column }}::variant,all_values_array) as {{ column }}_encoded -from distinct_values,{{ source_table }} \ No newline at end of file +with + distinct_values as ( + select + array_agg(distinct {{ column }}) within group ( + order by {{ column }} asc + ) as all_values_array + from {{ source_table }} + ) +select + *, array_position({{ column }}::variant, all_values_array) as {{ column }}_encoded +from distinct_values,{{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/lag/bigquery/lag.sql b/rasgotransforms/rasgotransforms/transforms/lag/bigquery/lag.sql index 60379b42..208fba2d 100644 --- a/rasgotransforms/rasgotransforms/transforms/lag/bigquery/lag.sql +++ b/rasgotransforms/rasgotransforms/transforms/lag/bigquery/lag.sql @@ -1,19 +1,22 @@ {%- if partition is not defined or partition|length == 0 -%} -{%- set partition = ["NULL"]-%} +{%- set partition = ["NULL"] -%} {%- endif -%} {%- if order_by is not defined or order_by|length == 0 -%} -{%- set order_by = ["NULL"]-%} +{%- set order_by = 
["NULL"] -%} {%- endif -%} {%- for amount in amounts -%} - {%- if amount < 0 -%} - {{ raise_exception('BigQuery cannot use negative values for a lag function. Please utilize lead for forward looking windows.') }} - {%- endif -%} +{%- if amount < 0 -%} +{{ raise_exception('BigQuery cannot use negative values for a lag function. Please utilize lead for forward looking windows.') }} +{%- endif -%} {%- endfor -%} -SELECT *, +select + *, {%- for col in columns -%} - {%- for amount in amounts %} - lag({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as Lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} - {%- endfor -%} - {{ ", " if not loop.last else "" }} + {%- for amount in amounts %} + lag({{ col }}, {{ amount }}) over ( + partition by {{ partition | join(", ") }} order by {{ order_by | join(", ") }} + ) as lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} + {%- endfor -%} + {{ ", " if not loop.last else "" }} {%- endfor %} from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/lag/lag.sql b/rasgotransforms/rasgotransforms/transforms/lag/lag.sql index bdc4a7f0..f817e426 100644 --- a/rasgotransforms/rasgotransforms/transforms/lag/lag.sql +++ b/rasgotransforms/rasgotransforms/transforms/lag/lag.sql @@ -1,14 +1,17 @@ {%- if partition is not defined or partition|length == 0 -%} -{%- set partition = ["NULL"]-%} +{%- set partition = ["NULL"] -%} {%- endif -%} {%- if order_by is not defined or order_by|length == 0 -%} -{%- set order_by = ["NULL"]-%} +{%- set order_by = ["NULL"] -%} {%- endif -%} -SELECT *, +select + *, {%- for col in columns -%} - {%- for amount in amounts %} - lag({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as Lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} - {%- endfor -%} - {{ ", " if not loop.last else "" }} + {%- for amount in amounts %} 
+ lag({{ col }}, {{ amount }}) over ( + partition by {{ partition | join(", ") }} order by {{ order_by | join(", ") }} + ) as lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} + {%- endfor -%} + {{ ", " if not loop.last else "" }} {%- endfor %} -from {{ source_table }} \ No newline at end of file +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/latest/latest.sql b/rasgotransforms/rasgotransforms/transforms/latest/latest.sql index c8551a24..de339acb 100644 --- a/rasgotransforms/rasgotransforms/transforms/latest/latest.sql +++ b/rasgotransforms/rasgotransforms/transforms/latest/latest.sql @@ -1,17 +1,17 @@ {%- set source_col_names = get_columns(source_table) -%} -SELECT -{%- for group_item in group_by %} - {{ group_item }}, -{%- endfor -%} +select + {%- for group_item in group_by %} {{ group_item }}, {%- endfor -%} -{%- for order_item in order_by %} - {{ order_item }}, -{%- endfor -%} + {%- for order_item in order_by %} {{ order_item }}, {%- endfor -%} -{%- for source_col in source_col_names %} - {%- if source_col not in group_by and source_col not in order_by -%} - LAST_VALUE({{ source_col }} {{ nulls }} NULLS) OVER (PARTITION BY {{ group_by | join(', ') }} ORDER BY {{ order_by | join(', ') }} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS LATEST_{{ source_col }}{{ ', ' if not loop.last else ' ' }} - {%- endif -%} -{%- endfor -%} -FROM {{ source_table }} \ No newline at end of file + {%- for source_col in source_col_names %} + {%- if source_col not in group_by and source_col not in order_by -%} + last_value({{ source_col }} {{ nulls }} nulls) over ( + partition by {{ group_by | join(', ') }} + order by {{ order_by | join(', ') }} + rows between unbounded preceding and current row + ) as latest_{{ source_col }}{{ ', ' if not loop.last else ' ' }} + {%- endif -%} + {%- endfor -%} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/lead/bigquery/lead.sql 
b/rasgotransforms/rasgotransforms/transforms/lead/bigquery/lead.sql index f0b4207e..8154bf0f 100644 --- a/rasgotransforms/rasgotransforms/transforms/lead/bigquery/lead.sql +++ b/rasgotransforms/rasgotransforms/transforms/lead/bigquery/lead.sql @@ -1,19 +1,22 @@ {%- if partition is not defined or partition|length == 0 -%} -{%- set partition = ["NULL"]-%} +{%- set partition = ["NULL"] -%} {%- endif -%} {%- if order_by is not defined or order_by|length == 0 -%} -{%- set order_by = ["NULL"]-%} +{%- set order_by = ["NULL"] -%} {%- endif -%} {%- for amount in amounts -%} - {%- if amount < 0 -%} - {{ raise_exception('BigQuery cannot use negative values for a lead function. Please utilize lag for backwards looking windows.') }} - {%- endif -%} +{%- if amount < 0 -%} +{{ raise_exception('BigQuery cannot use negative values for a lead function. Please utilize lag for backwards looking windows.') }} +{%- endif -%} {%- endfor -%} -SELECT *, +select + *, {%- for col in columns -%} - {%- for amount in amounts %} - lead({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} - {%- endfor -%} - {{ ", " if not loop.last else "" }} + {%- for amount in amounts %} + lead({{ col }}, {{ amount }}) over ( + partition by {{ partition | join(", ") }} order by {{ order_by | join(", ") }} + ) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} + {%- endfor -%} + {{ ", " if not loop.last else "" }} {%- endfor %} from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/lead/lead.sql b/rasgotransforms/rasgotransforms/transforms/lead/lead.sql index ba57c0e5..7fe69e9e 100644 --- a/rasgotransforms/rasgotransforms/transforms/lead/lead.sql +++ b/rasgotransforms/rasgotransforms/transforms/lead/lead.sql @@ -1,14 +1,17 @@ {%- if partition is not defined or partition|length == 0 -%} -{%- set partition = ["NULL"]-%} +{%- set 
partition = ["NULL"] -%} {%- endif -%} {%- if order_by is not defined or order_by|length == 0 -%} -{%- set order_by = ["NULL"]-%} +{%- set order_by = ["NULL"] -%} {%- endif -%} -SELECT *, +select + *, {%- for col in columns -%} - {%- for amount in amounts %} - lead({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} - {%- endfor -%} - {{ ", " if not loop.last else "" }} + {%- for amount in amounts %} + lead({{ col }}, {{ amount }}) over ( + partition by {{ partition | join(", ") }} order by {{ order_by | join(", ") }} + ) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} + {%- endfor -%} + {{ ", " if not loop.last else "" }} {%- endfor %} from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/levenshtein/levenshtein.sql b/rasgotransforms/rasgotransforms/transforms/levenshtein/levenshtein.sql index 5144e710..800904aa 100644 --- a/rasgotransforms/rasgotransforms/transforms/levenshtein/levenshtein.sql +++ b/rasgotransforms/rasgotransforms/transforms/levenshtein/levenshtein.sql @@ -1,7 +1,10 @@ -SELECT *, +select + *, {%- for col in columns1 -%} {%- for col2 in columns2 %} - EDITDISTANCE({{col}}, {{col2}}) as {{col}}_{{col2}}_Distance{{ ", " if not loop.last else "" }} - {%- endfor -%}{{ ", " if not loop.last else "" }} + editdistance( + {{ col }}, {{ col2 }} + ) as {{ col }}_{{ col2 }}_distance{{ ", " if not loop.last else "" }} + {%- endfor -%} {{ ", " if not loop.last else "" }} {%- endfor %} -FROM {{source_table}} \ No newline at end of file +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/linear_regression/linear_regression.sql b/rasgotransforms/rasgotransforms/transforms/linear_regression/linear_regression.sql index 21ea598c..acde65ee 100644 --- a/rasgotransforms/rasgotransforms/transforms/linear_regression/linear_regression.sql +++ 
b/rasgotransforms/rasgotransforms/transforms/linear_regression/linear_regression.sql @@ -1,7 +1,7 @@ -SELECT {{ group_by | join(', ') }}{{ ', ' if group_by else ''}} - REGR_SLOPE({{y}}, {{x}}) Slope, - REGR_INTERCEPT({{y}}, {{x}}) Intercept, - REGR_R2({{y}}, {{x}}) R2, - CONCAT('Y = ',Slope,'*X + ',Intercept) as Formula -FROM {{ source_table }} -{{ 'GROUP BY ' if group_by else ''}}{{ group_by | join(', ') }} \ No newline at end of file +select + {{ group_by | join(', ') }}{{ ', ' if group_by else '' }} + regr_slope({{ y }}, {{ x }}) slope, + regr_intercept({{ y }}, {{ x }}) intercept, + regr_r2({{ y }}, {{ x }}) r2, + concat('Y = ', slope, '*X + ', intercept) as formula +from {{ source_table }} {{ 'GROUP BY ' if group_by else '' }}{{ group_by | join(', ') }} diff --git a/rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql b/rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql index bca2a4d1..3612dde9 100644 --- a/rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql +++ b/rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql @@ -1,13 +1,17 @@ -WITH order_detail as -(SELECT {{transaction_id}}, -listagg({{agg_column}}, '{{sep}}') -WITHIN group (order by {{agg_column}}) as {{agg_column}}_listagg, -COUNT({{agg_column}}) as num_products -FROM {{ source_table }} -GROUP BY {{transaction_id}} ) +with + order_detail as ( + select + {{ transaction_id }}, + listagg({{ agg_column }}, '{{sep}}') within group ( + order by {{ agg_column }} + ) as {{ agg_column }}_listagg, + count({{ agg_column }}) as num_products + from {{ source_table }} + group by {{ transaction_id }} + ) -SELECT {{agg_column}}_listagg, count({{transaction_id}}) as NumTransactions -FROM order_detail +select {{ agg_column }}_listagg, count({{ transaction_id }}) as numtransactions +from order_detail where num_products > 1 -GROUP BY {{agg_column}}_listagg -order by count({{transaction_id}}) desc \ No newline at end of file 
+group by {{ agg_column }}_listagg +order by count({{ transaction_id }}) desc diff --git a/rasgotransforms/rasgotransforms/transforms/math/bigquery/math.sql b/rasgotransforms/rasgotransforms/transforms/math/bigquery/math.sql index b346eb88..971cb2a9 100644 --- a/rasgotransforms/rasgotransforms/transforms/math/bigquery/math.sql +++ b/rasgotransforms/rasgotransforms/transforms/math/bigquery/math.sql @@ -1,23 +1,24 @@ {%- if names -%} - {%- if names|length != math_ops|length -%} +{%- if names|length != math_ops|length -%} {{ raise_exception('Provide a new column alias for each math operation') }} - {%- elif names|length == math_ops|length -%} +{%- elif names|length == math_ops|length -%} -SELECT * -{%- for math_op in math_ops %} - , {{math_op}} as {{cleanse_name(names[loop.index-1])}} -{%- endfor %} -FROM {{source_table}} +select + * + {%- for math_op in math_ops %} + , {{ math_op }} as {{ cleanse_name(names[loop.index-1]) }} + {%- endfor %} +from {{ source_table }} - {%- endif -%} +{%- endif -%} {%- else -%} -SELECT * -{%- for math_op in math_ops %} - , {{math_op}} as {{cleanse_name(math_op)}} -{%- endfor %} -FROM {{source_table}} +select + * + {%- for math_op in math_ops %}, {{ math_op }} as {{ cleanse_name(math_op) }} + {%- endfor %} +from {{ source_table }} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/math/math.sql b/rasgotransforms/rasgotransforms/transforms/math/math.sql index b346eb88..971cb2a9 100644 --- a/rasgotransforms/rasgotransforms/transforms/math/math.sql +++ b/rasgotransforms/rasgotransforms/transforms/math/math.sql @@ -1,23 +1,24 @@ {%- if names -%} - {%- if names|length != math_ops|length -%} +{%- if names|length != math_ops|length -%} {{ raise_exception('Provide a new column alias for each math operation') }} - {%- elif names|length == math_ops|length -%} +{%- elif names|length == math_ops|length -%} -SELECT * -{%- for math_op in math_ops %} - , {{math_op}} as 
{{cleanse_name(names[loop.index-1])}} -{%- endfor %} -FROM {{source_table}} +select + * + {%- for math_op in math_ops %} + , {{ math_op }} as {{ cleanse_name(names[loop.index-1]) }} + {%- endfor %} +from {{ source_table }} - {%- endif -%} +{%- endif -%} {%- else -%} -SELECT * -{%- for math_op in math_ops %} - , {{math_op}} as {{cleanse_name(math_op)}} -{%- endfor %} -FROM {{source_table}} +select + * + {%- for math_op in math_ops %}, {{ math_op }} as {{ cleanse_name(math_op) }} + {%- endfor %} +from {{ source_table }} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/math/snowflake/math.sql b/rasgotransforms/rasgotransforms/transforms/math/snowflake/math.sql index b346eb88..971cb2a9 100644 --- a/rasgotransforms/rasgotransforms/transforms/math/snowflake/math.sql +++ b/rasgotransforms/rasgotransforms/transforms/math/snowflake/math.sql @@ -1,23 +1,24 @@ {%- if names -%} - {%- if names|length != math_ops|length -%} +{%- if names|length != math_ops|length -%} {{ raise_exception('Provide a new column alias for each math operation') }} - {%- elif names|length == math_ops|length -%} +{%- elif names|length == math_ops|length -%} -SELECT * -{%- for math_op in math_ops %} - , {{math_op}} as {{cleanse_name(names[loop.index-1])}} -{%- endfor %} -FROM {{source_table}} +select + * + {%- for math_op in math_ops %} + , {{ math_op }} as {{ cleanse_name(names[loop.index-1]) }} + {%- endfor %} +from {{ source_table }} - {%- endif -%} +{%- endif -%} {%- else -%} -SELECT * -{%- for math_op in math_ops %} - , {{math_op}} as {{cleanse_name(math_op)}} -{%- endfor %} -FROM {{source_table}} +select + * + {%- for math_op in math_ops %}, {{ math_op }} as {{ cleanse_name(math_op) }} + {%- endfor %} +from {{ source_table }} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/metric/bigquery/metric.sql b/rasgotransforms/rasgotransforms/transforms/metric/bigquery/metric.sql 
index 06b5db02..fe32d1f2 100644 --- a/rasgotransforms/rasgotransforms/transforms/metric/bigquery/metric.sql +++ b/rasgotransforms/rasgotransforms/transforms/metric/bigquery/metric.sql @@ -10,7 +10,7 @@ {%- do filters.append({'columnName': time_dimension, 'operator': '>=', 'comparisonValue': "'" + start_date + "'" }) -%} {%- do filters.append({'columnName': time_dimension, 'operator': '<=', 'comparisonValue': "'" + end_date + "'" }) -%} {%- if dimensions and dimensions|length > 1 -%} -{{ raise_exception('Currently, only one dimension can be passed to group by')}} +{{ raise_exception('Currently, only one dimension can be passed to group by') }} {%- endif -%} {%- macro get_distinct_values(column) -%} @@ -24,13 +24,13 @@ order by vals desc limit {{ max_num_groups + 1}} {%- endset -%} - {%- set distinct_vals = run_query(distinct_val_query) -%} - {%- for val in distinct_vals.itertuples() -%} - {%- for column in distinct_vals.columns[:-1] -%} - {{ val[column] }}{{'_' if not loop.last else ''}} - {%- endfor -%} - {{ '|$|' if not loop.last else ''}} - {%- endfor %} +{%- set distinct_vals = run_query(distinct_val_query) -%} +{%- for val in distinct_vals.itertuples() -%} +{%- for column in distinct_vals.columns[:-1] -%} +{{ val[column] }}{{ '_' if not loop.last else '' }} +{%- endfor -%} +{{ '|$|' if not loop.last else '' }} +{%- endfor %} {%- endmacro -%} {%- if dimensions -%} @@ -54,147 +54,148 @@ {%- endfor %} {%- endset -%} -with source_query as ( - select - cast(date_trunc(cast({{ time_dimension }} as date), day) as date) as date_day, - {%- for dimension in dimensions %} - case - when cast({{ dimension }} as string) in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} - {%- endfor %} - ) then cast({{ dimension }} as string) - {%- if 'None' in distinct_values %} - when {{ dimension }} is null then 'None' - {%- endif %} - else '_OtherGroup' - end as {{ dimension }}, - {%- endfor %} - {{ target_expression }} as property_to_aggregate - from 
{{ source_table }} - {{ filter_statement }} -), -{%- if time_grain|lower == 'all'%} -spine as ( - select - cast('{{ start_date }}' as date) as PERIOD_MIN, - cast('{{ end_date }}' as date) as PERIOD_MAX -), -joined as ( - select * - from source_query - cross join spine -), -tidy_data as ( - select - PERIOD_MIN, - PERIOD_MAX, - {%- for dimension in dimensions %} - {{ dimension }}, - {%- endfor %} - {{ aggregation_type }}({{ 'distinct ' if distinct else ''}}joined.property_to_aggregate) as {{ alias }}, - from joined - group by {{ range(1, dimensions|length + 3)|join(', ') }} -) -{%- else %} -calendar as ( - select - date_day, - date_trunc(date_day, week) as date_week, - date_trunc(date_day, month) as date_month, - date_trunc(date_day, quarter) as date_quarter, - date_trunc(date_day, year) as date_year - from unnest(generate_date_array('{{ start_date }}', '{{ end_date }}')) as date_day -), -spine__time as ( +with + source_query as ( select - date_{{ time_grain }} as period, - date_day - from calendar -), -{%- for dimension in dimensions %} -spine__values__{{ dimension }} as ( - select distinct {{ dimension }} - from source_query -), -{%- endfor %} -spine as ( - select * - from spine__time - {%- for dimension in dimensions %} - cross join spine__values__{{ dimension }} - {%- endfor %} -), -joined as ( - select - spine.period, - {%- for dimension in dimensions %} - spine.{{ dimension }}, - {%- endfor %} - {{ aggregation_type }}({{ 'distinct ' if distinct else ''}}source_query.property_to_aggregate) as {{ alias }}, - logical_or(source_query.date_day is not null) as has_data - from spine - left outer join source_query on source_query.date_day = spine.date_day - {%- for dimension in dimensions %} - and (source_query.{{ dimension }} = spine.{{ dimension }} - or source_query.{{ dimension }} is null and spine.{{ dimension }} is null) - {%- endfor %} - group by {{ range(1, dimensions|length + 2)|join(', ') }} -), -bounded as ( - select - *, - min(case when has_data then period 
end) over () as lower_bound, - max(case when has_data then period end) over () as upper_bound - from joined -), -tidy_data as ( - select - cast(period as timestamp) as PERIOD_MIN, - {%- if time_grain|lower == 'quarter' %} - cast(date_add(period, INTERVAL 3 month) as timestamp) as PERIOD_MAX, - {%- else %} - cast(date_add(period, INTERVAL 1 {{ time_grain }}) as timestamp) as PERIOD_MAX, - {%- endif %} - {%- for dimension in dimensions %} - {{ dimension }}, - {%- endfor %} - coalesce({{ alias }}, 0) as {{ alias }} - from bounded - where period >= lower_bound - and period <= upper_bound - order by {{ range(1, dimensions|length + 2)|join(', ') }} -) -{%- endif %} -{%- if not dimensions or not flatten %} -select * from tidy_data order by 1 -{%- else -%} -, -pivoted as ( - select * - from ( - select - PERIOD_MIN, - PERIOD_MAX, - {{ alias }}, + cast( + date_trunc(cast({{ time_dimension }} as date), day) as date + ) as date_day, {%- for dimension in dimensions %} - {{ dimension }}{{ ',' if not loop.last }} + case + when + cast({{ dimension }} as string) in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then cast({{ dimension }} as string) + {%- if 'None' in distinct_values %} + when {{ dimension }} is null then 'None' + {%- endif %} + else '_OtherGroup' + end as {{ dimension }}, {%- endfor %} - from tidy_data + {{ target_expression }} as property_to_aggregate + from {{ source_table }} {{ filter_statement }} + ), + {%- if time_grain|lower == 'all' %} + spine as ( + select + cast('{{ start_date }}' as date) as period_min, + cast('{{ end_date }}' as date) as period_max + ), + joined as (select * from source_query cross join spine), + tidy_data as ( + select + period_min, + period_max, + {%- for dimension in dimensions %} {{ dimension }}, {%- endfor %} + {{ aggregation_type }} ( + {{ 'distinct ' if distinct else '' }}joined.property_to_aggregate + ) as {{ alias }}, + from joined + group by {{ range(1, 
dimensions|length + 3)|join(', ') }} ) - pivot ( - sum({{ alias }}) as {{ alias }} - for {{ dimensions[0] }} in ( - {% for val in distinct_values -%} - {%- if val is string -%} - '{{ val }}' - {%- else -%} - {{ val }} - {%- endif -%} - {{', ' if not loop.last else ''}} + {%- else %} + calendar as ( + select + date_day, + date_trunc(date_day, week) as date_week, + date_trunc(date_day, month) as date_month, + date_trunc(date_day, quarter) as date_quarter, + date_trunc(date_day, year) as date_year + from + unnest( + generate_date_array('{{ start_date }}', '{{ end_date }}') + ) as date_day + ), + spine__time as (select date_{{ time_grain }} as period, date_day from calendar), + {%- for dimension in dimensions %} + spine__values__{{ dimension }} as ( + select distinct {{ dimension }} from source_query + ), + {%- endfor %} + spine as ( + select * + from spine__time + {%- for dimension in dimensions %} cross join spine__values__{{ dimension }} + {%- endfor %} + ), + joined as ( + select + spine.period, + {%- for dimension in dimensions %} spine.{{ dimension }}, + {%- endfor %} + {{ aggregation_type }} ( + {{ 'distinct ' if distinct else '' }}source_query.property_to_aggregate + ) as {{ alias }}, + logical_or(source_query.date_day is not null) as has_data + from spine + left outer join + source_query on source_query.date_day = spine.date_day + {%- for dimension in dimensions %} + and ( + source_query.{{ dimension }} = spine.{{ dimension }} + or source_query.{{ dimension }} is null + and spine.{{ dimension }} is null + ) {%- endfor %} - ) + group by {{ range(1, dimensions|length + 2)|join(', ') }} + ), + bounded as ( + select + *, + min(case when has_data then period end) over () as lower_bound, + max(case when has_data then period end) over () as upper_bound + from joined + ), + tidy_data as ( + select + cast(period as timestamp) as period_min, + {%- if time_grain|lower == 'quarter' %} + cast(date_add(period, interval 3 month) as timestamp) as period_max, + {%- else %} + 
cast( + date_add(period, interval 1 {{ time_grain }}) as timestamp + ) as period_max, + {%- endif %} + {%- for dimension in dimensions %} {{ dimension }}, + {%- endfor %} + coalesce({{ alias }}, 0) as {{ alias }} + from bounded + where period >= lower_bound and period <= upper_bound + order by {{ range(1, dimensions|length + 2)|join(', ') }} ) -) -select * from pivoted order by 1 -{%- endif -%} \ No newline at end of file + {%- endif %} +{%- if not dimensions or not flatten %} select * from tidy_data order by 1 +{%- else -%} + , + pivoted as ( + select * + from + ( + select + period_min, + period_max, + {{ alias }}, + {%- for dimension in dimensions %} + {{ dimension }}{{ ',' if not loop.last }} + {%- endfor %} + from tidy_data + ) + pivot( + sum({{ alias }}) as {{ alias }} + for {{ dimensions[0] }} in ( + {% for val in distinct_values -%} + {%- if val is string -%}'{{ val }}' + {%- else -%}{{ val }} + {%- endif -%} + {{ ', ' if not loop.last else '' }} + {%- endfor %} + ) + ) + ) +select * +from pivoted +order by 1 +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/metric/snowflake/metric.sql b/rasgotransforms/rasgotransforms/transforms/metric/snowflake/metric.sql index 95245ef0..ec8e5760 100644 --- a/rasgotransforms/rasgotransforms/transforms/metric/snowflake/metric.sql +++ b/rasgotransforms/rasgotransforms/transforms/metric/snowflake/metric.sql @@ -36,13 +36,13 @@ order by vals desc limit {{ max_num_groups + 1}} {%- endset -%} - {%- set distinct_vals = run_query(distinct_val_query) -%} - {%- for val in distinct_vals.itertuples() -%} - {%- for column in distinct_vals.columns[:-1] -%} - {{ val[column] }}{{'_' if not loop.last else ''}} - {%- endfor -%} - {{ '|$|' if not loop.last else ''}} - {%- endfor %} +{%- set distinct_vals = run_query(distinct_val_query) -%} +{%- for val in distinct_vals.itertuples() -%} +{%- for column in distinct_vals.columns[:-1] -%} +{{ val[column] }}{{ '_' if not loop.last else '' }} +{%- endfor -%} +{{ '|$|' if not 
loop.last else '' }} +{%- endfor %} {%- endmacro -%} {%- if dimensions -%} @@ -52,146 +52,151 @@ {%- endif -%} {%- endif -%} -with source_query as ( - select - cast(date_trunc('day', cast({{ time_dimension }} as date)) as date) as date_day, - {%- for dimension in dimensions %} - case - when to_char({{ dimension }}) in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} - {%- endfor %} - ) then to_char({{ dimension }}) - {%- if 'None' in distinct_values %} - when {{ dimension }} is null then 'None' - {%- endif %} - else '_OtherGroup' - end as {{ dimension }}, - {%- endfor %} - {{ target_expression }} as property_to_aggregate - from {{ source_table }} - {{ filter_statement }} -), -calendar as ( - select +with + source_query as ( + select + cast( + date_trunc('day', cast({{ time_dimension }} as date)) as date + ) as date_day, + {%- for dimension in dimensions %} + case + when + to_char({{ dimension }}) in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then to_char({{ dimension }}) + {%- if 'None' in distinct_values %} + when {{ dimension }} is null then 'None' + {%- endif %} + else '_OtherGroup' + end as {{ dimension }}, + {%- endfor %} + {{ target_expression }} as property_to_aggregate + from {{ source_table }} {{ filter_statement }} + ), + calendar as ( + select row_number() over (order by null) as interval_id, - cast(dateadd( - 'day', - interval_id-1, - '{{ start_date }}'::timestamp_ntz) as date) as date_day, + cast( + dateadd( + 'day', interval_id -1, '{{ start_date }}'::timestamp_ntz + ) as date + ) as date_day, cast(date_trunc('week', date_day) as date) as date_week, cast(date_trunc('month', date_day) as date) as date_month, case - when month(date_day) in (1, 2, 3) then date_from_parts(year(date_day), 1, 1) - when month(date_day) in (4, 5, 6) then date_from_parts(year(date_day), 4, 1) - when month(date_day) in (7, 8, 9) then date_from_parts(year(date_day), 7, 1) - when 
month(date_day) in (10, 11, 12) then date_from_parts(year(date_day), 10, 1) + when month(date_day) in (1, 2, 3) + then date_from_parts(year(date_day), 1, 1) + when month(date_day) in (4, 5, 6) + then date_from_parts(year(date_day), 4, 1) + when month(date_day) in (7, 8, 9) + then date_from_parts(year(date_day), 7, 1) + when month(date_day) in (10, 11, 12) + then date_from_parts(year(date_day), 10, 1) end as date_quarter, cast(date_trunc('year', date_day) as date) as date_year - from table (generator(rowcount => {{ num_days }})) -), -spine__time as ( - select - date_{{ time_grain }} as period, - date_day - from calendar -), -{%- for dimension in dimensions %} -spine__values__{{ dimension }} as ( - select distinct {{ dimension }} - from source_query -), -{%- endfor %} -spine as ( - select * - from spine__time + from table(generator(rowcount => {{ num_days }})) + ), + spine__time as (select date_{{ time_grain }} as period, date_day from calendar), + {%- for dimension in dimensions %} + spine__values__{{ dimension }} as ( + select distinct {{ dimension }} from source_query + ), + {%- endfor %} + spine as ( + select * + from spine__time {%- for dimension in dimensions %} cross join spine__values__{{ dimension }} {%- endfor %} -), -joined as ( - select - spine.period, - {%- for dimension in dimensions %} - spine.{{ dimension }}, - {%- endfor %} - {{ aggregation_type }}({{ 'distinct ' if distinct else ''}}source_query.property_to_aggregate) as {{ alias }}, - boolor_agg(source_query.date_day is not null) as has_data - from spine - left outer join source_query on source_query.date_day = spine.date_day - {%- for dimension in dimensions %} - and (source_query.{{ dimension }} = spine.{{ dimension }} - or source_query.{{ dimension }} is null and spine.{{ dimension }} is null) - {%- endfor %} - group by {{ range(1, dimensions|length + 2)|join(', ') }} -), -bounded as ( - select - *, - min(case when has_data then period end) over () as lower_bound, - max(case when has_data then 
period end) over () as upper_bound - from joined -), -tidy_data as ( - select - cast(period as timestamp) as period_min, - {%- if time_grain|lower == 'quarter' %} - dateadd('second', -1, dateadd('month',3, period_min)) as period_max, - {%- else %} - dateadd('second', -1, dateadd('{{ time_grain }}',1, period_min)) as period_max, - {%- endif %} - {%- for dimension in dimensions %} - {{ dimension }}, - {%- endfor %} - coalesce({{ alias }}, 0) as {{ alias }} - from bounded - where period >= lower_bound - and period <= upper_bound - order by {{ range(1, dimensions|length + 2)|join(', ') }} -) -{%- if not dimensions or not flatten %} -select * from tidy_data order by period_min + ), + joined as ( + select + spine.period, + {%- for dimension in dimensions %} spine.{{ dimension }}, {%- endfor %} + {{ aggregation_type }} ( + {{ 'distinct ' if distinct else '' }}source_query.property_to_aggregate + ) as {{ alias }}, + boolor_agg(source_query.date_day is not null) as has_data + from spine + left outer join + source_query on source_query.date_day = spine.date_day + {%- for dimension in dimensions %} + and ( + source_query.{{ dimension }} = spine.{{ dimension }} + or source_query.{{ dimension }} is null + and spine.{{ dimension }} is null + ) + {%- endfor %} + group by {{ range(1, dimensions|length + 2)|join(', ') }} + ), + bounded as ( + select + *, + min(case when has_data then period end) over () as lower_bound, + max(case when has_data then period end) over () as upper_bound + from joined + ), + tidy_data as ( + select + cast(period as timestamp) as period_min, + {%- if time_grain|lower == 'quarter' %} + dateadd('second', -1, dateadd('month', 3, period_min)) as period_max, + {%- else %} + dateadd( + 'second', -1, dateadd('{{ time_grain }}', 1, period_min) + ) as period_max, + {%- endif %} + {%- for dimension in dimensions %} {{ dimension }}, {%- endfor %} + coalesce({{ alias }}, 0) as {{ alias }} + from bounded + where period >= lower_bound and period <= upper_bound + order 
by {{ range(1, dimensions|length + 2)|join(', ') }} + ) +{%- if not dimensions or not flatten %} select * from tidy_data order by period_min {%- else -%} -, -combined_dimensions as ( - select - concat( - {%- for dimension in dimensions -%} - {{ dimension }}{{ ",'_'," if not loop.last else ''}} - {%- endfor -%}) as dimensions, - period_min, - period_max, - {{ alias }} - from tidy_data -), -pivoted as ( - select - period_min, - period_max, - {% for val in distinct_values -%} - {{ cleanse_name(val) }}{{',' if not loop.last else ''}} - {%- endfor %} - from ( - select + , + combined_dimensions as ( + select + concat( + {%- for dimension in dimensions -%} + {{ dimension }}{{ ",'_'," if not loop.last else '' }} + {%- endfor -%} + ) as dimensions, + period_min, + period_max, + {{ alias }} + from tidy_data + ), + pivoted as ( + select period_min, period_max, - {{ alias }}, - dimensions - from combined_dimensions - ) - pivot ( - sum({{ alias }}) for dimensions in ( {% for val in distinct_values -%} - '{{ val }}'{{',' if not loop.last else ''}} + {{ cleanse_name(val) }}{{ ',' if not loop.last else '' }} {%- endfor %} - ) - ) as p ( - period_min, - period_max, - {% for val in distinct_values -%} - {{ cleanse_name(val) }}{{',' if not loop.last else ''}} - {%- endfor %} + from + ( + select period_min, period_max, {{ alias }}, dimensions + from combined_dimensions + ) + pivot( + sum({{ alias }}) for dimensions in ( + {% for val in distinct_values -%} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + ) as p( + period_min, + period_max, + {% for val in distinct_values -%} + {{ cleanse_name(val) }}{{ ',' if not loop.last else '' }} + {%- endfor %} + ) ) -) -select * from pivoted order by period_min +select * +from pivoted +order by period_min {%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/min_max_scaler/min_max_scaler.sql b/rasgotransforms/rasgotransforms/transforms/min_max_scaler/min_max_scaler.sql index 3a17a7fa..0627518e 100644 --- 
a/rasgotransforms/rasgotransforms/transforms/min_max_scaler/min_max_scaler.sql +++ b/rasgotransforms/rasgotransforms/transforms/min_max_scaler/min_max_scaler.sql @@ -1,23 +1,31 @@ {%- set untouched_cols = get_columns(source_table)|list|reject('in', columns_to_scale)|join(',') if overwrite_columns else "*" -%} {%- if minimums is not defined -%} -with min_max_vals as ( - select +with + min_max_vals as ( + select + {%- for column in columns_to_scale %} + min({{ column }}) as min_{{ column }}, + max({{ column }}) as max_{{ column }}{{ "," if not loop.last else "" }} + {%- endfor %} + from {{ source_table }} + ) +select + {{ source_table + ".*" if not overwrite_columns else untouched_cols }}, {%- for column in columns_to_scale %} - min({{column}}) as min_{{column}}, - max({{column}}) as max_{{column}}{{ "," if not loop.last else "" }} + ({{ column }} - min_{{ column }}) / (max_{{ column }} - min_{{ column }}) + as {{ column if overwrite_columns else column + "_MIN_MAX_SCALED" }}{{ ", " if not loop.last else "" }} {%- endfor %} - from {{source_table}} -) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, -{%- for column in columns_to_scale %} - ({{column}} - min_{{column}}) / (max_{{column}} - min_{{column}}) as {{column if overwrite_columns else column + "_MIN_MAX_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from min_max_vals, {{source_table}} +from min_max_vals, {{ source_table }} {%- else -%} -select {{ untouched_cols }}, -{%- for column in columns_to_scale %} - ({{column}} - {{minimums[loop.index0]}}) / ({{maximums[loop.index0]}} - {{minimums[loop.index0]}}) as {{column if overwrite_columns else column + "_MIN_MAX_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from {{source_table}} -{%- endif -%} \ No newline at end of file +select + {{ untouched_cols }}, + {%- for column in columns_to_scale %} + ({{ column }} - {{ minimums[loop.index0] }}) / ( + {{ maximums[loop.index0] }} - {{ minimums[loop.index0] }} + ) + 
as {{ column if overwrite_columns else column + "_MIN_MAX_SCALED" }}{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql b/rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql index 2ee5b3b8..93577fec 100644 --- a/rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql +++ b/rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql @@ -1,12 +1,18 @@ {%- for amount in window_sizes -%} - {%- if amount < 0 -%} - {{ raise_exception('Cannot use negative values for a moving average. Please only pass positive values in `window_sizes`.') }} - {%- endif -%} +{%- if amount < 0 -%} +{{ raise_exception('Cannot use negative values for a moving average. Please only pass positive values in `window_sizes`.') }} +{%- endif -%} {%- endfor -%} -SELECT * -{%- for column in input_columns -%} +select + * + {%- for column in input_columns -%} {%- for window in window_sizes -%} - , avg({{column}}) OVER(PARTITION BY {{partition | join(", ")}} ORDER BY {{order_by | join(", ")}} ROWS BETWEEN {{window - 1}} PRECEDING AND CURRENT ROW) AS mean_{{column}}_{{window}} + , + avg({{ column }}) over ( + partition by {{ partition | join(", ") }} + order by {{ order_by | join(", ") }} + rows between {{ window - 1 }} preceding and current row + ) as mean_{{ column }}_{{ window }} {%- endfor %} -{%- endfor %} -FROM {{ source_table }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql b/rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql index 0cf6e858..35304a24 100644 --- a/rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql +++ b/rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql @@ -4,21 +4,23 @@ Instead, please use the `list_of_vals` argument to provide these values explicit {%- endset -%} 
{%- if list_of_vals is not defined -%} - {%- set results = run_query("SELECT DISTINCT " + column + " FROM " + source_table) -%} - {%- if results is none -%} - {{ raise_exception(run_query_error_message) }} - {%- endif -%} - {%- set distinct_col_vals = results[column].to_list() -%} -{%- else -%} - {%- set distinct_col_vals = list_of_vals -%} +{%- set results = run_query("SELECT DISTINCT " + column + " FROM " + source_table) -%} +{%- if results is none -%} {{ raise_exception(run_query_error_message) }} {%- endif -%} +{%- set distinct_col_vals = results[column].to_list() -%} +{%- else -%} {%- set distinct_col_vals = list_of_vals -%} {%- endif -%} -SELECT *, -{% for val in distinct_col_vals %} +select + *, + {% for val in distinct_col_vals %} {%- if val is not none %} - CASE WHEN {{ column }} = {{ "'" ~ val ~ "'"}} THEN 1 ELSE 0 END as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} + case + when {{ column }} = {{ "'" ~ val ~ "'" }} then 1 else 0 + end as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} {%- else %} - CASE WHEN {{ column }} IS NULL THEN 1 ELSE 0 END as {{ column }}_IS_NULL{{ ', ' if not loop.last else '' }} + case + when {{ column }} is null then 1 else 0 + end as {{ column }}_is_null{{ ', ' if not loop.last else '' }} {%- endif -%} -{% endfor %} -FROM {{ source_table }} \ No newline at end of file + {% endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/order/order.sql b/rasgotransforms/rasgotransforms/transforms/order/order.sql index 6dc43196..c9754aef 100644 --- a/rasgotransforms/rasgotransforms/transforms/order/order.sql +++ b/rasgotransforms/rasgotransforms/transforms/order/order.sql @@ -1,6 +1,6 @@ -SELECT * -FROM {{source_table}} -ORDER BY -{%- for col, order_method in order_by.items() %} +select * +from {{ source_table }} +order by + {%- for col, order_method in order_by.items() %} {{ col }} {{ order_method }}{{ ',' if not loop.last else ' ' }} -{%- endfor 
-%} \ No newline at end of file + {%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/pivot/bigquery/pivot.sql b/rasgotransforms/rasgotransforms/transforms/pivot/bigquery/pivot.sql index 79439bc9..9fb95305 100644 --- a/rasgotransforms/rasgotransforms/transforms/pivot/bigquery/pivot.sql +++ b/rasgotransforms/rasgotransforms/transforms/pivot/bigquery/pivot.sql @@ -5,37 +5,32 @@ limit 1000 {%- endset -%} {%- if list_of_vals is not defined -%} - {%- set results = run_query(distinct_val_query) -%} - {%- set distinct_vals = results[results.columns[0]].to_list() -%} -{%- else -%} - {%- set distinct_vals = list_of_vals -%} +{%- set results = run_query(distinct_val_query) -%} +{%- set distinct_vals = results[results.columns[0]].to_list() -%} +{%- else -%} {%- set distinct_vals = list_of_vals -%} {%- endif -%} -SELECT * FROM ( - SELECT - {%- for dimension in dimensions %} - {{ dimension }}, - {%- endfor %} - {{ pivot_column }}, - {{ value_column }} - FROM {{ source_table }} -) -PIVOT ( - {%- if agg_method|lower == "median" %} +select * +from + ( + select + {%- for dimension in dimensions %} {{ dimension }}, {%- endfor %} + {{ pivot_column }}, + {{ value_column }} + from {{ source_table }} + ) + pivot( + {%- if agg_method|lower == "median" %} {{ raise_exception('BigQuery does not support median aggregation while pivoting.') }} - {%- else %} - {{ agg_method }} ( {{ pivot_column }} ) as _ - {%- endif %} - FOR {{ value_column }} IN ( - {%- for val in distinct_vals %} - {%- if val is string -%} - '{{ val }}' {{ cleanse_name(val) }} - {%- elif val is none -%} - NULL NULL_REC - {%- else -%} - {{ val }} {{ cleanse_name(val) }} - {%- endif -%} - {{', ' if not loop.last else ''}} - {%- endfor -%} - ) -) \ No newline at end of file + {%- else %}{{ agg_method }} ({{ pivot_column }}) as _ + {%- endif %} + for {{ value_column }} in ( + {%- for val in distinct_vals %} + {%- if val is string -%}'{{ val }}' {{ cleanse_name(val) }} + {%- elif val is none -%}null null_rec + 
{%- else -%}{{ val }} {{ cleanse_name(val) }} + {%- endif -%} + {{ ', ' if not loop.last else '' }} + {%- endfor -%} + ) + ) diff --git a/rasgotransforms/rasgotransforms/transforms/pivot/snowflake/pivot.sql b/rasgotransforms/rasgotransforms/transforms/pivot/snowflake/pivot.sql index 29065f79..587575c5 100644 --- a/rasgotransforms/rasgotransforms/transforms/pivot/snowflake/pivot.sql +++ b/rasgotransforms/rasgotransforms/transforms/pivot/snowflake/pivot.sql @@ -10,13 +10,10 @@ Instead, please use the `list_of_vals` argument to provide these values explicit {%- endset -%} {%- if list_of_vals is not defined -%} - {%- set results = run_query(distinct_val_query) -%} - {%- if results is none -%} - {{ raise_exception(run_query_error_message) }} - {%- endif -%} - {%- set distinct_vals = results[results.columns[0]].to_list() -%} -{%- else -%} - {%- set distinct_vals = list_of_vals -%} +{%- set results = run_query(distinct_val_query) -%} +{%- if results is none -%} {{ raise_exception(run_query_error_message) }} {%- endif -%} +{%- set distinct_vals = results[results.columns[0]].to_list() -%} +{%- else -%} {%- set distinct_vals = list_of_vals -%} {%- endif -%} {# Jinja Macro to get the comma separated cleansed name list #} @@ -27,7 +24,20 @@ Instead, please use the `list_of_vals` argument to provide these values explicit {%- endmacro -%} -SELECT {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ get_values(distinct_vals) }} -FROM ( SELECT {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ pivot_column }}, {{ value_column }} FROM {{ source_table }}) -PIVOT ( {{ agg_method }} ( {{ pivot_column }} ) FOR {{ value_column }} IN ( '{{ distinct_vals | join("', '") }}' ) ) as p -( {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ get_values(distinct_vals) }} ) \ No newline at end of file +select + {{ dimensions | join(", ") }}{{ ',' if dimensions else '' }} {{ get_values(distinct_vals) }} +from + ( + select + {{ dimensions | join(", ") }}{{ ',' if 
dimensions else '' }} {{ pivot_column }}, + {{ value_column }} + from {{ source_table }} + ) + pivot( + {{ agg_method }} ({{ pivot_column }}) for {{ value_column }} in ( + '{{ distinct_vals | join("', '") }}' + ) + ) as p + ( + {{ dimensions | join(", ") }}{{ ',' if dimensions else '' }} {{ get_values(distinct_vals) }} + ) diff --git a/rasgotransforms/rasgotransforms/transforms/plot/bigquery/plot.sql b/rasgotransforms/rasgotransforms/transforms/plot/bigquery/plot.sql index 76ef78e2..5c7382fd 100644 --- a/rasgotransforms/rasgotransforms/transforms/plot/bigquery/plot.sql +++ b/rasgotransforms/rasgotransforms/transforms/plot/bigquery/plot.sql @@ -4,15 +4,15 @@ {%- set filters = filters if filters is defined else [] -%} {%- set axis_type_dict = get_columns(source_table) -%} {%- set axis_type_response = axis_type_dict[x_axis].upper() -%} -{%- set group_by = [group_by] if group_by is defined and group_by is string else group_by-%} +{%- set group_by = [group_by] if group_by is defined and group_by is string else group_by -%} {%- if 'DATE' in axis_type_response or 'TIME' in axis_type_response -%} - {%- set axis_type = "date" -%} +{%- set axis_type = "date" -%} {%- elif 'NUM' in axis_type_response or 'FLOAT' in axis_type_response or 'INT' in axis_type_response or 'DECIMAL' in axis_type_response or 'DOUBLE' in axis_type_response or 'REAL' in axis_type_response -%} - {%- set axis_type = "numeric" -%} +{%- set axis_type = "numeric" -%} {%- elif 'BINARY' in axis_type_response or 'TEXT' in axis_type_response or 'BOOLEAN' in axis_type_response or 'CHAR' in axis_type_response or 'STRING' in axis_type_response or 'VARBINARY' in axis_type_response -%} - {%- set axis_type = "categorical" -%} +{%- set axis_type = "categorical" -%} {%- else -%} - {{ raise_exception('The column selected as an axis is not categorical, numeric, or datetime. 
Please choose an axis that is any of these data types and recreate the transform.') }} +{{ raise_exception('The column selected as an axis is not categorical, numeric, or datetime. Please choose an axis that is any of these data types and recreate the transform.') }} {%- endif -%} {%- if axis_type == 'date' -%} {%- if timeseries_options -%} @@ -20,7 +20,7 @@ {%- set end_date = '2030-01-01' if not timeseries_options.end_date else timeseries_options.end_date -%} {%- set time_grain = 'day' if not timeseries_options.time_grain else timeseries_options.time_grain -%} {%- else -%} -{{ raise_exception("Parameter 'timeseries_options' must be given when 'x_axis' is a column of type datetime")}} +{{ raise_exception("Parameter 'timeseries_options' must be given when 'x_axis' is a column of type datetime") }} {%- endif -%} {%- set num_days = (end_date|string|todatetime - start_date|string|todatetime).days + 1 -%} {%- do filters.append({'columnName': x_axis, 'operator': '>=', 'comparisonValue': "'" + start_date + "'" }) -%} @@ -41,8 +41,8 @@ {%- endset -%} {%- macro get_distinct_values(columns) -%} - {%- set target_column = (metrics.keys()|list)[0] -%} - {%- set aggregation_type = metrics[target_column][0] -%} +{%- set target_column = (metrics.keys()|list)[0] -%} +{%- set aggregation_type = metrics[target_column][0] -%} {%- set distinct_val_query %} select concat( @@ -57,13 +57,13 @@ order by vals desc limit {{ max_num_groups + 1}} {%- endset -%} - {%- set distinct_vals = run_query(distinct_val_query) -%} - {%- for val in distinct_vals.itertuples() -%} - {%- for column in distinct_vals.columns[:-1] -%} - {{ val[column] }}{{'_' if not loop.last else ''}} - {%- endfor -%} - {{ '|$|' if not loop.last else ''}} - {%- endfor %} +{%- set distinct_vals = run_query(distinct_val_query) -%} +{%- for val in distinct_vals.itertuples() -%} +{%- for column in distinct_vals.columns[:-1] -%} +{{ val[column] }}{{ '_' if not loop.last else '' }} +{%- endfor -%} +{{ '|$|' if not loop.last else '' 
}} +{%- endfor %} {%- endmacro -%} {%- if group_by -%} @@ -73,340 +73,341 @@ {%- endif -%} {%- endif -%} -with combined_dimensions as ( - select *, - concat(''{{',' if group_by}} - {%- for column in group_by %} - {{ column }}{{", '_', " if not loop.last}} +with + combined_dimensions as ( + select + *, + concat( + ''{{ ',' if group_by }} + {%- for column in group_by %} + {{ column }}{{ ", '_', " if not loop.last }} + {%- endfor %} + ) as dimensions + from {{ source_table }} + ), + {%- if axis_type == 'date' %} + source_query as ( + select + cast(date_trunc(cast({{ x_axis }} as date), day) as date) as date_day, + {%- if group_by %} + case + when + dimensions in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then dimensions + {%- if 'None' in distinct_values %} + when dimensions is null then 'None' + {%- endif %} + else '_OtherGroup' + end as dimensions, + {%- endif %} + {%- for column in metrics.keys() %} + {{ column }}{{ ',' if not loop.last }} {%- endfor %} - ) as dimensions - from {{ source_table }} -), -{%- if axis_type == 'date' %} -source_query as ( - select - cast(date_trunc(cast({{ x_axis }} as date), day) as date) as date_day, - {%- if group_by %} - case - when dimensions in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} - {%- endfor %} - ) then dimensions - {%- if 'None' in distinct_values %} - when dimensions is null then 'None' + from combined_dimensions {{ filter_statement }} + ), + {%- if time_grain|lower == 'all' %} + spine as ( + select + cast('{{ start_date }}' as timestamp) as {{ x_axis }}_min, + cast('{{ end_date }}' as timestamp) as {{ x_axis }}_max + ), + joined as (select * from source_query cross join spine), + tidy_data as ( + select + {{ x_axis }}_min, + {{ x_axis }}_max, + {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ 
aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}{{ column }} + ) + as {{ cleanse_name(aggregation_type + '_' + column) }}{{ ',' if not (loop.last and oloop.last) }} + {%- endfor %} + {%- endfor %} + from joined + group by 1, 2{{ ', 3' if group_by }} + ) + {%- else %} + calendar as ( + select + date_day, + date_trunc(date_day, week) as date_week, + date_trunc(date_day, month) as date_month, + date_trunc(date_day, quarter) as date_quarter, + date_trunc(date_day, year) as date_year + from + unnest( + generate_date_array('{{ start_date }}', '{{ end_date }}') + ) as date_day + ), + {%- if group_by %} + spine__time as (select date_{{ time_grain }} as period, date_day from calendar), + spine__values__dimensions as (select distinct dimensions from source_query), + spine as (select * from spine__time cross join spine__values__dimensions), + {%- else %} + spine as (select date_{{ time_grain }} as period, date_day from calendar), + {%- endif %} + joined as ( + select + spine.period, + {{ '\n spine.dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }} source_query.{{ column }} + ) as {{ cleanse_name(aggregation_type + '_' + column) }}, + {%- endfor %} + {%- endfor %} + logical_or(source_query.date_day is not null) as has_data + from spine + left outer join + source_query on source_query.date_day = spine.date_day + {%- if group_by %} + and ( + source_query.dimensions = spine.dimensions + or source_query.dimensions is null + and spine.dimensions is null + ) {%- endif %} - else '_OtherGroup' - end as dimensions, - {%- endif %} - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from combined_dimensions - {{ filter_statement }} -), -{%- if 
time_grain|lower == 'all'%} -spine as ( - select - cast('{{ start_date }}' as timestamp) as {{ x_axis }}_min, - cast('{{ end_date }}' as timestamp) as {{ x_axis }}_max -), -joined as ( - select * - from source_query - cross join spine -), -tidy_data as ( - select - {{ x_axis }}_min, - {{ x_axis }}_max, {{ '\n dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} - {%- endfor %} - {%- endfor %} - from joined - group by 1, 2{{ ', 3' if group_by}} -) -{%- else %} -calendar as ( - select - date_day, - date_trunc(date_day, week) as date_week, - date_trunc(date_day, month) as date_month, - date_trunc(date_day, quarter) as date_quarter, - date_trunc(date_day, year) as date_year - from unnest(generate_date_array('{{ start_date }}', '{{ end_date }}')) as date_day -), -{%- if group_by %} -spine__time as ( + group by 1{{ ', 2' if group_by }} + ), + bounded as ( select - date_{{ time_grain }} as period, - date_day - from calendar -), -spine__values__dimensions as ( - select distinct dimensions - from source_query -), -spine as ( - select * - from spine__time - cross join spine__values__dimensions -), -{%- else %} -spine as ( + *, + min(case when has_data then period end) over () as lower_bound, + max(case when has_data then period end) over () as upper_bound + from joined + ), + tidy_data as ( select - date_{{ time_grain }} as period, - date_day - from calendar -), -{%- endif %} -joined as ( - select - spine.period,{{ '\n spine.dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in 
aggregation_type|lower else ''}} source_query.{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}, - {%- endfor %} - {%- endfor %} - logical_or(source_query.date_day is not null) as has_data - from spine - left outer join source_query on source_query.date_day = spine.date_day - {%- if group_by %} - and (source_query.dimensions = spine.dimensions - or source_query.dimensions is null and spine.dimensions is null) - {%- endif %} - group by 1{{ ', 2' if group_by }} -), -bounded as ( - select - *, - min(case when has_data then period end) over () as lower_bound, - max(case when has_data then period end) over () as upper_bound - from joined -), -tidy_data as ( - select - cast(period as timestamp) as {{ x_axis }}_min, - {%- if time_grain|lower == 'quarter' %} - cast(date_add(period, INTERVAL 3 month) as timestamp) as {{ x_axis }}_max, - {%- else %} - cast(date_add(period, INTERVAL 1 {{ time_grain }}) as timestamp) as {{ x_axis }}_max, - {%- endif %}{{ '\n dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} - {%- endfor %} - {%- endfor %} - from bounded - where period >= lower_bound - and period <= upper_bound - order by 1, 2{{ ', 3' if group_by }} -) -{%- endif %} + cast(period as timestamp) as {{ x_axis }}_min, + {%- if time_grain|lower == 'quarter' %} + cast(date_add(period, interval 3 month) as timestamp) as {{ x_axis }}_max, + {%- else %} + cast( + date_add(period, interval 1 {{ time_grain }}) as timestamp + ) as {{ x_axis }}_max, + {%- endif %} {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ cleanse_name(aggregation_type + '_' + column) }}{{ ',' if not (loop.last and oloop.last) }} + {%- endfor %} + {%- endfor %} + from bounded + where period >= lower_bound and period <= 
upper_bound + order by 1, 2{{ ', 3' if group_by }} + ) + {%- endif %} -{%- elif axis_type == 'numeric' %} + {%- elif axis_type == 'numeric' %} -axis_range as ( - select - min({{ x_axis }}) - 1 as min_val, - max({{ x_axis }}) + 1 as max_val - from {{ source_table }} - where {{ x_axis }} is not null -), -edges as ( - select - min_val, - max_val, - (min_val-max_val) val_range, - ((max_val-min_val)/{{ bucket_count }}) bucket_size - from axis_range -), -buckets as ( - select - dimensions, - min_val, - max_val, - bucket_size, - cast({{ x_axis }} as numeric) as col_a_val, - range_bucket(cast({{ x_axis }} as numeric), generate_array(min_val, max_val, (max_val - min_val)/{{ bucket_count }})) as bucket, - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from - combined_dimensions - cross join edges - {{ filter_statement }} -), -source_query as ( - select - bucket, - {%- if group_by %} - case - when dimensions in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} - {%- endfor %} - ) then dimensions - {%- if 'None' in distinct_values %} - when dimensions is null then 'None' + axis_range as ( + select min({{ x_axis }}) - 1 as min_val, max({{ x_axis }}) + 1 as max_val + from {{ source_table }} + where {{ x_axis }} is not null + ), + edges as ( + select + min_val, + max_val, + (min_val - max_val) val_range, + ((max_val - min_val) /{{ bucket_count }}) bucket_size + from axis_range + ), + buckets as ( + select + dimensions, + min_val, + max_val, + bucket_size, + cast({{ x_axis }} as numeric) as col_a_val, + range_bucket( + cast({{ x_axis }} as numeric), + generate_array( + min_val, max_val, (max_val - min_val) /{{ bucket_count }} + ) + ) as bucket, + {%- for column in metrics.keys() %} {{ column }}{{ ',' if not loop.last }} + {%- endfor %} + from combined_dimensions + cross join edges {{ filter_statement }} + ), + source_query as ( + select + bucket, + {%- if group_by %} + case + when + dimensions in ( + 
{%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then dimensions + {%- if 'None' in distinct_values %} + when dimensions is null then 'None' + {%- endif %} + else '_OtherGroup' + end as dimensions, {%- endif %} - else '_OtherGroup' - end as dimensions, - {%- endif %} - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from buckets -), -{%- if group_by %} -bucket_spine as ( - select bucket - from unnest(generate_array(1, {{ bucket_count }})) as bucket -), -spine__values__dimensions as ( - select distinct dimensions from source_query -), -spine as ( - select * from bucket_spine - cross join spine__values__dimensions -), -{%- else %} -spine as ( - select bucket - from unnest(generate_array(1, {{ bucket_count }})) as bucket -), -{%- endif %} -joined as ( - select {{ '\n spine.dimensions,' if group_by}} - spine.bucket, - {%- for column, aggs in metrics.items() %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}source_query.{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}, - {%- endfor %} - {%- endfor %} - logical_or(source_query.bucket is not null) as has_data - from spine - left outer join source_query on source_query.bucket = spine.bucket - {%- if group_by %} - and (source_query.dimensions = spine.dimensions - or source_query.dimensions is null and spine.dimensions is null) - {%- endif %} - group by 1{{ ', 2' if group_by }} -), -tidy_data as ( - select - min_val+((bucket-1)*bucket_size) as {{ x_axis }}_min, - min_val+(bucket*bucket_size) as {{ x_axis }}_max, {{ '\n dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ cleanse_name(aggregation_type + '_' + column)}}{{ '' if loop.last and oloop.last else ',' }} - {%- endfor %} - {%- endfor %} 
- from joined + {%- for column in metrics.keys() %} {{ column }}{{ ',' if not loop.last }} + {%- endfor %} + from buckets + ), + {%- if group_by %} + bucket_spine as ( + select bucket from unnest(generate_array(1, {{ bucket_count }})) as bucket + ), + spine__values__dimensions as (select distinct dimensions from source_query), + spine as (select * from bucket_spine cross join spine__values__dimensions), + {%- else %} + spine as ( + select bucket from unnest(generate_array(1, {{ bucket_count }})) as bucket + ), + {%- endif %} + joined as ( + select + {{ '\n spine.dimensions,' if group_by }} + spine.bucket, + {%- for column, aggs in metrics.items() %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}source_query.{{ column }} + ) as {{ cleanse_name(aggregation_type + '_' + column) }}, + {%- endfor %} + {%- endfor %} + logical_or(source_query.bucket is not null) as has_data + from spine + left outer join + source_query on source_query.bucket = spine.bucket + {%- if group_by %} + and ( + source_query.dimensions = spine.dimensions + or source_query.dimensions is null + and spine.dimensions is null + ) + {%- endif %} + group by 1{{ ', 2' if group_by }} + ), + tidy_data as ( + select + min_val + ((bucket -1) * bucket_size) as {{ x_axis }}_min, + min_val + (bucket * bucket_size) as {{ x_axis }}_max, + {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ cleanse_name(aggregation_type + '_' + column) }}{{ '' if loop.last and oloop.last else ',' }} + {%- endfor %} + {%- endfor %} + from joined cross join edges -) + ) -{%- elif axis_type == 'categorical' -%} -source_query as ( - select - {{ x_axis }}, - {%- if group_by %} - case - when dimensions in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} - {%- endfor %} - ) then 
dimensions - {%- if 'None' in distinct_values %} - when dimensions is null then 'None' + {%- elif axis_type == 'categorical' -%} + source_query as ( + select + {{ x_axis }}, + {%- if group_by %} + case + when + dimensions in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then dimensions + {%- if 'None' in distinct_values %} + when dimensions is null then 'None' + {%- endif %} + else '_OtherGroup' + end as dimensions, {%- endif %} - else '_OtherGroup' - end as dimensions, - {%- endif %} - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from combined_dimensions - {{ filter_statement }} -), -tidy_data as ( - select - {%- if not group_by or not flatten %} - {{ x_axis }}, - {%- else %} - {{ x_axis }} as {{ x_axis }}_min, - {{ x_axis }} as {{ x_axis }}_max, - {%- endif %}{{ '\n dimensions,' if group_by}} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop -%} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ '' if loop.last and oloop.last else ',' }} - {%- endfor -%} - {%- endfor %} - from source_query - group by 1{{ ', 2' if group_by }}{{ ', 3' if group_by and flatten }} -) -{%- endif -%} -{%- if not group_by or not flatten %} -select * from tidy_data order by 1 {{ x_axis_order if x_axis_order }} -{%- else -%} -, -{% set metric_names = [] -%} -{%- set column_names = [] -%} -{%- for column, aggs in metrics.items() -%} -{%- for aggregation_type in aggs -%} -{%- set metric_name = cleanse_name(aggregation_type + '_' + column) -%} -{%- do metric_names.append(metric_name) %} -pivoted__{{ metric_name }} as ( - select * from ( + {%- for column in metrics.keys() %} {{ column }}{{ ',' if not loop.last }} + {%- endfor %} + from combined_dimensions {{ filter_statement 
}} + ), + tidy_data as ( select - {{ x_axis }}_min as x_min_{{ metric_name }}, - {{ x_axis }}_max as x_max_{{ metric_name }}, - {{ metric_name }}, - dimensions - from tidy_data - ) - pivot ( - sum( {{ metric_name }} ) as {{ metric_name }} - for dimensions in ( - {%- for val in distinct_values %} - {%- set column_name = metric_name + '_' + (val|string) -%} - {%- do column_names.append(column_name) -%} - {%- if val is string -%} - '{{ val }}' - {%- else -%} - {{ val }} - {%- endif -%} - {{', ' if not loop.last else ''}} + {%- if not group_by or not flatten %} {{ x_axis }}, + {%- else %} + {{ x_axis }} as {{ x_axis }}_min, {{ x_axis }} as {{ x_axis }}_max, + {%- endif %} {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop -%} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}{{ column }} + ) + as {{ cleanse_name(aggregation_type + '_' + column) }}{{ '' if loop.last and oloop.last else ',' }} {%- endfor -%} - ) + {%- endfor %} + from source_query + group by 1{{ ', 2' if group_by }}{{ ', 3' if group_by and flatten }} ) -), -{%- endfor %} -{%- endfor %} -pivoted as ( - select * - from pivoted__{{ metric_names[0] }} + {%- endif -%} +{%- if not group_by or not flatten %} +select * from tidy_data order by 1 {{ x_axis_order if x_axis_order }} +{%- else -%} + , + {% set metric_names = [] -%} + {%- set column_names = [] -%} + {%- for column, aggs in metrics.items() -%} + {%- for aggregation_type in aggs -%} + {%- set metric_name = cleanse_name(aggregation_type + '_' + column) -%} + {%- do metric_names.append(metric_name) %} + pivoted__{{ metric_name }} as ( + select * + from + ( + select + {{ x_axis }}_min as x_min_{{ metric_name }}, + {{ x_axis }}_max as x_max_{{ metric_name }}, + {{ metric_name }}, + dimensions + from tidy_data + ) + pivot( + sum({{ metric_name }}) as {{ metric_name }} + for 
dimensions in ( + {%- for val in distinct_values %} + {%- set column_name = metric_name + '_' + (val|string) -%} + {%- do column_names.append(column_name) -%} + {%- if val is string -%}'{{ val }}' + {%- else -%}{{ val }} + {%- endif -%} + {{ ', ' if not loop.last else '' }} + {%- endfor -%} + ) + ) + ), + {%- endfor %} + {%- endfor %} + pivoted as ( + select * + from pivoted__{{ metric_names[0] }} {%- for i in range(1, metric_names|length) %} - left join pivoted__{{ metric_names[i] }} + left join + pivoted__{{ metric_names[i] }} on x_min_{{ metric_names[0] }} = x_min_{{ metric_names[i] }} and x_max_{{ metric_names[0] }} = x_max_{{ metric_names[i] }} {%- endfor %} -) + ) select - {%- if axis_type == 'categorical' %} - x_min_{{ metric_names[0] }} as {{ x_axis }}, + {%- if axis_type == 'categorical' %} x_min_{{ metric_names[0] }} as {{ x_axis }}, {%- else %} x_min_{{ metric_names[0] }} as {{ x_axis }}_min, x_max_{{ metric_names[0] }} as {{ x_axis }}_max, {%- endif %} - {%- for column_name in column_names %} - {{ column_name }}{{ ',' if not loop.last }} + {%- for column_name in column_names %} {{ column_name }}{{ ',' if not loop.last }} {%- endfor %} -from pivoted order by 1 {{ x_axis_order if x_axis_order }} -{%- endif %} \ No newline at end of file +from pivoted +order by 1 {{ x_axis_order if x_axis_order }} +{%- endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/plot/snowflake/plot.sql b/rasgotransforms/rasgotransforms/transforms/plot/snowflake/plot.sql index 4d7cabbf..ad337e96 100644 --- a/rasgotransforms/rasgotransforms/transforms/plot/snowflake/plot.sql +++ b/rasgotransforms/rasgotransforms/transforms/plot/snowflake/plot.sql @@ -4,15 +4,15 @@ {%- set filters = filters if filters is defined else [] -%} {%- set axis_type_dict = get_columns(source_table) -%} {%- set axis_type_response = axis_type_dict[x_axis.upper()].upper() -%} -{%- set group_by = [group_by] if group_by is defined and group_by is string else group_by-%} +{%- set group_by = [group_by] 
if group_by is defined and group_by is string else group_by -%} {%- if 'DATE' in axis_type_response or 'TIME' in axis_type_response -%} - {%- set axis_type = "date" -%} +{%- set axis_type = "date" -%} {%- elif 'NUM' in axis_type_response or 'FLOAT' in axis_type_response or 'INT' in axis_type_response or 'DECIMAL' in axis_type_response or 'DOUBLE' in axis_type_response or 'REAL' in axis_type_response -%} - {%- set axis_type = "numeric" -%} +{%- set axis_type = "numeric" -%} {%- elif 'BINARY' in axis_type_response or 'TEXT' in axis_type_response or 'BOOLEAN' in axis_type_response or 'CHAR' in axis_type_response or 'STRING' in axis_type_response or 'VARBINARY' in axis_type_response -%} - {%- set axis_type = "categorical" -%} +{%- set axis_type = "categorical" -%} {%- else -%} - {{ raise_exception('The column selected as an axis is not categorical, numeric, or datetime. Please choose an axis that is any of these data types and recreate the transform.') }} +{{ raise_exception('The column selected as an axis is not categorical, numeric, or datetime. 
Please choose an axis that is any of these data types and recreate the transform.') }} {%- endif -%} {%- if axis_type == 'date' -%} {%- if timeseries_options -%} @@ -20,7 +20,7 @@ {%- set end_date = '2030-01-01' if not timeseries_options.end_date else timeseries_options.end_date -%} {%- set time_grain = 'day' if not timeseries_options.time_grain else timeseries_options.time_grain -%} {%- else -%} -{{ raise_exception("Parameter 'timeseries_options' must be given when 'x_axis' is a column of type datetime")}} +{{ raise_exception("Parameter 'timeseries_options' must be given when 'x_axis' is a column of type datetime") }} {%- endif -%} {%- set num_days = (end_date|string|todatetime - start_date|string|todatetime).days + 1 -%} {%- do filters.append({'columnName': x_axis, 'operator': '>=', 'comparisonValue': "'" + start_date + "'" }) -%} @@ -41,8 +41,8 @@ {%- endset -%} {%- macro get_distinct_values(columns) -%} - {%- set target_column = (metrics.keys()|list)[0] -%} - {%- set aggregation_type = metrics[target_column][0] -%} +{%- set target_column = (metrics.keys()|list)[0] -%} +{%- set aggregation_type = metrics[target_column][0] -%} {%- set distinct_val_query %} select concat( @@ -57,13 +57,13 @@ order by vals desc limit {{ max_num_groups + 1}} {%- endset -%} - {%- set distinct_vals = run_query(distinct_val_query) -%} - {%- for val in distinct_vals.itertuples() -%} - {%- for column in distinct_vals.columns[:-1] -%} - {{ val[column] }}{{'_' if not loop.last else ''}} - {%- endfor -%} - {{ '|$|' if not loop.last else ''}} - {%- endfor %} +{%- set distinct_vals = run_query(distinct_val_query) -%} +{%- for val in distinct_vals.itertuples() -%} +{%- for column in distinct_vals.columns[:-1] -%} +{{ val[column] }}{{ '_' if not loop.last else '' }} +{%- endfor -%} +{{ '|$|' if not loop.last else '' }} +{%- endfor %} {%- endmacro -%} {%- if group_by -%} @@ -74,364 +74,360 @@ {%- endif -%} {%- if axis_type == 'date' %} -with source_query as ( - select - cast(date_trunc('day', 
cast({{ x_axis }} as date)) as date) as date_day, - {%- if group_by %} - concat( - {%- for column in group_by %} - {{ column }}{{", '_', " if not loop.last}} - {%- endfor %} - ) as combined_dimensions, - case - when combined_dimensions in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} +with + source_query as ( + select + cast(date_trunc('day', cast({{ x_axis }} as date)) as date) as date_day, + {%- if group_by %} + concat( + {%- for column in group_by %} + {{ column }}{{ ", '_', " if not loop.last }} {%- endfor %} - ) then combined_dimensions - {%- if 'None' in distinct_values %} - when combined_dimensions is null then 'None' + ) as combined_dimensions, + case + when + combined_dimensions in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then combined_dimensions + {%- if 'None' in distinct_values %} + when combined_dimensions is null then 'None' + {%- endif %} + else '_OtherGroup' + end as dimensions, {%- endif %} - else '_OtherGroup' - end as dimensions, - {%- endif %} - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from {{ source_table }} - {{ filter_statement }} -), -{%- if time_grain|lower == 'all'%} -spine as ( - select - cast('{{ start_date }}' as timestamp_ntz) as {{ x_axis }}_min, - cast('{{ end_date }}' as timestamp_ntz) as {{ x_axis }}_max -), -joined as ( - select * - from source_query - cross join spine -), -tidy_data as ( - select - {{ x_axis }}_min, - {{ x_axis }}_max, {{ '\n dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} - {%- endfor %} - {%- endfor %} - from joined - group 
by 1, 2{{ ', 3' if group_by}} -) -{%- else %} -calendar as ( - select + {%- for column in metrics.keys() %} + {{ column }}{{ ',' if not loop.last }} + {%- endfor %} + from {{ source_table }} {{ filter_statement }} + ), + {%- if time_grain|lower == 'all' %} + spine as ( + select + cast('{{ start_date }}' as timestamp_ntz) as {{ x_axis }}_min, + cast('{{ end_date }}' as timestamp_ntz) as {{ x_axis }}_max + ), + joined as (select * from source_query cross join spine), + tidy_data as ( + select + {{ x_axis }}_min, + {{ x_axis }}_max, + {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}{{ column }} + ) + as {{ cleanse_name(aggregation_type + '_' + column) }}{{ ',' if not (loop.last and oloop.last) }} + {%- endfor %} + {%- endfor %} + from joined + group by 1, 2{{ ', 3' if group_by }} + ) + {%- else %} + calendar as ( + select row_number() over (order by null) as interval_id, - cast(dateadd( - 'day', - interval_id-1, - '{{ start_date }}'::timestamp_ntz) as date) as date_day, + cast( + dateadd( + 'day', interval_id -1, '{{ start_date }}'::timestamp_ntz + ) as date + ) as date_day, cast(date_trunc('week', date_day) as date) as date_week, cast(date_trunc('month', date_day) as date) as date_month, case - when month(date_day) in (1, 2, 3) then date_from_parts(year(date_day), 1, 1) - when month(date_day) in (4, 5, 6) then date_from_parts(year(date_day), 4, 1) - when month(date_day) in (7, 8, 9) then date_from_parts(year(date_day), 7, 1) - when month(date_day) in (10, 11, 12) then date_from_parts(year(date_day), 10, 1) + when month(date_day) in (1, 2, 3) + then date_from_parts(year(date_day), 1, 1) + when month(date_day) in (4, 5, 6) + then date_from_parts(year(date_day), 4, 1) + when month(date_day) in (7, 8, 9) + then 
date_from_parts(year(date_day), 7, 1) + when month(date_day) in (10, 11, 12) + then date_from_parts(year(date_day), 10, 1) end as date_quarter, cast(date_trunc('year', date_day) as date) as date_year - from table (generator(rowcount => {{ num_days }})) -), -{%- if group_by %} -spine__time as ( + from table(generator(rowcount => {{ num_days }})) + ), + {%- if group_by %} + spine__time as (select date_{{ time_grain }} as period, date_day from calendar), + spine__values__dimensions as (select distinct dimensions from source_query), + spine as (select * from spine__time cross join spine__values__dimensions), + {%- else %} + spine as (select date_{{ time_grain }} as period, date_day from calendar), + {%- endif %} + joined as ( + select + spine.period, + {{ '\n spine.dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }} source_query.{{ column }} + ) as {{ cleanse_name(aggregation_type + '_' + column) }}, + {%- endfor %} + {%- endfor %} + boolor_agg(source_query.date_day is not null) as has_data + from spine + left outer join + source_query on source_query.date_day = spine.date_day + {%- if group_by %} + and ( + source_query.dimensions = spine.dimensions + or source_query.dimensions is null + and spine.dimensions is null + ) + {%- endif %} + group by 1{{ ', 2' if group_by }} + ), + bounded as ( select - date_{{ time_grain }} as period, - date_day - from calendar -), -spine__values__dimensions as ( - select distinct dimensions - from source_query -), -spine as ( - select * - from spine__time - cross join spine__values__dimensions -), -{%- else %} -spine as ( + *, + min(case when has_data then period end) over () as lower_bound, + max(case when has_data then period end) over () as upper_bound + from joined + ), + tidy_data as ( select - date_{{ time_grain }} as period, - 
date_day - from calendar -), -{%- endif %} -joined as ( - select - spine.period,{{ '\n spine.dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}} source_query.{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}, - {%- endfor %} - {%- endfor %} - boolor_agg(source_query.date_day is not null) as has_data - from spine - left outer join source_query on source_query.date_day = spine.date_day - {%- if group_by %} - and (source_query.dimensions = spine.dimensions - or source_query.dimensions is null and spine.dimensions is null) - {%- endif %} - group by 1{{ ', 2' if group_by }} -), -bounded as ( - select - *, - min(case when has_data then period end) over () as lower_bound, - max(case when has_data then period end) over () as upper_bound - from joined -), -tidy_data as ( - select - cast(period as timestamp) as {{ x_axis }}_min, - {%- if time_grain|lower == 'quarter' %} - dateadd('second', -1, dateadd('month',3, {{ x_axis }}_min)) as {{ x_axis }}_max, - {%- else %} - dateadd('second', -1, dateadd('{{ time_grain }}',1, {{ x_axis }}_min)) as {{ x_axis }}_max, - {%- endif %}{{ '\n dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} - {%- endfor %} - {%- endfor %} - from bounded - where period >= lower_bound - and period <= upper_bound - order by 1, 2{{ ', 3' if group_by }} -) -{%- endif %} + cast(period as timestamp) as {{ x_axis }}_min, + {%- if time_grain|lower == 'quarter' %} + dateadd( + 'second', -1, dateadd('month', 3, {{ x_axis }}_min) + ) as {{ x_axis }}_max, + {%- else %} + dateadd( + 'second', -1, dateadd('{{ time_grain }}', 1, {{ x_axis }}_min) + ) as {{ x_axis }}_max, + {%- 
endif %} {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ cleanse_name(aggregation_type + '_' + column) }}{{ ',' if not (loop.last and oloop.last) }} + {%- endfor %} + {%- endfor %} + from bounded + where period >= lower_bound and period <= upper_bound + order by 1, 2{{ ', 3' if group_by }} + ) + {%- endif %} {%- elif axis_type == 'numeric' %} -with axis_range as ( - select - min({{ x_axis }}) - 1 as min_val, - max({{ x_axis }}) + 1 as max_val - from {{ source_table }} - where {{ x_axis }} is not null -), -edges as ( - select - min_val, - max_val, - (min_val-max_val) val_range, - ((max_val-min_val)/{{ bucket_count }}) bucket_size - from axis_range -), -buckets as ( - select - {%- for column in group_by %} - {{ column }}, - {%- endfor %} - min_val, - max_val, - bucket_size, - cast({{ x_axis }} as float) as col_a_val, - width_bucket(col_a_val, min_val, max_val, {{ bucket_count }}) as bucket, - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from - {{ source_table }} - cross join edges - {{ filter_statement }} -), -source_query as ( - select - bucket, - {%- if group_by %} - concat( - {%- for column in group_by %} - {{ column }}{{", '_', " if not loop.last}} +with + axis_range as ( + select min({{ x_axis }}) - 1 as min_val, max({{ x_axis }}) + 1 as max_val + from {{ source_table }} + where {{ x_axis }} is not null + ), + edges as ( + select + min_val, + max_val, + (min_val - max_val) val_range, + ((max_val - min_val) /{{ bucket_count }}) bucket_size + from axis_range + ), + buckets as ( + select + {%- for column in group_by %} {{ column }}, + {%- endfor %} + min_val, + max_val, + bucket_size, + cast({{ x_axis }} as float) as col_a_val, + width_bucket(col_a_val, min_val, max_val, {{ bucket_count }}) as bucket, + {%- for column in metrics.keys() %} {{ column }}{{ ',' if not loop.last }} {%- endfor %} - ) as 
combined_dimensions, - case - when combined_dimensions in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} + from {{ source_table }} + cross join edges {{ filter_statement }} + ), + source_query as ( + select + bucket, + {%- if group_by %} + concat( + {%- for column in group_by %} + {{ column }}{{ ", '_', " if not loop.last }} {%- endfor %} - ) then combined_dimensions - {%- if 'None' in distinct_values %} - when combined_dimensions is null then 'None' + ) as combined_dimensions, + case + when + combined_dimensions in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then combined_dimensions + {%- if 'None' in distinct_values %} + when combined_dimensions is null then 'None' + {%- endif %} + else '_OtherGroup' + end as dimensions, {%- endif %} - else '_OtherGroup' - end as dimensions, - {%- endif %} - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from buckets -), -{%- if group_by %} -bucket_spine as ( - select - row_number() over (order by null) as bucket - from table (generator(rowcount => {{ bucket_count }})) -), -spine__values__dimensions as ( - select distinct dimensions from source_query -), -spine as ( - select * from bucket_spine - cross join spine__values__dimensions -), -{%- else %} -spine as ( - select - row_number() over (order by null) as bucket - from table (generator(rowcount => {{ bucket_count }})) -), -{%- endif %} -joined as ( - select {{ '\n spine.dimensions,' if group_by}} - spine.bucket, - {%- for column, aggs in metrics.items() %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}source_query.{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}, - {%- endfor %} - {%- endfor %} - boolor_agg(source_query.bucket is not null) as has_data - from spine - left outer join 
source_query on source_query.bucket = spine.bucket - {%- if group_by %} - and (source_query.dimensions = spine.dimensions - or source_query.dimensions is null and spine.dimensions is null) - {%- endif %} - group by 1{{ ', 2' if group_by }} -), -tidy_data as ( - select - min_val+((bucket-1)*bucket_size) as {{ x_axis }}_min, - min_val+(bucket*bucket_size) as {{ x_axis }}_max, {{ '\n dimensions,' if group_by }} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ cleanse_name(aggregation_type + '_' + column)}}{{ '' if loop.last and oloop.last else ',' }} - {%- endfor %} - {%- endfor %} - from joined + {%- for column in metrics.keys() %} {{ column }}{{ ',' if not loop.last }} + {%- endfor %} + from buckets + ), + {%- if group_by %} + bucket_spine as ( + select row_number() over (order by null) as bucket + from table(generator(rowcount => {{ bucket_count }})) + ), + spine__values__dimensions as (select distinct dimensions from source_query), + spine as (select * from bucket_spine cross join spine__values__dimensions), + {%- else %} + spine as ( + select row_number() over (order by null) as bucket + from table(generator(rowcount => {{ bucket_count }})) + ), + {%- endif %} + joined as ( + select + {{ '\n spine.dimensions,' if group_by }} + spine.bucket, + {%- for column, aggs in metrics.items() %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}source_query.{{ column }} + ) as {{ cleanse_name(aggregation_type + '_' + column) }}, + {%- endfor %} + {%- endfor %} + boolor_agg(source_query.bucket is not null) as has_data + from spine + left outer join + source_query on source_query.bucket = spine.bucket + {%- if group_by %} + and ( + source_query.dimensions = spine.dimensions + or source_query.dimensions is null + and spine.dimensions is null + ) + {%- endif %} + group by 1{{ ', 2' if 
group_by }} + ), + tidy_data as ( + select + min_val + ((bucket -1) * bucket_size) as {{ x_axis }}_min, + min_val + (bucket * bucket_size) as {{ x_axis }}_max, + {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ cleanse_name(aggregation_type + '_' + column) }}{{ '' if loop.last and oloop.last else ',' }} + {%- endfor %} + {%- endfor %} + from joined cross join edges -) + ) {%- elif axis_type == 'categorical' -%} -with source_query as ( - select - {{ x_axis }}, - {%- if group_by %} - concat( - {%- for column in group_by %} - {{ column }}{{", '_', " if not loop.last}} - {%- endfor %} - ) as combined_dimensions, - case - when combined_dimensions in ( - {%- for val in distinct_values %} - '{{ val }}'{{',' if not loop.last else ''}} +with + source_query as ( + select + {{ x_axis }}, + {%- if group_by %} + concat( + {%- for column in group_by %} + {{ column }}{{ ", '_', " if not loop.last }} {%- endfor %} - ) then combined_dimensions - {%- if 'None' in distinct_values %} - when combined_dimensions is null then 'None' + ) as combined_dimensions, + case + when + combined_dimensions in ( + {%- for val in distinct_values %} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + then combined_dimensions + {%- if 'None' in distinct_values %} + when combined_dimensions is null then 'None' + {%- endif %} + else '_OtherGroup' + end as dimensions, {%- endif %} - else '_OtherGroup' - end as dimensions, - {%- endif %} - {%- for column in metrics.keys() %} - {{ column }}{{ ',' if not loop.last }} - {%- endfor %} - from {{ source_table }} - {{ filter_statement }} -), -tidy_data as ( - select - {%- if not group_by or not flatten %} - {{ x_axis }}, - {%- else %} - {{ x_axis }} as {{ x_axis }}_min, - {{ x_axis }} as {{ x_axis }}_max, - {%- endif %}{{ '\n dimensions,' if group_by}} - {%- for column, aggs in metrics.items() %} - {%- set oloop = loop -%} - {%- for 
aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ '' if loop.last and oloop.last else ',' }} - {%- endfor -%} - {%- endfor %} - from source_query - group by 1{{ ', 2' if group_by }}{{ ', 3' if group_by and flatten }} -) + {%- for column in metrics.keys() %} {{ column }}{{ ',' if not loop.last }} + {%- endfor %} + from {{ source_table }} {{ filter_statement }} + ), + tidy_data as ( + select + {%- if not group_by or not flatten %} {{ x_axis }}, + {%- else %} + {{ x_axis }} as {{ x_axis }}_min, {{ x_axis }} as {{ x_axis }}_max, + {%- endif %} {{ '\n dimensions,' if group_by }} + {%- for column, aggs in metrics.items() %} + {%- set oloop = loop -%} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}{{ column }} + ) + as {{ cleanse_name(aggregation_type + '_' + column) }}{{ '' if loop.last and oloop.last else ',' }} + {%- endfor -%} + {%- endfor %} + from source_query + group by 1{{ ', 2' if group_by }}{{ ', 3' if group_by and flatten }} + ) {%- endif -%} {%- if not group_by or not flatten %} select * from tidy_data order by 1 {{ x_axis_order if x_axis_order }} {%- else -%} -, -{% set metric_names = [] -%} -{%- set column_names = [] -%} -{%- for column, aggs in metrics.items() -%} -{%- for aggregation_type in aggs -%} -{%- set metric_name = cleanse_name(aggregation_type + '_' + column) -%} -{%- do metric_names.append(metric_name) %} -pivoted__{{ metric_name }} as ( - select - x_min_{{ metric_name }}, - x_max_{{ metric_name }}, - {% for val in distinct_values -%} - {%- set column_name = cleanse_name(val) + '_' + metric_name -%} - {%- do column_names.append(column_name) -%} - {{ column_name }}{{',' if not loop.last else ''}} - {%- endfor %} - from ( - select - {{ 
x_axis }}_min, - {{ x_axis }}_max, - {{ metric_name }}, - dimensions - from tidy_data - ) - pivot ( - sum({{ metric_name }}) for dimensions in ( + , + {% set metric_names = [] -%} + {%- set column_names = [] -%} + {%- for column, aggs in metrics.items() -%} + {%- for aggregation_type in aggs -%} + {%- set metric_name = cleanse_name(aggregation_type + '_' + column) -%} + {%- do metric_names.append(metric_name) %} + pivoted__{{ metric_name }} as ( + select + x_min_{{ metric_name }}, + x_max_{{ metric_name }}, {% for val in distinct_values -%} - '{{ val }}'{{',' if not loop.last else ''}} + {%- set column_name = cleanse_name(val) + '_' + metric_name -%} + {%- do column_names.append(column_name) -%} + {{ column_name }}{{ ',' if not loop.last else '' }} {%- endfor %} - ) - ) as p ( - x_min_{{ metric_name }}, - x_max_{{ metric_name }}, - {% for val in distinct_values -%} - {{ cleanse_name(val) + '_' + metric_name }}{{',' if not loop.last else ''}} - {%- endfor %} - ) -), -{%- endfor %} -{%- endfor %} -pivoted as ( - select * - from pivoted__{{ metric_names[0] }} + from + ( + select {{ x_axis }}_min, {{ x_axis }}_max, {{ metric_name }}, dimensions + from tidy_data + ) + pivot( + sum({{ metric_name }}) for dimensions in ( + {% for val in distinct_values -%} + '{{ val }}'{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + ) as p( + x_min_{{ metric_name }}, + x_max_{{ metric_name }}, + {% for val in distinct_values -%} + {{ cleanse_name(val) + '_' + metric_name }}{{ ',' if not loop.last else '' }} + {%- endfor %} + ) + ), + {%- endfor %} + {%- endfor %} + pivoted as ( + select * + from pivoted__{{ metric_names[0] }} {%- for i in range(1, metric_names|length) %} - left join pivoted__{{ metric_names[i] }} + left join + pivoted__{{ metric_names[i] }} on x_min_{{ metric_names[0] }} = x_min_{{ metric_names[i] }} and x_max_{{ metric_names[0] }} = x_max_{{ metric_names[i] }} {%- endfor %} -) -select - {%- if axis_type == 'categorical' %} - x_min_{{ metric_names[0] }} as {{ 
x_axis }}, + ) +select + {%- if axis_type == 'categorical' %} x_min_{{ metric_names[0] }} as {{ x_axis }}, {%- else %} x_min_{{ metric_names[0] }} as {{ x_axis }}_min, x_max_{{ metric_names[0] }} as {{ x_axis }}_max, {%- endif %} - {%- for column_name in column_names %} - {{ column_name }}{{ ',' if not loop.last }} + {%- for column_name in column_names %} {{ column_name }}{{ ',' if not loop.last }} {%- endfor %} -from pivoted order by 1 {{ x_axis_order if x_axis_order }} -{%- endif %} \ No newline at end of file +from pivoted +order by 1 {{ x_axis_order if x_axis_order }} +{%- endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql b/rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql index be0f346e..1adb95f9 100644 --- a/rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql +++ b/rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql @@ -1,7 +1,7 @@ {%- set source_col_names = get_columns(source_table) -%} {%- set alias = cleanse_name(prefix) -%} -SELECT -{%- for column in source_col_names %} - {{column}} AS {{ alias~'_'~column }}{{',' if not loop.last else ''}} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + {%- for column in source_col_names %} + {{ column }} as {{ alias~'_'~column }}{{ ',' if not loop.last else '' }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/query/bigquery/query.sql b/rasgotransforms/rasgotransforms/transforms/query/bigquery/query.sql index ee8c315d..9157dbd8 100644 --- a/rasgotransforms/rasgotransforms/transforms/query/bigquery/query.sql +++ b/rasgotransforms/rasgotransforms/transforms/query/bigquery/query.sql @@ -1,49 +1,50 @@ -WITH base_table as ( - SELECT * -{%- for formula in new_columns %} - , {{ formula }} as {{ cleanse_name(formula) }} -{%- endfor %} - FROM {{ source_table }} -), -filtered as ( - SELECT * - FROM base_table -{%- for filter in filters %} - {{ " WHERE " if loop.first else "" }} - {%- if 
filter is not mapping %} - {{ filter }} - {%- elif filter.operator|upper == 'CONTAINS' %} - {{ filter.operator }}({{ filter.columnName }}, {{ filter.comparisonValue }}) - {%- else %} - {{ filter.columnName }} {{ filter.operator }} {{ filter.comparisonValue }} - {%- endif %} - {{ " AND " if not loop.last else "" }} -{%- endfor %} -) +with + base_table as ( + select + * + {%- for formula in new_columns %} + , {{ formula }} as {{ cleanse_name(formula) }} + {%- endfor %} + from {{ source_table }} + ), + filtered as ( + select * + from + base_table + {%- for filter in filters %} + {{ " WHERE " if loop.first else "" }} + {%- if filter is not mapping %} {{ filter }} + {%- elif filter.operator|upper == 'CONTAINS' %} + {{ filter.operator }} ( + {{ filter.columnName }}, {{ filter.comparisonValue }} + ) + {%- else %} + {{ filter.columnName }} {{ filter.operator }} {{ filter.comparisonValue }} + {%- endif %} + {{ " AND " if not loop.last else "" }} + {%- endfor %} + ) {%- if summarize is defined -%} -, -aggregated as ( - SELECT - {%- if group_by is defined %} - {{ group_by | join(', ') }}, - {%- endif %} -{%- for column, aggs in summarize.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} - {%- endfor %} -{%- endfor %} - FROM filtered - {%- if group_by is defined %} - GROUP BY {{ group_by | join(', ') }} - {%- endif %} -) -SELECT * -FROM aggregated -{% else %} -SELECT * -FROM filtered + , + aggregated as ( + select + {%- if group_by is defined %} {{ group_by | join(', ') }}, {%- endif %} + {%- for column, aggs in summarize.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in 
aggregation_type|lower else '' }}{{ column }} + ) + as {{ cleanse_name(aggregation_type + '_' + column) }}{{ ',' if not (loop.last and oloop.last) }} + {%- endfor %} + {%- endfor %} + from filtered + {%- if group_by is defined %} group by {{ group_by | join(', ') }} {%- endif %} + ) +select * +from aggregated +{% else %} select * from filtered {%- endif -%} {%- if order_by_columns is defined %} -ORDER BY {{ order_by_columns | join(', ') }} {{ order_by_direction }} -{%- endif -%} \ No newline at end of file +order by {{ order_by_columns | join(', ') }} {{ order_by_direction }} +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/query/snowflake/query.sql b/rasgotransforms/rasgotransforms/transforms/query/snowflake/query.sql index 2c56ff73..f4c229eb 100644 --- a/rasgotransforms/rasgotransforms/transforms/query/snowflake/query.sql +++ b/rasgotransforms/rasgotransforms/transforms/query/snowflake/query.sql @@ -1,49 +1,50 @@ -WITH base_table as ( - SELECT * -{%- for formula in new_columns %} - , {{ formula }} as {{ cleanse_name(formula) }} -{%- endfor %} - FROM {{ source_table }} -), -filtered as ( - SELECT * - FROM base_table -{%- for filter in filters %} - {{ " WHERE " if loop.first else "" }} - {%- if filter is not mapping %} - {{ filter }} - {%- elif filter.operator|upper == 'CONTAINS' %} - {{ filter.operator }}({{ filter.columnName }}, {{ filter.comparisonValue }}) - {%- else %} - {{ filter.columnName }} {{ filter.operator }} {{ filter.comparisonValue }} - {%- endif %} - {{ " AND " if not loop.last else "" }} -{%- endfor %} -) +with + base_table as ( + select + * + {%- for formula in new_columns %} + , {{ formula }} as {{ cleanse_name(formula) }} + {%- endfor %} + from {{ source_table }} + ), + filtered as ( + select * + from + base_table + {%- for filter in filters %} + {{ " WHERE " if loop.first else "" }} + {%- if filter is not mapping %} {{ filter }} + {%- elif filter.operator|upper == 'CONTAINS' %} + {{ filter.operator }} ( + {{ 
filter.columnName }}, {{ filter.comparisonValue }} + ) + {%- else %} + {{ filter.columnName }} {{ filter.operator }} {{ filter.comparisonValue }} + {%- endif %} + {{ " AND " if not loop.last else "" }} + {%- endfor %} + ) {%- if summarize is defined and summarize|length -%} -, -aggregated as ( - SELECT - {%- if group_by is defined %} - {{ group_by | join(', ') }}, - {%- endif %} -{%- for column, aggs in summarize.items() %} - {%- set oloop = loop %} - {%- for aggregation_type in aggs %} - {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} - {%- endfor %} -{%- endfor %} - FROM filtered - {%- if group_by is defined %} - GROUP BY {{ group_by | join(', ') }} - {%- endif %} -) -SELECT * -FROM aggregated -{% else %} -SELECT * -FROM filtered + , + aggregated as ( + select + {%- if group_by is defined %} {{ group_by | join(', ') }}, {%- endif %} + {%- for column, aggs in summarize.items() %} + {%- set oloop = loop %} + {%- for aggregation_type in aggs %} + {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }} ( + {{ 'distinct ' if 'distinct' in aggregation_type|lower else '' }}{{ column }} + ) + as {{ cleanse_name(aggregation_type + '_' + column) }}{{ ',' if not (loop.last and oloop.last) }} + {%- endfor %} + {%- endfor %} + from filtered + {%- if group_by is defined %} group by {{ group_by | join(', ') }} {%- endif %} + ) +select * +from aggregated +{% else %} select * from filtered {%- endif -%} {%- if order_by_columns is defined %} -ORDER BY {{ order_by_columns | join(', ') }} {{ order_by_direction }} -{%- endif -%} \ No newline at end of file +order by {{ order_by_columns | join(', ') }} {{ order_by_direction }} +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/rank/rank.sql b/rasgotransforms/rasgotransforms/transforms/rank/rank.sql index 
57c806fd..c2601f1c 100644 --- a/rasgotransforms/rasgotransforms/transforms/rank/rank.sql +++ b/rasgotransforms/rasgotransforms/transforms/rank/rank.sql @@ -2,20 +2,25 @@ {%- set alias = alias if alias is defined else cleanse_name('RANK_' + '_'.join(rank_columns)) -%} -SELECT {{ untouched_cols }}, -{%- if rank_type == 'dense' %} - DENSE_RANK() OVER( -{% elif rank_type == 'percent' %} - PERCENT_RANK() OVER( -{% elif rank_type == 'unique' %} - ROW_NUMBER() OVER( -{%- else -%} - RANK() OVER( -{% endif %} - {% if partition_by -%} - PARTITION BY {% for col in partition_by -%}{{col}}{{ ", " if not loop.last else " " }}{%- endfor %} - {% endif -%} - ORDER BY {% for col in rank_columns -%}{{col}}{% if order %} {{ order }}{% endif %}{{ ", " if not loop.last else " " }}{%- endfor %} - ) AS {{ alias }} -FROM {{ source_table }} -{% if qualify_filter %}QUALIFY {{ alias }} {{ qualify_filter }}{% endif %} +select + {{ untouched_cols }}, + {%- if rank_type == 'dense' %} + dense_rank() over ( + {% elif rank_type == 'percent' %} + percent_rank() over ( + {% elif rank_type == 'unique' %} row_number() over ( {%- else -%} rank() over ( + {% endif %} + {% if partition_by -%} + partition by + {% for col in partition_by -%} + {{ col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by + {% for col in rank_columns -%} + {{ col }} + {% if order %} {{ order }}{% endif %} {{ ", " if not loop.last else " " }} + {%- endfor %} + ) as {{ alias }} +from {{ source_table }} +{% if qualify_filter %} qualify {{ alias }} {{ qualify_filter }}{% endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/ratio_with_shrinkage.sql b/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/ratio_with_shrinkage.sql index e249ed57..c0309c23 100644 --- a/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/ratio_with_shrinkage.sql +++ b/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/ratio_with_shrinkage.sql @@ -1,47 +1,23 
@@ {# the strange __var__ names are meant to prevent collisions #} - {%- set source_col_names = get_columns(source_table) -%} -WITH CTE_AGG AS ( - SELECT - *, - {{ numerator }} / {{ denom }} as RAW__PCT - FROM - {{ source_table }} -), -CTE_FILTER AS ( - SELECT - * - FROM - CTE_AGG - WHERE - {{ denom }} >= {{ min_cutoff }} -), -CTE_STATS AS ( - SELECT - AVG(RAW__PCT) AS __U__, - VARIANCE_SAMP(RAW__PCT) AS __V__ - FROM - CTE_FILTER -), -CTE_JOINED AS ( - SELECT - * - FROM CTE_AGG - CROSS JOIN CTE_STATS -), -CTE_COEF AS ( - SELECT - *, - __U__ * ( - __U__ * (1 - __U__)/ __V__ - 1 - ) AS __ALPHA__, - __ALPHA__ * (1 - __U__)/ __U__ AS __BETA__ - FROM - CTE_JOINED -) -SELECT - {{ source_col_names | join(', ') }}, - RAW__PCT, - ({{ numerator }} + __ALPHA__) / ({{ denom }} + __ALPHA__ + __BETA__) AS ADJ__PCT -FROM - CTE_COEF \ No newline at end of file +with + cte_agg as ( + select *, {{ numerator }} / {{ denom }} as raw__pct from {{ source_table }} + ), + cte_filter as (select * from cte_agg where {{ denom }} >= {{ min_cutoff }}), + cte_stats as ( + select avg(raw__pct) as __u__, variance_samp(raw__pct) as __v__ from cte_filter + ), + cte_joined as (select * from cte_agg cross join cte_stats), + cte_coef as ( + select + *, + __u__ * (__u__ * (1 - __u__) / __v__ - 1) as __alpha__, + __alpha__ * (1 - __u__) / __u__ as __beta__ + from cte_joined + ) +select + {{ source_col_names | join(', ') }}, + raw__pct, + ({{ numerator }} + __alpha__) / ({{ denom }} + __alpha__ + __beta__) as adj__pct +from cte_coef diff --git a/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/snowflake/ratio_with_shrinkage.sql b/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/snowflake/ratio_with_shrinkage.sql index e249ed57..c0309c23 100644 --- a/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/snowflake/ratio_with_shrinkage.sql +++ b/rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/snowflake/ratio_with_shrinkage.sql @@ -1,47 +1,23 @@ {# the 
strange __var__ names are meant to prevent collisions #} - {%- set source_col_names = get_columns(source_table) -%} -WITH CTE_AGG AS ( - SELECT - *, - {{ numerator }} / {{ denom }} as RAW__PCT - FROM - {{ source_table }} -), -CTE_FILTER AS ( - SELECT - * - FROM - CTE_AGG - WHERE - {{ denom }} >= {{ min_cutoff }} -), -CTE_STATS AS ( - SELECT - AVG(RAW__PCT) AS __U__, - VARIANCE_SAMP(RAW__PCT) AS __V__ - FROM - CTE_FILTER -), -CTE_JOINED AS ( - SELECT - * - FROM CTE_AGG - CROSS JOIN CTE_STATS -), -CTE_COEF AS ( - SELECT - *, - __U__ * ( - __U__ * (1 - __U__)/ __V__ - 1 - ) AS __ALPHA__, - __ALPHA__ * (1 - __U__)/ __U__ AS __BETA__ - FROM - CTE_JOINED -) -SELECT - {{ source_col_names | join(', ') }}, - RAW__PCT, - ({{ numerator }} + __ALPHA__) / ({{ denom }} + __ALPHA__ + __BETA__) AS ADJ__PCT -FROM - CTE_COEF \ No newline at end of file +with + cte_agg as ( + select *, {{ numerator }} / {{ denom }} as raw__pct from {{ source_table }} + ), + cte_filter as (select * from cte_agg where {{ denom }} >= {{ min_cutoff }}), + cte_stats as ( + select avg(raw__pct) as __u__, variance_samp(raw__pct) as __v__ from cte_filter + ), + cte_joined as (select * from cte_agg cross join cte_stats), + cte_coef as ( + select + *, + __u__ * (__u__ * (1 - __u__) / __v__ - 1) as __alpha__, + __alpha__ * (1 - __u__) / __u__ as __beta__ + from cte_joined + ) +select + {{ source_col_names | join(', ') }}, + raw__pct, + ({{ numerator }} + __alpha__) / ({{ denom }} + __alpha__ + __beta__) as adj__pct +from cte_coef diff --git a/rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql b/rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql index eac1046f..d014bfb0 100644 --- a/rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql +++ b/rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql @@ -1,7 +1,14 @@ -SELECT -* -FROM {{ source_table }} -QUALIFY ROW_NUMBER() OVER ( - PARTITION BY 
{%- for col in natural_key %} {{col}}{{"," if not loop.last else ""}} {%- endfor %} - ORDER BY {%- for col in order_col %} {{col}}{{"," if not loop.last else ""}} {%- endfor %} {{order_method}} -) = 1 +select * +from {{ source_table }} +qualify + row_number() over ( + partition by + {%- for col in natural_key %} + {{ col }}{{ "," if not loop.last else "" }} + {%- endfor %} + order by + {%- for col in order_col %} + {{ col }}{{ "," if not loop.last else "" }} + {%- endfor %} {{ order_method }} + ) + = 1 diff --git a/rasgotransforms/rasgotransforms/transforms/remove_outliers/remove_outliers.sql b/rasgotransforms/rasgotransforms/transforms/remove_outliers/remove_outliers.sql index 7d58d3c1..9aed5fd1 100644 --- a/rasgotransforms/rasgotransforms/transforms/remove_outliers/remove_outliers.sql +++ b/rasgotransforms/rasgotransforms/transforms/remove_outliers/remove_outliers.sql @@ -1,65 +1,83 @@ {%- set source_col_names = get_columns(source_table) -%} -with outliers as ( - {%- if method == "iqr" %} - with iqr_vals as ( +with + outliers as ( + {%- if method == "iqr" %} + with + iqr_vals as ( + select + {%- for col in target_columns %} + percentile_cont(0.25) within group ( + order by {{ col }} + ) as q1_{{ col }}, + percentile_cont(0.5) within group ( + order by {{ col }} + ) as median_{{ col }}, + percentile_cont(0.75) within group ( + order by {{ col }} + ) as q3_{{ col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + from {{ source_table }} + ) select - {%- for col in target_columns %} - percentile_cont(0.25) within group (order by {{ col }}) as Q1_{{ col }}, - percentile_cont(0.5) within group (order by {{ col }}) as MEDIAN_{{ col }}, - percentile_cont(0.75) within group (order by {{ col }}) as Q3_{{ col }}{{ ", " if not loop.last else " " }} - {%- endfor %} - from {{ source_table }} - ) select *, - case - {%- for col in target_columns %} - when {{ col }} > MEDIAN_{{ col }} + ((Q3_{{ col }} - Q1_{{ col }}) * 1.5) then true - when {{ col }} < MEDIAN_{{ col }} - 
((Q3_{{ col }} - Q1_{{ col }}) * 1.5) then true - {%- endfor %} - else false - end as OUTLIER - from {{ source_table }}, iqr_vals + *, + case + {%- for col in target_columns %} + when + {{ col }} > median_{{ col }} + ((q3_{{ col }} - q1_{{ col }}) * 1.5) + then true + when + {{ col }} < median_{{ col }} - ((q3_{{ col }} - q1_{{ col }}) * 1.5) + then true + {%- endfor %} + else false + end as outlier + from {{ source_table }}, iqr_vals - {%- elif method == "threshold" %} - select *, - case - {%- for col in target_columns %} - when {{ col }} > {{ max_threshold }} then true - when {{ col }} < {{ min_threshold }} then true - {%- endfor %} - else false - end as OUTLIER - from {{ source_table }} - {%- else %} - {%- if max_zscore is not defined -%} - {%- set max_zscore = 2 -%} - {%- endif %} - with tbl_mean_std as ( + {%- elif method == "threshold" %} select - {%- for col in target_columns %} - avg({{ col }}) MEAN_{{ col }}, - stddev({{ col }}) STDDEV_{{ col }}{{ ", " if not loop.last else " " }} - {%- endfor %} + *, + case + {%- for col in target_columns %} + when {{ col }} > {{ max_threshold }} + then true + when {{ col }} < {{ min_threshold }} + then true + {%- endfor %} + else false + end as outlier from {{ source_table }} - ) select *, + {%- else %} + {%- if max_zscore is not defined -%}{%- set max_zscore = 2 -%} + {%- endif %} + with + tbl_mean_std as ( + select + {%- for col in target_columns %} + avg({{ col }}) mean_{{ col }}, + stddev( + {{ col }} + ) stddev_{{ col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + from {{ source_table }} + ) + select + *, {%- for col in target_columns %} - ({{col}} - MEAN_{{col}}) / STDDEV_{{ col }} as ZSCORE_{{ col }}, + ({{ col }} - mean_{{ col }}) / stddev_{{ col }} as zscore_{{ col }}, {%- endfor %} case {%- for col in target_columns %} - when abs(ZSCORE_{{ col }}) > {{ max_zscore }} then TRUE + when abs(zscore_{{ col }}) > {{ max_zscore }} then true {%- endfor %} - else FALSE - end as OUTLIER - from {{ source_table 
}}, tbl_mean_std - {%- endif %} -) select - {% if not drop -%} - OUTLIER, - {%- endif %} + else false + end as outlier + from {{ source_table }}, tbl_mean_std + {%- endif %} + ) +select + {% if not drop -%} outlier, {%- endif %} {% for col in source_col_names -%} {{ col }}{{ ", " if not loop.last else " " }} {%- endfor %} - from outliers -{% if drop -%} - where not OUTLIER -{%- endif -%} +from outliers +{% if drop -%} where not outlier {%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/rename/rename.sql b/rasgotransforms/rasgotransforms/transforms/rename/rename.sql index 19c846fc..2d2ea7a4 100644 --- a/rasgotransforms/rasgotransforms/transforms/rename/rename.sql +++ b/rasgotransforms/rasgotransforms/transforms/rename/rename.sql @@ -1,11 +1,11 @@ {%- set source_col_names = get_columns(source_table) -%} -SELECT -{%- for target_col, new_name in renames.items() %} - {{target_col}} AS {{new_name}}{{ ", " if not loop.last else "" }} -{%- endfor -%} -{%- for col in source_col_names %} - {%- if col not in renames %}, {{col}}{%- endif -%} -{% endfor %} -FROM {{ source_table }} \ No newline at end of file +select + {%- for target_col, new_name in renames.items() %} + {{ target_col }} as {{ new_name }}{{ ", " if not loop.last else "" }} + {%- endfor -%} + {%- for col in source_col_names %} + {%- if col not in renames %}, {{ col }}{%- endif -%} + {% endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql b/rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql index ae299609..0ffea26c 100644 --- a/rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql +++ b/rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql @@ -1,11 +1,11 @@ {%- set source_col_names = get_columns(source_table) -%} -SELECT -{%- for target_col, new_name in renames.items() %} - {{target_col}} AS {{new_name}}{{ ", " if not loop.last else "" }} -{%- endfor -%} -{%- set renames = 
(renames|join(',')|upper).split(',') -%} -{%- for col in source_col_names %} - {%- if col|upper not in renames %}, {{col|upper}}{%- endif -%} -{% endfor %} -FROM {{ source_table }} \ No newline at end of file +select + {%- for target_col, new_name in renames.items() %} + {{ target_col }} as {{ new_name }}{{ ", " if not loop.last else "" }} + {%- endfor -%} + {%- set renames = (renames|join(',')|upper).split(',') -%} + {%- for col in source_col_names %} + {%- if col|upper not in renames %}, {{ col|upper }}{%- endif -%} + {% endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/replace_missing/bigquery/replace_missing.sql b/rasgotransforms/rasgotransforms/transforms/replace_missing/bigquery/replace_missing.sql index 22be44f0..a700e214 100644 --- a/rasgotransforms/rasgotransforms/transforms/replace_missing/bigquery/replace_missing.sql +++ b/rasgotransforms/rasgotransforms/transforms/replace_missing/bigquery/replace_missing.sql @@ -7,29 +7,26 @@ make query to fill with supplied scalar value else it will perform that imputation stagety on column #} {%- macro get_impute_query(col, imputation) -%} - {%- set impute_expression = '' -%} - {%- set imputation_strategy = '' -%} - {%- if imputation | lower in ['mean', 'max', 'min', 'sum'] -%} - {%- set imputation = 'AVG' if imputation == 'mean' else imputation -%} - {%- set imputation_strategy = imputation | upper -%} - {%- set impute_expression = imputation_strategy + '(' + col + ') over ()' -%} - {%- elif imputation|lower == "median" -%} - {%- set impute_expression = 'PERCENTILE_CONT(' + col + ', 0.5) over ()' -%} - {%- elif imputation|lower == "mode" -%} - {%- set impute_expression = col + '_MODE_VALUE' -%} - {%- else -%} - {%- set imputation = "'" + imputation + "'" if imputation is string else imputation -%} - {%- set impute_expression = imputation -%} - {%- endif -%} - COALESCE({{ col }}, {{ impute_expression }} ) as {{ col }} +{%- set impute_expression = '' -%} +{%- set 
imputation_strategy = '' -%} +{%- if imputation | lower in ['mean', 'max', 'min', 'sum'] -%} +{%- set imputation = 'AVG' if imputation == 'mean' else imputation -%} +{%- set imputation_strategy = imputation | upper -%} +{%- set impute_expression = imputation_strategy + '(' + col + ') over ()' -%} +{%- elif imputation|lower == "median" -%} +{%- set impute_expression = 'PERCENTILE_CONT(' + col + ', 0.5) over ()' -%} +{%- elif imputation|lower == "mode" -%} +{%- set impute_expression = col + '_MODE_VALUE' -%} +{%- else -%} +{%- set imputation = "'" + imputation + "'" if imputation is string else imputation -%} +{%- set impute_expression = imputation -%} +{%- endif -%} +coalesce({{ col }}, {{ impute_expression }}) as {{ col }} {%- endmacro -%} {# Macro to generate a query to flag missing values #} {%- macro get_flag_missing_query(col) -%} - CASE - WHEN {{ col }} IS NULL then 1 - ELSE 0 - END as {{ col }}_missing_flag +case when {{ col }} is null then 1 else 0 end as {{ col }}_missing_flag {%- endmacro -%} @@ -38,34 +35,37 @@ else it will perform that imputation stagety on column {%- set mode_aggs = dict() -%} {%- for col, agg in replacements.items() -%} - {%- if 'MODE' in agg|upper -%} - {%- set _ = mode_aggs.update({col: agg}) -%} - {%- endif -%} +{%- if 'MODE' in agg|upper -%} +{%- set _ = mode_aggs.update({col: agg}) -%} +{%- endif -%} {%- endfor -%} {%- if mode_aggs %} - {%- for mode_col, mode_agg in mode_aggs.items() %} - WITH {{ mode_col }}_MODE_CTE AS ( - SELECT - APPROX_TOP_COUNT({{ mode_col }}, 1)[OFFSET(0)].VALUE AS {{ mode_col }}_MODE_VALUE - FROM {{ source_table }} - ){{ ',' if not loop.last else '' }} - {%- endfor %} +{%- for mode_col, mode_agg in mode_aggs.items() %} +with + {{ mode_col }}_mode_cte as ( + select + approx_top_count({{ mode_col }}, 1)[ + offset(0) + ].value as {{ mode_col }}_mode_value + from {{ source_table }} + ){{ ',' if not loop.last else '' }} +{%- endfor %} {%- endif %} -SELECT -{%- for col in source_col_names -%} +select + {%- for 
col in source_col_names -%} {%- if col in replacements %} - {{ get_impute_query(col, replacements[col]) }}{{ ',' if flag_missing_vals or not loop.last else ''}} - {%- if flag_missing_vals %} - {{ get_flag_missing_query(col) }}{{ ',' if not loop.last else ''}} - {%- endif -%} - {%- else %} - {{ col }}{{ ',' if not loop.last else ''}} + {{ get_impute_query(col, replacements[col]) }}{{ ',' if flag_missing_vals or not loop.last else '' }} + {%- if flag_missing_vals %} + {{ get_flag_missing_query(col) }}{{ ',' if not loop.last else '' }} {%- endif -%} -{%- endfor %} -FROM {{source_table}} -{%- if mode_aggs %} + {%- else %} {{ col }}{{ ',' if not loop.last else '' }} + {%- endif -%} + {%- endfor %} +from + {{ source_table }} + {%- if mode_aggs %} {%- for mode_col, mode_agg in mode_aggs.items() %} - ,{{ mode_col }}_MODE_CTE + ,{{ mode_col }}_mode_cte {%- endfor %} -{%- endif %} + {%- endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/replace_missing/replace_missing.sql b/rasgotransforms/rasgotransforms/transforms/replace_missing/replace_missing.sql index 872c72b1..11887fa4 100644 --- a/rasgotransforms/rasgotransforms/transforms/replace_missing/replace_missing.sql +++ b/rasgotransforms/rasgotransforms/transforms/replace_missing/replace_missing.sql @@ -7,40 +7,36 @@ make query to fill with supplied scalar value else it will perform that impuattion stagety on column #} {%- macro get_impute_query(col, imputation) -%} - {%- set impute_expression = '' -%} - {%- set imputation_strategy = '' -%} - {%- if imputation | lower in ['mean', 'median', 'mode', 'max', 'min', 'sum'] -%} - {%- set imputation = 'AVG' if imputation == 'mean' else imputation -%} - {%- set imputation_strategy = imputation | upper -%} - {%- set impute_expression = imputation_strategy + '(' + col + ') over ()' -%} - {%- else -%} - {%- set imputation = "'" + imputation + "'" if imputation is string else imputation -%} - {%- set impute_expression = imputation -%} - {%- endif -%} - COALESCE({{ col }}, 
{{ impute_expression }} ) as {{ col }} +{%- set impute_expression = '' -%} +{%- set imputation_strategy = '' -%} +{%- if imputation | lower in ['mean', 'median', 'mode', 'max', 'min', 'sum'] -%} +{%- set imputation = 'AVG' if imputation == 'mean' else imputation -%} +{%- set imputation_strategy = imputation | upper -%} +{%- set impute_expression = imputation_strategy + '(' + col + ') over ()' -%} +{%- else -%} +{%- set imputation = "'" + imputation + "'" if imputation is string else imputation -%} +{%- set impute_expression = imputation -%} +{%- endif -%} +coalesce({{ col }}, {{ impute_expression }}) as {{ col }} {%- endmacro -%} {# Macro to generate a query to flag missing values #} {%- macro get_flag_missing_query(col) -%} - CASE - WHEN {{ col }} IS NULL then 1 - ELSE 0 - END as {{ col }}_missing_flag +case when {{ col }} is null then 1 else 0 end as {{ col }}_missing_flag {%- endmacro -%} {# Get all Columns in Source Table #} {%- set source_col_names = get_columns(source_table) -%} -SELECT -{%- for col in source_col_names -%} +select + {%- for col in source_col_names -%} {%- if col in replacements %} - {{ get_impute_query(col, replacements[col]) }}{{ ',' if flag_missing_vals or not loop.last else ''}} + {{ get_impute_query(col, replacements[col]) }}{{ ',' if flag_missing_vals or not loop.last else '' }} {%- if flag_missing_vals %} - {{ get_flag_missing_query(col) }}{{ ',' if not loop.last else ''}} + {{ get_flag_missing_query(col) }}{{ ',' if not loop.last else '' }} {%- endif -%} - {%- else %} - {{ col }}{{ ',' if not loop.last else ''}} + {%- else %} {{ col }}{{ ',' if not loop.last else '' }} {%- endif -%} -{%- endfor %} -FROM {{source_table}} \ No newline at end of file + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/replace_string/replace_string.sql b/rasgotransforms/rasgotransforms/transforms/replace_string/replace_string.sql index 5f15d0dc..8c291ff4 100644 --- 
a/rasgotransforms/rasgotransforms/transforms/replace_string/replace_string.sql +++ b/rasgotransforms/rasgotransforms/transforms/replace_string/replace_string.sql @@ -1,27 +1,32 @@ -{% if position is not defined %} - {% set position = 1 %} -{% else %} - {% set use_regex = True %} +{% if position is not defined %} {% set position = 1 %} +{% else %} {% set use_regex = True %} {% endif %} -{% if occurrence is not defined %} - {% set occurrence = 0 %} -{% else %} - {% set use_regex = True %} +{% if occurrence is not defined %} {% set occurrence = 0 %} +{% else %} {% set use_regex = True %} {% endif %} -{% if parameters is not defined %} - {% set parameters = 'c' %} -{% else %} - {% set use_regex = True %} +{% if parameters is not defined %} {% set parameters = 'c' %} +{% else %} {% set use_regex = True %} {% endif %} {% if use_regex %} -SELECT *, -REGEXP_REPLACE({{ source_col }}, '{{ pattern }}', '{{ replacement }}', {{ position }}, {{ occurrence }}, '{{ parameters }}') AS {{cleanse_name(alias) if alias is defined else "REPLACE_" + source_col}} -FROM {{ source_table }} +select + *, + regexp_replace( + {{ source_col }}, + '{{ pattern }}', + '{{ replacement }}', + {{ position }}, + {{ occurrence }}, + '{{ parameters }}' + ) as {{ cleanse_name(alias) if alias is defined else "REPLACE_" + source_col }} +from {{ source_table }} {% else %} -SELECT *, -REPLACE({{ source_col }}, '{{ pattern }}', '{{ replacement }}') AS {{cleanse_name(alias) if alias is defined else "REPLACE_" + source_col}} -FROM {{ source_table }} -{% endif %} \ No newline at end of file +select + *, + replace( + {{ source_col }}, '{{ pattern }}', '{{ replacement }}' + ) as {{ cleanse_name(alias) if alias is defined else "REPLACE_" + source_col }} +from {{ source_table }} +{% endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/reshape/bigquery/reshape.sql b/rasgotransforms/rasgotransforms/transforms/reshape/bigquery/reshape.sql index f2fbcc48..0559f248 100644 --- 
a/rasgotransforms/rasgotransforms/transforms/reshape/bigquery/reshape.sql +++ b/rasgotransforms/rasgotransforms/transforms/reshape/bigquery/reshape.sql @@ -5,36 +5,35 @@ limit 1000 {%- endset -%} - {%- if list_of_vals is not defined -%} - {%- set results = run_query(distinct_val_query) -%} - {%- set distinct_vals = results[results.columns[0]].to_list() -%} - {%- else -%} - {%- set distinct_vals = list_of_vals -%} - {%- endif -%} +{%- if list_of_vals is not defined -%} +{%- set results = run_query(distinct_val_query) -%} +{%- set distinct_vals = results[results.columns[0]].to_list() -%} +{%- else -%} {%- set distinct_vals = list_of_vals -%} +{%- endif -%} - SELECT * FROM ( - SELECT - {%- for dimension in dimensions %} - {{ dimension }}, - {%- endfor %} +select * +from + ( + select + {%- for dimension in dimensions %} {{ dimension }}, {%- endfor %} {{ values }}, {{ columns }} - FROM {{ source_table }} + from {{ source_table }} ) - PIVOT ( - {{ agg_method }} ( {{ values }} ) as _ - FOR {{ columns }} IN ( + pivot( + {{ agg_method }} ({{ values }}) as _ + for {{ columns }} in ( {%- for val in distinct_vals %} - {%- if val is string -%} - '{{ val }}' - {%- else -%} - {{ val }} + {%- if val is string -%}'{{ val }}' + {%- else -%}{{ val }} {%- endif -%} - {{', ' if not loop.last else ''}} + {{ ', ' if not loop.last else '' }} {%- endfor -%} ) ) {%- else -%} - SELECT * FROM {{ source_table }} - UNPIVOT( {{ value_column }} for {{ name_column }} in ( {{ columns | join(', ')}} )) -{%- endif -%} \ No newline at end of file +select * +from + {{ source_table }} + unpivot({{ value_column }} for {{ name_column }} in ({{ columns | join(', ') }})) +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/reshape/snowflake/reshape.sql b/rasgotransforms/rasgotransforms/transforms/reshape/snowflake/reshape.sql index 2468ff1a..a378c233 100644 --- a/rasgotransforms/rasgotransforms/transforms/reshape/snowflake/reshape.sql +++ 
b/rasgotransforms/rasgotransforms/transforms/reshape/snowflake/reshape.sql @@ -5,26 +5,40 @@ limit 1000 {%- endset -%} - {%- if list_of_vals is not defined -%} - {%- set results = run_query(distinct_val_query) -%} - {%- set distinct_vals = results[results.columns[0]].to_list() -%} - {%- else -%} - {%- set distinct_vals = list_of_vals -%} - {%- endif -%} +{%- if list_of_vals is not defined -%} +{%- set results = run_query(distinct_val_query) -%} +{%- set distinct_vals = results[results.columns[0]].to_list() -%} +{%- else -%} {%- set distinct_vals = list_of_vals -%} +{%- endif -%} - {# Jinja Macro to get the comma separated cleansed name list #} - {%- macro get_values(distinct_values) -%} - {%- for val in distinct_vals -%} - {{ cleanse_name(val) }}{{ ', ' if not loop.last else '' }} - {%- endfor -%} - {%- endmacro -%} +{# Jinja Macro to get the comma separated cleansed name list #} +{%- macro get_values(distinct_values) -%} +{%- for val in distinct_vals -%} +{{ cleanse_name(val) }}{{ ', ' if not loop.last else '' }} +{%- endfor -%} +{%- endmacro -%} - SELECT {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ get_values(distinct_vals) }} - FROM ( SELECT {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ values }}, {{ columns }} FROM {{ source_table }}) - PIVOT ( {{ agg_method }} ( {{ values }} ) FOR {{ columns }} IN ( '{{ distinct_vals | join("', '") }}' ) ) as p - ( {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ get_values(distinct_vals) }} ) +select + {{ dimensions | join(", ") }}{{ ',' if dimensions else '' }} {{ get_values(distinct_vals) }} +from + ( + select + {{ dimensions | join(", ") }}{{ ',' if dimensions else '' }} {{ values }}, + {{ columns }} + from {{ source_table }} + ) + pivot( + {{ agg_method }} ({{ values }}) for {{ columns }} in ( + '{{ distinct_vals | join("', '") }}' + ) + ) as p + ( + {{ dimensions | join(", ") }}{{ ',' if dimensions else '' }} {{ get_values(distinct_vals) }} + ) {%- else -%} - SELECT * FROM 
{{ source_table }} - UNPIVOT( {{ value_column }} for {{ name_column }} in ( {{ columns | join(', ')}} )) -{%- endif -%} \ No newline at end of file +select * +from + {{ source_table }} + unpivot({{ value_column }} for {{ name_column }} in ({{ columns | join(', ') }})) +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.sql b/rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.sql index 7af26534..d76b0b38 100644 --- a/rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.sql +++ b/rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.sql @@ -1,20 +1,18 @@ -SELECT * -{% for col, aggs in aggregations.items() -%} - {%- for agg in aggs -%} +select + * + {% for col, aggs in aggregations.items() -%} + {%- for agg in aggs -%} {%- for offset in offsets %} - {% set normalized_offset = -offset %} - , {{ agg }}({{ col }}) OVER( - {%- if group_by %} - PARTITION BY {{ group_by | join(", ") }} - {% endif -%} - ORDER BY {{ order_by | join(", ") }} + {% set normalized_offset = -offset %}, + {{ agg }} ({{ col }}) over ( + {%- if group_by %}partition by {{ group_by | join(", ") }} {% endif -%} + order by {{ order_by | join(", ") }} {% if normalized_offset > 0 -%} - ROWS BETWEEN CURRENT ROW AND {{ normalized_offset }} FOLLOWING - {% else -%} - ROWS BETWEEN {{ normalized_offset|abs }} PRECEDING AND CURRENT ROW + rows between current row and {{ normalized_offset }} following + {% else -%} rows between {{ normalized_offset|abs }} preceding and current row {% endif -%} - ) as {{ cleanse_name(agg + '_' + col + '_' + offset|string) }} + ) as {{ cleanse_name(agg + '_' + col + '_' + offset|string) }} + {%- endfor -%} {%- endfor -%} - {%- endfor -%} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/rsi/rsi.sql b/rasgotransforms/rasgotransforms/transforms/rsi/rsi.sql index da721573..4530a909 100644 
--- a/rasgotransforms/rasgotransforms/transforms/rsi/rsi.sql +++ b/rasgotransforms/rasgotransforms/transforms/rsi/rsi.sql @@ -1,36 +1,54 @@ -WITH CTE_LAG1 AS ( -SELECT *, - lag({{ value_col }}, 1) over (partition by {{ partition_col }} order by {{ order_col }}) as LAG_{{ value_col }} -from {{ source_table }} -) , -CTE_DELTA AS ( -SELECT * - , {{ value_col }} - LAG_{{ value_col }} as DELTA -FROM CTE_LAG1 -) , -CTE_GAINLOSS_SPLIT AS ( -SELECT * - , CASE WHEN DELTA > 0 THEN DELTA WHEN DELTA = 0 THEN 0 ELSE 0 END as GAIN - , CASE WHEN DELTA < 0 THEN abs(DELTA) WHEN DELTA = 0 THEN 0 ELSE 0 END as LOSS -FROM CTE_DELTA -) , -CTE_MOVINGAVG AS ( -SELECT * -, avg(GAIN) OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ROWS BETWEEN {{ window - 1 }} PRECEDING AND CURRENT ROW) AS AVG_GAIN_{{ window }} -, avg(LOSS) OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ROWS BETWEEN {{ window - 1 }} PRECEDING AND CURRENT ROW) AS AVG_LOSS_{{ window }} -FROM CTE_GAINLOSS_SPLIT -) , -CTE_RSI AS ( -SELECT * - , CASE WHEN AVG_LOSS_{{ window }}=0 THEN 100 ELSE 100 - (100 / (1+(AVG_GAIN_{{ window }} / AVG_LOSS_{{ window }}))) END as {{ value_col }}_RSI_{{ window }} -FROM CTE_MOVINGAVG -) , -CTE_FINAL AS ( -SELECT {{ order_col }}, {{ partition_col }}, {{ value_col }}_RSI_{{ window }} -FROM CTE_RSI -) -SELECT A.*, B.{{ value_col }}_RSI_{{ window }} -FROM {{ source_table }} A -INNER JOIN CTE_FINAL B -ON A.{{ partition_col }} = B.{{ partition_col }} -AND A.{{ order_col }} = B.{{ order_col }} +with + cte_lag1 as ( + select + *, + lag({{ value_col }}, 1) over ( + partition by {{ partition_col }} order by {{ order_col }} + ) as lag_{{ value_col }} + from {{ source_table }} + ), + cte_delta as ( + select *, {{ value_col }} - lag_{{ value_col }} as delta from cte_lag1 + ), + cte_gainloss_split as ( + select + *, + case when delta > 0 then delta when delta = 0 then 0 else 0 end as gain, + case when delta < 0 then abs(delta) when delta = 0 then 0 else 0 end as loss + from 
cte_delta + ), + cte_movingavg as ( + select + *, + avg(gain) over ( + partition by {{ partition_col }} + order by {{ order_col }} + rows between {{ window - 1 }} preceding and current row + ) as avg_gain_{{ window }}, + avg(loss) over ( + partition by {{ partition_col }} + order by {{ order_col }} + rows between {{ window - 1 }} preceding and current row + ) as avg_loss_{{ window }} + from cte_gainloss_split + ), + cte_rsi as ( + select + *, + case + when avg_loss_{{ window }}= 0 + then 100 + else 100 - (100 / (1 + (avg_gain_{{ window }} / avg_loss_{{ window }}))) + end as {{ value_col }}_rsi_{{ window }} + from cte_movingavg + ), + cte_final as ( + select {{ order_col }}, {{ partition_col }}, {{ value_col }}_rsi_{{ window }} + from cte_rsi + ) +select a.*, b.{{ value_col }}_rsi_{{ window }} +from {{ source_table }} a +inner join + cte_final b + on a.{{ partition_col }} = b.{{ partition_col }} + and a.{{ order_col }} = b.{{ order_col }} diff --git a/rasgotransforms/rasgotransforms/transforms/sample/sample.sql b/rasgotransforms/rasgotransforms/transforms/sample/sample.sql index 58ea6d73..ceaa7e0c 100644 --- a/rasgotransforms/rasgotransforms/transforms/sample/sample.sql +++ b/rasgotransforms/rasgotransforms/transforms/sample/sample.sql @@ -1,30 +1,30 @@ -{%- if num_rows|float < 1 -%} - {%- set sample_amount = num_rows*100 |float -%} -{% else %} - {%- set sample_amount = num_rows~' ROWS' -%} +{%- if num_rows|float < 1 -%} {%- set sample_amount = num_rows*100 |float -%} +{% else %} {%- set sample_amount = num_rows~' ROWS' -%} {% endif %} {% if filters is defined %} -WITH filtered AS ( - SELECT * FROM {{source_table}} - {% for filter_block in filters %} - {%- set oloop = loop -%} - {{ 'WHERE ' if oloop.first else ' AND ' }} - {%- if filter_block is not mapping -%} - {{ filter_block }} - {%- else -%} +with + filtered as ( + select * + from + {{ source_table }} + {% for filter_block in filters %} + {%- set oloop = loop -%} + {{ 'WHERE ' if oloop.first else ' AND ' }} + 
{%- if filter_block is not mapping -%} {{ filter_block }} + {%- else -%} {%- if filter_block['operator'] == 'CONTAINS' -%} - {{ filter_block['operator'] }}({{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }}) + {{ filter_block['operator'] }} ( + {{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }} + ) {%- else -%} - {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} + {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} + {%- endif -%} {%- endif -%} - {%- endif -%} - {%- endfor -%} + {%- endfor -%} -) -SELECT * FROM filtered -TABLESAMPLE BERNOULLI ( {{ sample_amount }} ) -{% else %} -SELECT * FROM {{source_table}} -TABLESAMPLE BERNOULLI ( {{ sample_amount }} ) -{% endif %} \ No newline at end of file + ) +select * +from filtered tablesample bernoulli({{ sample_amount }}) +{% else %} select * from {{ source_table }} tablesample bernoulli({{ sample_amount }}) +{% endif %} diff --git a/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql b/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql index e0bc1c9b..728d382e 100644 --- a/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql +++ b/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql @@ -1,5 +1,11 @@ {%- for class, n in sample.items() %} - SELECT * FROM - (SELECT * FROM {{ source_table }} WHERE {{ sample_col }} = '{{ class }}') SAMPLE ({{ n }}{{' rows' if n > 1 else ''}}) +select * +from + ( + select * + from {{ source_table }} + where {{ sample_col }} = '{{ class }}' + ) sample({{ n }}{{ ' rows' if n > 1 else '' }} + ) {{ '' if loop.last else ' UNION ALL ' }} -{%- endfor %} \ No newline at end of file +{%- endfor %} diff --git a/rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql b/rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql index e32cd12a..d5dd67b4 100644 --- 
a/rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql +++ b/rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql @@ -1,13 +1,11 @@ {%- for i in range((stage|length) - 1) -%} - SELECT - '{{ stage[i] }}_' || CAST({{ stage[i] }} AS STRING) AS SOURCE_NODE, - '{{ stage[i+1] }}_' || CAST({{ stage[i+1] }} AS STRING) AS DEST_NODE, - COUNT(*) AS WIDTH -FROM {{ source_table }} -GROUP BY - SOURCE_NODE, - DEST_NODE -HAVING - SOURCE_NODE IS NOT NULL AND DEST_NODE IS NOT NULL -{{ "UNION ALL" if not loop.last else "" }} +select + '{{ stage[i] }}_' || cast({{ stage[i] }} as string) as source_node, + '{{ stage[i+1] }}_' || cast({{ stage[i+1] }} as string) as dest_node, + count(*) as width +from {{ source_table }} +group by source_node, dest_node +having + source_node is not null and dest_node is not null + {{ "UNION ALL" if not loop.last else "" }} {% endfor %} diff --git a/rasgotransforms/rasgotransforms/transforms/scale_columns/scale_columns.sql b/rasgotransforms/rasgotransforms/transforms/scale_columns/scale_columns.sql index f5885d3e..403b72bf 100644 --- a/rasgotransforms/rasgotransforms/transforms/scale_columns/scale_columns.sql +++ b/rasgotransforms/rasgotransforms/transforms/scale_columns/scale_columns.sql @@ -4,52 +4,68 @@ {%- if method == 'standard' -%} {%- if averages is not defined or standarddevs is not defined -%} -with avg_stddev_vals as ( - select +with + avg_stddev_vals as ( + select + {%- for column in columns_to_scale %} + avg({{ column }}) as avg_{{ column }}, + stddev( + {{ column }} + ) as stddev_{{ column }}{{ ", " if not loop.last else "" }} + {%- endfor %} + from {{ source_table }} + ) +select + {{ source_table + ".*" if not overwrite_columns else untouched_cols }}, {%- for column in columns_to_scale %} - avg({{column}}) as avg_{{column}}, - stddev({{column}}) as stddev_{{column}}{{ ", " if not loop.last else "" }} + ({{ column }} - avg_{{ column }}) / (stddev_{{ column }}) + as {{ column if overwrite_columns else column + "_SCALED" }}{{ ", 
" if not loop.last else "" }} {%- endfor %} - from {{source_table}} -) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, -{%- for column in columns_to_scale %} - ({{column}} - avg_{{column}}) / (stddev_{{column}}) as {{column if overwrite_columns else column + "_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from avg_stddev_vals, {{source_table}} +from avg_stddev_vals, {{ source_table }} {%- else -%} -select {{ untouched_cols }}, -{%- for column in columns_to_scale %} - ({{column}} - {{averages[loop.index0]}}) / ({{standarddevs[loop.index0]}}) as {{column if overwrite_columns else column + "_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from {{source_table}} +select + {{ untouched_cols }}, + {%- for column in columns_to_scale %} + ({{ column }} - {{ averages[loop.index0] }}) / ({{ standarddevs[loop.index0] }}) + as {{ column if overwrite_columns else column + "_SCALED" }}{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} {%- endif -%} {%- elif method == 'min_max' -%} {%- if minimums is not defined -%} -with min_max_vals as ( - select +with + min_max_vals as ( + select + {%- for column in columns_to_scale %} + min({{ column }}) as min_{{ column }}, + max({{ column }}) as max_{{ column }}{{ "," if not loop.last else "" }} + {%- endfor %} + from {{ source_table }} + ) +select + {{ source_table + ".*" if not overwrite_columns else untouched_cols }}, {%- for column in columns_to_scale %} - min({{column}}) as min_{{column}}, - max({{column}}) as max_{{column}}{{ "," if not loop.last else "" }} + ({{ column }} - min_{{ column }}) / (max_{{ column }} - min_{{ column }}) + as {{ column if overwrite_columns else column + "_SCALED" }}{{ ", " if not loop.last else "" }} {%- endfor %} - from {{source_table}} -) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, -{%- for column in columns_to_scale %} - ({{column}} - min_{{column}}) / (max_{{column}} - min_{{column}}) as 
{{column if overwrite_columns else column + "_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from min_max_vals, {{source_table}} +from min_max_vals, {{ source_table }} {%- else -%} -select {{ untouched_cols }}, -{%- for column in columns_to_scale %} - ({{column}} - {{minimums[loop.index0]}}) / ({{maximums[loop.index0]}} - {{minimums[loop.index0]}}) as {{column if overwrite_columns else column + "_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from {{source_table}} +select + {{ untouched_cols }}, + {%- for column in columns_to_scale %} + ({{ column }} - {{ minimums[loop.index0] }}) / ( + {{ maximums[loop.index0] }} - {{ minimums[loop.index0] }} + ) + as {{ column if overwrite_columns else column + "_SCALED" }}{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} {%- endif -%} {%- else -%} -{{ raise_exception("Method '" + method + "' is not recognized. The supported methods are 'standard' and 'min_max'.")}} +{{ raise_exception("Method '" + method + "' is not recognized. 
The supported methods are 'standard' and 'min_max'.") }} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.sql b/rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.sql index e6dd149b..e0e2b533 100644 --- a/rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.sql +++ b/rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.sql @@ -1,33 +1,50 @@ -WITH CTE_RANK AS ( -SELECT *, ROW_NUMBER() OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ASC) AS RANK_{{ order_col }} -FROM {{ source_table }} -) , -CTE_WINDOW AS ( -SELECT A.{{ partition_col }}, A.RANK_{{ order_col }}, -ARRAY_AGG(ARRAY_CONSTRUCT(B.{{ value_col }}, B.RANK_{{ order_col }})) ARRAY_AGG_OBJ -FROM CTE_RANK A -JOIN CTE_RANK B -ON A.{{ partition_col }}=B.{{ partition_col }} -AND A.RANK_{{ order_col }} BETWEEN B.RANK_{{ order_col }} AND B.RANK_{{ order_col }}+{{ window }} -GROUP BY A.{{ partition_col }}, A.RANK_{{ order_col }} -), -CTE_SLOPE AS -( -SELECT {{ partition_col }}, RANK_{{ order_col }} - , regr_slope(X.VALUE[0], X.VALUE[1]) AS {{ value_col }}_SLOPE_{{ window }} -FROM CTE_WINDOW, table(flatten(ARRAY_AGG_OBJ)) X -GROUP BY {{ partition_col }}, RANK_{{ order_col }} -), -CTE_RESULT AS -( -SELECT A.{{ partition_col }}, A.{{ order_col }}, B.{{ value_col }}_SLOPE_{{ window }} -FROM CTE_RANK A -INNER JOIN CTE_SLOPE B -ON A.{{ partition_col }} = B.{{ partition_col }} -AND A.RANK_{{ order_col }} = B.RANK_{{ order_col }} -) -SELECT A.*, B.{{ value_col }}_SLOPE_{{ window }} -FROM {{ source_table }} A -LEFT OUTER JOIN CTE_RESULT B -ON A.{{ partition_col }} = B.{{ partition_col }} -AND A.{{ order_col }} = B.{{ order_col }} \ No newline at end of file +with + cte_rank as ( + select + *, + row_number() over ( + partition by {{ partition_col }} order by {{ order_col }} asc + ) as rank_{{ order_col }} + from {{ source_table }} + ), + cte_window as ( + select + 
a.{{ partition_col }}, + a.rank_{{ order_col }}, + array_agg( + array_construct(b.{{ value_col }}, b.rank_{{ order_col }}) + ) array_agg_obj + from cte_rank a + join + cte_rank b + on a.{{ partition_col }}= b.{{ partition_col }} + and a.rank_{{ order_col }} + between b.rank_{{ order_col }} and b.rank_{{ order_col }} + +{{ window }} + group by a.{{ partition_col }}, a.rank_{{ order_col }} + ), + cte_slope as ( + select + {{ partition_col }}, + rank_{{ order_col }}, + regr_slope(x.value[0], x.value[1]) as {{ value_col }}_slope_{{ window }} + from cte_window, table(flatten(array_agg_obj)) x + group by {{ partition_col }}, rank_{{ order_col }} + ), + cte_result as ( + select + a.{{ partition_col }}, + a.{{ order_col }}, + b.{{ value_col }}_slope_{{ window }} + from cte_rank a + inner join + cte_slope b + on a.{{ partition_col }} = b.{{ partition_col }} + and a.rank_{{ order_col }} = b.rank_{{ order_col }} + ) +select a.*, b.{{ value_col }}_slope_{{ window }} +from {{ source_table }} a +left outer join + cte_result b + on a.{{ partition_col }} = b.{{ partition_col }} + and a.{{ order_col }} = b.{{ order_col }} diff --git a/rasgotransforms/rasgotransforms/transforms/sliding_slope/snowflake/sliding_slope.sql b/rasgotransforms/rasgotransforms/transforms/sliding_slope/snowflake/sliding_slope.sql index e6dd149b..e0e2b533 100644 --- a/rasgotransforms/rasgotransforms/transforms/sliding_slope/snowflake/sliding_slope.sql +++ b/rasgotransforms/rasgotransforms/transforms/sliding_slope/snowflake/sliding_slope.sql @@ -1,33 +1,50 @@ -WITH CTE_RANK AS ( -SELECT *, ROW_NUMBER() OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ASC) AS RANK_{{ order_col }} -FROM {{ source_table }} -) , -CTE_WINDOW AS ( -SELECT A.{{ partition_col }}, A.RANK_{{ order_col }}, -ARRAY_AGG(ARRAY_CONSTRUCT(B.{{ value_col }}, B.RANK_{{ order_col }})) ARRAY_AGG_OBJ -FROM CTE_RANK A -JOIN CTE_RANK B -ON A.{{ partition_col }}=B.{{ partition_col }} -AND A.RANK_{{ order_col }} BETWEEN B.RANK_{{ 
order_col }} AND B.RANK_{{ order_col }}+{{ window }} -GROUP BY A.{{ partition_col }}, A.RANK_{{ order_col }} -), -CTE_SLOPE AS -( -SELECT {{ partition_col }}, RANK_{{ order_col }} - , regr_slope(X.VALUE[0], X.VALUE[1]) AS {{ value_col }}_SLOPE_{{ window }} -FROM CTE_WINDOW, table(flatten(ARRAY_AGG_OBJ)) X -GROUP BY {{ partition_col }}, RANK_{{ order_col }} -), -CTE_RESULT AS -( -SELECT A.{{ partition_col }}, A.{{ order_col }}, B.{{ value_col }}_SLOPE_{{ window }} -FROM CTE_RANK A -INNER JOIN CTE_SLOPE B -ON A.{{ partition_col }} = B.{{ partition_col }} -AND A.RANK_{{ order_col }} = B.RANK_{{ order_col }} -) -SELECT A.*, B.{{ value_col }}_SLOPE_{{ window }} -FROM {{ source_table }} A -LEFT OUTER JOIN CTE_RESULT B -ON A.{{ partition_col }} = B.{{ partition_col }} -AND A.{{ order_col }} = B.{{ order_col }} \ No newline at end of file +with + cte_rank as ( + select + *, + row_number() over ( + partition by {{ partition_col }} order by {{ order_col }} asc + ) as rank_{{ order_col }} + from {{ source_table }} + ), + cte_window as ( + select + a.{{ partition_col }}, + a.rank_{{ order_col }}, + array_agg( + array_construct(b.{{ value_col }}, b.rank_{{ order_col }}) + ) array_agg_obj + from cte_rank a + join + cte_rank b + on a.{{ partition_col }}= b.{{ partition_col }} + and a.rank_{{ order_col }} + between b.rank_{{ order_col }} and b.rank_{{ order_col }} + +{{ window }} + group by a.{{ partition_col }}, a.rank_{{ order_col }} + ), + cte_slope as ( + select + {{ partition_col }}, + rank_{{ order_col }}, + regr_slope(x.value[0], x.value[1]) as {{ value_col }}_slope_{{ window }} + from cte_window, table(flatten(array_agg_obj)) x + group by {{ partition_col }}, rank_{{ order_col }} + ), + cte_result as ( + select + a.{{ partition_col }}, + a.{{ order_col }}, + b.{{ value_col }}_slope_{{ window }} + from cte_rank a + inner join + cte_slope b + on a.{{ partition_col }} = b.{{ partition_col }} + and a.rank_{{ order_col }} = b.rank_{{ order_col }} + ) +select a.*, b.{{ value_col 
}}_slope_{{ window }} +from {{ source_table }} a +left outer join + cte_result b + on a.{{ partition_col }} = b.{{ partition_col }} + and a.{{ order_col }} = b.{{ order_col }} diff --git a/rasgotransforms/rasgotransforms/transforms/split_column/split_column.sql b/rasgotransforms/rasgotransforms/transforms/split_column/split_column.sql index d7d0b572..357c62c7 100644 --- a/rasgotransforms/rasgotransforms/transforms/split_column/split_column.sql +++ b/rasgotransforms/rasgotransforms/transforms/split_column/split_column.sql @@ -1,45 +1,58 @@ {%- set source_col_names = get_columns(source_table) -%} -with deliminated as ( - select *, - {% for col in output_cols %} - {%- if loop.first -%} - case when CHARINDEX('{{ sep }}', {{ target_col }}) > 0 - then CHARINDEX('{{ sep }}', {{ target_col }}) - else len({{ target_col }}) + 1 - end as IX_{{ col }}, - {% elif not loop.last-%} - case when CHARINDEX('{{ sep }}', {{ target_col }}, IX_{{ loop.previtem }} + 1) > 0 - then CHARINDEX('{{ sep }}', {{ target_col }}, IX_{{ loop.previtem }} + 1) - else len({{ target_col }}) + 1 - end as IX_{{ col }}, - {% else -%} - len({{ target_col }}) + 1 as IX_{{ col }}, - {% endif -%} - {%- endfor %} +with + deliminated as ( + select + *, + {% for col in output_cols %} + {%- if loop.first -%} + case + when charindex('{{ sep }}', {{ target_col }}) > 0 + then charindex('{{ sep }}', {{ target_col }}) + else len({{ target_col }}) + 1 + end as ix_{{ col }}, + {% elif not loop.last -%} + case + when + charindex('{{ sep }}', {{ target_col }}, ix_{{ loop.previtem }} + 1) + > 0 + then + charindex('{{ sep }}', {{ target_col }}, ix_{{ loop.previtem }} + 1) + else len({{ target_col }}) + 1 + end as ix_{{ col }}, + {% else -%} len({{ target_col }}) + 1 as ix_{{ col }}, + {% endif -%} + {%- endfor %} - {%- for col in output_cols %} - {%- if loop.first -%} - case when IX_{{ col }} > 0 - then substring({{ target_col }}, 1, IX_{{ col }} - 1) - else NULL - end as {{ col }}, - {% else -%} - case when IX_{{ col }} > 
IX_{{ loop.previtem }} - then substring({{ target_col }}, IX_{{ loop.previtem }} + 1, IX_{{ col }} - IX_{{ loop.previtem }} - 1) - else NULL - end as {{ col }}{{ ", " if not loop.last else " " }} - {% endif -%} - {%- endfor %} - from {{ source_table }} limit 100 -) select + {%- for col in output_cols %} + {%- if loop.first -%} + case + when ix_{{ col }} > 0 + then substring({{ target_col }}, 1, ix_{{ col }} - 1) + else null + end as {{ col }}, + {% else -%} + case + when ix_{{ col }} > ix_{{ loop.previtem }} + then + substring( + {{ target_col }}, + ix_{{ loop.previtem }} + 1, + ix_{{ col }} - ix_{{ loop.previtem }} - 1 + ) + else null + end as {{ col }}{{ ", " if not loop.last else " " }} + {% endif -%} + {%- endfor %} + from {{ source_table }} + limit 100 + ) +select {% for col in source_col_names -%} {{ col }} {%- if col == target_col -%} - {%- for output_col in output_cols -%} - , {{ output_col }} - {%- endfor -%} + {%- for output_col in output_cols -%}, {{ output_col }} {%- endfor -%} {%- endif -%} {{ ", " if not loop.last else " " }} {%- endfor %} -from deliminated \ No newline at end of file +from deliminated diff --git a/rasgotransforms/rasgotransforms/transforms/standard_scaler/standard_scaler.sql b/rasgotransforms/rasgotransforms/transforms/standard_scaler/standard_scaler.sql index a81d2422..8aa62f9e 100644 --- a/rasgotransforms/rasgotransforms/transforms/standard_scaler/standard_scaler.sql +++ b/rasgotransforms/rasgotransforms/transforms/standard_scaler/standard_scaler.sql @@ -1,23 +1,31 @@ {%- set untouched_cols = get_columns(source_table)|list|reject('in', columns_to_scale)|join(',') if overwrite_columns else "*" -%} {%- if averages is not defined or standarddevs is not defined -%} -with avg_stddev_vals as ( - select +with + avg_stddev_vals as ( + select + {%- for column in columns_to_scale %} + avg({{ column }}) as avg_{{ column }}, + stddev( + {{ column }} + ) as stddev_{{ column }}{{ ", " if not loop.last else "" }} + {%- endfor %} + from {{ 
source_table }} + ) +select + {{ source_table + ".*" if not overwrite_columns else untouched_cols }}, {%- for column in columns_to_scale %} - avg({{column}}) as avg_{{column}}, - stddev({{column}}) as stddev_{{column}}{{ ", " if not loop.last else "" }} + ({{ column }} - avg_{{ column }}) / (stddev_{{ column }}) + as {{ column if overwrite_columns else column + "_STANDARD_SCALED" }}{{ ", " if not loop.last else "" }} {%- endfor %} - from {{source_table}} -) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, -{%- for column in columns_to_scale %} - ({{column}} - avg_{{column}}) / (stddev_{{column}}) as {{column if overwrite_columns else column + "_STANDARD_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from avg_stddev_vals, {{source_table}} +from avg_stddev_vals, {{ source_table }} {%- else -%} -select {{ untouched_cols }}, -{%- for column in columns_to_scale %} - ({{column}} - {{averages[loop.index0]}}) / ({{standarddevs[loop.index0]}}) as {{column if overwrite_columns else column + "_STANDARD_SCALED"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from {{source_table}} -{%- endif -%} \ No newline at end of file +select + {{ untouched_cols }}, + {%- for column in columns_to_scale %} + ({{ column }} - {{ averages[loop.index0] }}) / ({{ standarddevs[loop.index0] }}) + as {{ column if overwrite_columns else column + "_STANDARD_SCALED" }}{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} +{%- endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/substring/substring.sql b/rasgotransforms/rasgotransforms/transforms/substring/substring.sql index 2676be0b..bb0f2c67 100644 --- a/rasgotransforms/rasgotransforms/transforms/substring/substring.sql +++ b/rasgotransforms/rasgotransforms/transforms/substring/substring.sql @@ -1,8 +1,14 @@ -SELECT -* -{% if end_pos %} - , SUBSTR({{ target_col }}, {{ start_pos }}, {{ end_pos }}) AS SUBSTRING_{{ cleanse_name(target_col) }}_{{ start_pos }}_{{ end_pos 
}} -{% else %} - , SUBSTR({{ target_col }}, {{ start_pos }}) AS SUBSTRING_{{ cleanse_name(target_col) }}_{{ start_pos }} -{% endif %} -FROM {{ source_table }} +select + * + {% if end_pos %} + , + substr( + {{ target_col }}, {{ start_pos }}, {{ end_pos }} + ) as substring_{{ cleanse_name(target_col) }}_{{ start_pos }}_{{ end_pos }} + {% else %} + , + substr( + {{ target_col }}, {{ start_pos }} + ) as substring_{{ cleanse_name(target_col) }}_{{ start_pos }} + {% endif %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql b/rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql index 484651fd..b17b069d 100644 --- a/rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql +++ b/rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql @@ -1,7 +1,7 @@ {%- set source_col_names = get_columns(source_table) -%} {%- set alias = cleanse_name(suffix) -%} -SELECT -{%- for column in source_col_names %} - {{column}} AS {{ column~'_'~alias }}{{',' if not loop.last else ''}} -{%- endfor %} -FROM {{ source_table }} \ No newline at end of file +select + {%- for column in source_col_names %} + {{ column }} as {{ column~'_'~alias }}{{ ',' if not loop.last else '' }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.sql b/rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.sql index 380d45cc..1fa78d70 100644 --- a/rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.sql +++ b/rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.sql @@ -1,21 +1,31 @@ -WITH CTE_SEQUENCES AS ( - SELECT - T.*, - ROW_NUMBER() OVER (PARTITION BY {%- for group_item in group_by %} {{ group_item }},{%- endfor -%} {{ value_col }} ORDER BY {{ order_col }}) AS RN_R97_B42_O, - ROW_NUMBER() OVER (ORDER BY {%- for group_item in group_by %} {{ group_item }},{%- endfor -%} {{ order_col 
}}) AS RN_R97_B42_E - FROM - {{ source_table }} T -) -SELECT - {%- for group_item in group_by %} S.{{ group_item }},{%- endfor -%} - S.{{ value_col }} as REPEATED_VALUE, - MIN(S.{{ order_col }}) AS FLATLINE_START_DATE, - MAX(S.{{ order_col }}) AS FLATLINE_END_DATE, - COUNT(*) AS OCCURRENCE_COUNT -FROM - CTE_SEQUENCES S -GROUP BY - {%- for group_item in group_by %} S.{{ group_item }},{%- endfor -%} - S.{{ value_col }}, - S.RN_R97_B42_E - S.RN_R97_B42_O -HAVING COUNT(*) > {{ min_repeat_count }} \ No newline at end of file +with + cte_sequences as ( + select + t.*, + row_number() over ( + partition by + {%- for group_item in group_by %} + {{ group_item }}, + {%- endfor -%} {{ value_col }} + order by {{ order_col }} + ) as rn_r97_b42_o, + row_number() over ( + order by + {%- for group_item in group_by %} + {{ group_item }}, + {%- endfor -%} {{ order_col }} + ) as rn_r97_b42_e + from {{ source_table }} t + ) +select + {%- for group_item in group_by %} s.{{ group_item }},{%- endfor -%} + s.{{ value_col }} as repeated_value, + min(s.{{ order_col }}) as flatline_start_date, + max(s.{{ order_col }}) as flatline_end_date, + count(*) as occurrence_count +from cte_sequences s +group by + {%- for group_item in group_by %} s.{{ group_item }},{%- endfor -%} + s.{{ value_col }}, + s.rn_r97_b42_e - s.rn_r97_b42_o +having count(*) > {{ min_repeat_count }} diff --git a/rasgotransforms/rasgotransforms/transforms/summarize_islands/bigquery/summarize_islands.sql b/rasgotransforms/rasgotransforms/transforms/summarize_islands/bigquery/summarize_islands.sql index 716cf43e..5ff4bb8c 100644 --- a/rasgotransforms/rasgotransforms/transforms/summarize_islands/bigquery/summarize_islands.sql +++ b/rasgotransforms/rasgotransforms/transforms/summarize_islands/bigquery/summarize_islands.sql @@ -1,52 +1,153 @@ -WITH CTE_CONDITION AS ( -SELECT {{ date_col }} AS dtm {% if group_cols -%},{% endif -%} -{% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " 
}}{%- endfor %}{% endif -%} -FROM {{ source_table }} -WHERE -{% if conditions -%}{% for condition in conditions -%}{{condition}}{{ " AND " if not loop.last else " " }}{%- endfor %}{% endif -%} -{% if conditions -%}AND {% endif -%}{{ date_col }} is not null -), -CTE_LAGGED AS ( -SELECT -dtm{% if group_cols -%},{% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%}, -LAG(dtm) -OVER ({% if group_cols -%}PARTITION BY {% endif -%}{% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS previous_datetime, -LEAD(dtm) -OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS next_datetime, -ROW_NUMBER() OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY CTE_CONDITION.dtm) -AS island_location -FROM CTE_CONDITION), -CTE_ISLAND_START AS ( -SELECT -ROW_NUMBER() OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS island_number{% if group_cols -%},{% endif -%} -{% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%}, -dtm AS island_start_datetime, -island_location AS island_start_location -FROM CTE_LAGGED -WHERE (DATE_DIFF( dtm, previous_datetime, {{ buffer_date_part }}) > {{ buffer_size }} -OR CTE_LAGGED.previous_datetime IS NULL)), -CTE_ISLAND_END AS ( -SELECT -ROW_NUMBER() -OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not 
loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS island_number{% if group_cols -%},{% endif -%} -{% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%}, -dtm AS island_end_datetime, -island_location AS island_end_location -FROM CTE_LAGGED -WHERE DATE_DIFF(next_datetime, dtm, {{ buffer_date_part }}) > {{ buffer_size }} OR CTE_LAGGED.next_datetime IS NULL) -SELECT -{% if group_cols -%}{% for group_col in group_cols -%}CTE_ISLAND_START.{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %},{% endif -%} -CTE_ISLAND_START.island_start_datetime, -CTE_ISLAND_END.island_end_datetime, -DATE_DIFF(CTE_ISLAND_END.island_end_datetime, CTE_ISLAND_START.island_start_datetime, {{ buffer_date_part }}) AS ISLAND_DURATION_{{ buffer_date_part }}, -(SELECT COUNT(*) -FROM CTE_LAGGED -WHERE CTE_LAGGED.dtm BETWEEN -CTE_ISLAND_START.island_start_datetime AND -CTE_ISLAND_END.island_end_datetime -{% if group_cols -%} AND {% for group_col in group_cols -%}CTE_LAGGED.{{ group_col }} = CTE_ISLAND_START.{{ group_col }} AND CTE_LAGGED.{{ group_col }} = CTE_ISLAND_START.{{ group_col }}{{ " AND " if not loop.last else " " }}{%- endfor %}{% endif -%} -) -AS island_row_count -FROM CTE_ISLAND_START -INNER JOIN CTE_ISLAND_END ON CTE_ISLAND_END.island_number = CTE_ISLAND_START.island_number -{% if group_cols -%}{% for group_col in group_cols %} AND CTE_ISLAND_START.{{ group_col }} = CTE_ISLAND_END.{{ group_col }}{%- endfor %}{% endif -%} +with + cte_condition as ( + select + {{ date_col }} as dtm + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + from {{ source_table }} + where + {% if conditions -%} + {% for condition in conditions -%} + {{ condition }}{{ " AND " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + {% if conditions -%} and {% endif -%} {{ date_col }} 
is not null + ), + cte_lagged as ( + select + dtm + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%}, + lag(dtm) over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as previous_datetime, + lead(dtm) over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as next_datetime, + row_number() over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by cte_condition.dtm + ) as island_location + from cte_condition + ), + cte_island_start as ( + select + row_number() over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as island_number + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%}, + dtm as island_start_datetime, + island_location as island_start_location + from cte_lagged + where + ( + date_diff(dtm, previous_datetime, {{ buffer_date_part }}) + > {{ buffer_size }} + or cte_lagged.previous_datetime is null + ) + ), + cte_island_end as ( + select + row_number() over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif 
-%} + order by dtm + ) as island_number + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%}, + dtm as island_end_datetime, + island_location as island_end_location + from cte_lagged + where + date_diff(next_datetime, dtm, {{ buffer_date_part }}) > {{ buffer_size }} + or cte_lagged.next_datetime is null + ) +select + {% if group_cols -%} + {% for group_col in group_cols -%} + cte_island_start.{{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %}, + {% endif -%} + cte_island_start.island_start_datetime, + cte_island_end.island_end_datetime, + date_diff( + cte_island_end.island_end_datetime, + cte_island_start.island_start_datetime, + {{ buffer_date_part }} + ) as island_duration_{{ buffer_date_part }}, + ( + select count(*) + from cte_lagged + where + cte_lagged.dtm between cte_island_start.island_start_datetime and + cte_island_end.island_end_datetime + {% if group_cols -%} + and {% for group_col in group_cols -%} + cte_lagged.{{ group_col }} = cte_island_start.{{ group_col }} + and cte_lagged.{{ group_col }} + = cte_island_start.{{ group_col }}{{ " AND " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + ) as island_row_count +from cte_island_start +inner join + cte_island_end on cte_island_end.island_number = cte_island_start.island_number + {% if group_cols -%} + {% for group_col in group_cols %} + and cte_island_start.{{ group_col }} = cte_island_end.{{ group_col }} + {%- endfor %} + {% endif -%} diff --git a/rasgotransforms/rasgotransforms/transforms/summarize_islands/snowflake/summarize_islands.sql b/rasgotransforms/rasgotransforms/transforms/summarize_islands/snowflake/summarize_islands.sql index df9374e0..ad5332ba 100644 --- a/rasgotransforms/rasgotransforms/transforms/summarize_islands/snowflake/summarize_islands.sql +++ 
b/rasgotransforms/rasgotransforms/transforms/summarize_islands/snowflake/summarize_islands.sql @@ -1,52 +1,153 @@ -WITH CTE_CONDITION AS ( -SELECT {{ date_col }} AS dtm {% if group_cols -%},{% endif -%} - {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} -FROM {{ source_table }} -WHERE -{% if conditions -%}{% for condition in conditions -%}{{condition}}{{ " AND " if not loop.last else " " }}{%- endfor %}{% endif -%} -{% if conditions -%}AND {% endif -%}{{ date_col }} is not null -), -CTE_LAGGED AS ( - SELECT - dtm{% if group_cols -%},{% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%}, - LAG(dtm) - OVER ({% if group_cols -%}PARTITION BY {% endif -%}{% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS previous_datetime, - LEAD(dtm) - OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS next_datetime, - ROW_NUMBER() OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY CTE_CONDITION.dtm) - AS island_location - FROM CTE_CONDITION), -CTE_ISLAND_START AS ( - SELECT - ROW_NUMBER() OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS island_number{% if group_cols -%},{% endif -%} - {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%}, - dtm AS island_start_datetime, - island_location AS island_start_location - FROM 
CTE_LAGGED - WHERE (DATEDIFF({{ buffer_date_part }}, previous_datetime, dtm) > {{ buffer_size }} - OR CTE_LAGGED.previous_datetime IS NULL)), -CTE_ISLAND_END AS ( - SELECT - ROW_NUMBER() - OVER ({% if group_cols -%}PARTITION BY {% endif -%} {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%} ORDER BY dtm) AS island_number{% if group_cols -%},{% endif -%} - {% if group_cols -%}{% for group_col in group_cols -%}{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %}{% endif -%}, - dtm AS island_end_datetime, - island_location AS island_end_location - FROM CTE_LAGGED - WHERE DATEDIFF({{ buffer_date_part }}, dtm, next_datetime) > {{ buffer_size }} OR CTE_LAGGED.next_datetime IS NULL) -SELECT -{% if group_cols -%}{% for group_col in group_cols -%}CTE_ISLAND_START.{{group_col}}{{ ", " if not loop.last else " " }}{%- endfor %},{% endif -%} - CTE_ISLAND_START.island_start_datetime, - CTE_ISLAND_END.island_end_datetime, - DATEDIFF({{ buffer_date_part }}, CTE_ISLAND_START.island_start_datetime, CTE_ISLAND_END.island_end_datetime) AS ISLAND_DURATION_{{ buffer_date_part }}, - (SELECT COUNT(*) - FROM CTE_LAGGED - WHERE CTE_LAGGED.dtm BETWEEN - CTE_ISLAND_START.island_start_datetime AND - CTE_ISLAND_END.island_end_datetime -{% if group_cols -%} AND {% for group_col in group_cols -%}CTE_LAGGED.{{ group_col }} = CTE_ISLAND_START.{{ group_col }} AND CTE_LAGGED.{{ group_col }} = CTE_ISLAND_START.{{ group_col }}{{ " AND " if not loop.last else " " }}{%- endfor %}{% endif -%} -) - AS island_row_count -FROM CTE_ISLAND_START -INNER JOIN CTE_ISLAND_END ON CTE_ISLAND_END.island_number = CTE_ISLAND_START.island_number -{% if group_cols -%}{% for group_col in group_cols %} AND CTE_ISLAND_START.{{ group_col }} = CTE_ISLAND_END.{{ group_col }}{%- endfor %}{% endif -%} \ No newline at end of file +with + cte_condition as ( + select + {{ date_col }} as dtm + {% if group_cols -%},{% endif -%} + {% if group_cols 
-%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + from {{ source_table }} + where + {% if conditions -%} + {% for condition in conditions -%} + {{ condition }}{{ " AND " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + {% if conditions -%} and {% endif -%} {{ date_col }} is not null + ), + cte_lagged as ( + select + dtm + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%}, + lag(dtm) over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as previous_datetime, + lead(dtm) over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as next_datetime, + row_number() over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by cte_condition.dtm + ) as island_location + from cte_condition + ), + cte_island_start as ( + select + row_number() over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as island_number + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%}, + dtm as island_start_datetime, + island_location as island_start_location + from cte_lagged + where + ( + datediff({{ 
buffer_date_part }}, previous_datetime, dtm) + > {{ buffer_size }} + or cte_lagged.previous_datetime is null + ) + ), + cte_island_end as ( + select + row_number() over ( + {% if group_cols -%} + partition by + {% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + order by dtm + ) as island_number + {% if group_cols -%},{% endif -%} + {% if group_cols -%} + {% for group_col in group_cols -%} + {{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %} + {% endif -%}, + dtm as island_end_datetime, + island_location as island_end_location + from cte_lagged + where + datediff({{ buffer_date_part }}, dtm, next_datetime) > {{ buffer_size }} + or cte_lagged.next_datetime is null + ) +select + {% if group_cols -%} + {% for group_col in group_cols -%} + cte_island_start.{{ group_col }}{{ ", " if not loop.last else " " }} + {%- endfor %}, + {% endif -%} + cte_island_start.island_start_datetime, + cte_island_end.island_end_datetime, + datediff( + {{ buffer_date_part }}, + cte_island_start.island_start_datetime, + cte_island_end.island_end_datetime + ) as island_duration_{{ buffer_date_part }}, + ( + select count(*) + from cte_lagged + where + cte_lagged.dtm between cte_island_start.island_start_datetime and + cte_island_end.island_end_datetime + {% if group_cols -%} + and {% for group_col in group_cols -%} + cte_lagged.{{ group_col }} = cte_island_start.{{ group_col }} + and cte_lagged.{{ group_col }} + = cte_island_start.{{ group_col }}{{ " AND " if not loop.last else " " }} + {%- endfor %} + {% endif -%} + ) as island_row_count +from cte_island_start +inner join + cte_island_end on cte_island_end.island_number = cte_island_start.island_number + {% if group_cols -%} + {% for group_col in group_cols %} + and cte_island_start.{{ group_col }} = cte_island_end.{{ group_col }} + {%- endfor %} + {% endif -%} diff --git 
a/rasgotransforms/rasgotransforms/transforms/table/table.sql b/rasgotransforms/rasgotransforms/transforms/table/table.sql index f16ccb63..0ee83e04 100644 --- a/rasgotransforms/rasgotransforms/transforms/table/table.sql +++ b/rasgotransforms/rasgotransforms/transforms/table/table.sql @@ -1,24 +1,24 @@ -{%- if num_rows is not defined -%} - {%- set row_count = 10 -%} -{%- else -%} - {%- set row_count = num_rows -%} +{%- if num_rows is not defined -%} {%- set row_count = 10 -%} +{%- else -%} {%- set row_count = num_rows -%} {%- endif -%} -SELECT * -FROM {{ source_table }} -{%- if filters is defined and filters %} +select * +from + {{ source_table }} + {%- if filters is defined and filters %} {% for filter_block in filters %} {%- set oloop = loop -%} {{ 'WHERE ' if oloop.first else ' AND ' }} - {%- if filter_block is not mapping -%} - {{ filter_block }} - {%- else -%} - {%- if filter_block['operator'] == 'CONTAINS' -%} - {{ filter_block['operator'] }}({{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }}) - {%- else -%} - {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} - {%- endif -%} - {%- endif -%} + {%- if filter_block is not mapping -%} {{ filter_block }} + {%- else -%} + {%- if filter_block['operator'] == 'CONTAINS' -%} + {{ filter_block['operator'] }} ( + {{ filter_block['columnName'] }}, {{ filter_block['comparisonValue'] }} + ) + {%- else -%} + {{ filter_block['columnName'] }} {{ filter_block['operator'] }} {{ filter_block['comparisonValue'] }} + {%- endif -%} + {%- endif -%} {%- endfor -%} -{%- endif %} -LIMIT {{ row_count }} \ No newline at end of file + {%- endif %} +limit {{ row_count }} diff --git a/rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql b/rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql index 94966735..3c381be5 100644 --- a/rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql +++ 
b/rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql @@ -1,10 +1,12 @@ -with means as ( - select distinct {{column}} as value, ROUND(AVG({{target}}), 3) as {{column}}_target_encoded - from {{ source_table }} - group by value) +with + means as ( + select distinct + {{ column }} as value, + round(avg({{ target }}), 3) as {{ column }}_target_encoded + from {{ source_table }} + group by value + ) -select t.*, m.{{column}}_target_encoded +select t.*, m.{{ column }}_target_encoded from {{ source_table }} t -left join -means m -on t.{{column}} = m.value \ No newline at end of file +left join means m on t.{{ column }} = m.value diff --git a/rasgotransforms/rasgotransforms/transforms/timeseries_agg/bigquery/timeseries_agg.sql b/rasgotransforms/rasgotransforms/transforms/timeseries_agg/bigquery/timeseries_agg.sql index 06a044b0..81eae8ea 100644 --- a/rasgotransforms/rasgotransforms/transforms/timeseries_agg/bigquery/timeseries_agg.sql +++ b/rasgotransforms/rasgotransforms/transforms/timeseries_agg/bigquery/timeseries_agg.sql @@ -1,22 +1,26 @@ -SELECT * -{% for offset in offsets -%} - {% set normalized_offset = -offset %} - {% for col, aggs in aggregations.items() -%} +select + * + {% for offset in offsets -%} + {% set normalized_offset = -offset %} + {% for col, aggs in aggregations.items() -%} {% for agg in aggs %} - ,( - SELECT {{ agg }}({{ col }}) - FROM {{ source_table }} i - WHERE - {% if normalized_offset > 0 -%} - i.{{ date }} BETWEEN o.{{ date }} AND DATE_ADD(o.{{ date }}, INTERVAL {{ normalized_offset }} {{ date_part }}) - {% else -%} - i.{{ date }} BETWEEN DATE_SUB(o.{{ date }}, INTERVAL {{ normalized_offset|abs }} {{ date_part }}) AND o.{{ date }} - {%- endif -%} - {%- for g in group_by %} - AND o.{{ g }} = i.{{ g }} - {% endfor -%} - ) AS {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} + , + ( + select {{ agg }} ({{ col }}) + from {{ source_table }} i + where + {% if normalized_offset > 0 -%} + i.{{ date }} between 
o.{{ date }} and date_add( + o.{{ date }}, interval {{ normalized_offset }} {{ date_part }} + ) + {% else -%} + i.{{ date }} between date_sub( + o.{{ date }}, interval {{ normalized_offset|abs }} {{ date_part }} + ) and o.{{ date }} + {%- endif -%} + {%- for g in group_by %} and o.{{ g }} = i.{{ g }} {% endfor -%} + ) as {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} {%- endfor -%} - {%- endfor %} -{% endfor %} -FROM {{ source_table }} o \ No newline at end of file + {%- endfor %} + {% endfor %} +from {{ source_table }} o diff --git a/rasgotransforms/rasgotransforms/transforms/timeseries_agg/snowflake/timeseries_agg.sql b/rasgotransforms/rasgotransforms/transforms/timeseries_agg/snowflake/timeseries_agg.sql index 82b7b810..4355b5a0 100644 --- a/rasgotransforms/rasgotransforms/transforms/timeseries_agg/snowflake/timeseries_agg.sql +++ b/rasgotransforms/rasgotransforms/transforms/timeseries_agg/snowflake/timeseries_agg.sql @@ -1,54 +1,44 @@ {%- macro create_cte_basic(group_by, offset, date, date_part, aggregations) -%} {% set normalized_offset = -offset %} {% set cte_name = cleanse_name('BASIC_OFFSET_' ~ offset ~ date_part) %} -{%- do cte_list.append(cte_name) -%} - -,{{ cte_name }} AS (SELECT -{% for g in group_by -%} - A.{{ g }}, -{%- endfor %} -A.{{ date }}, -{% for col, aggs in aggregations.items() -%} - {%- for agg in aggs %} - {%- if agg == 'ENTROPY' -%} - {%- set entropy_flag = True -%} - {%- endif -%} +{%- do cte_list.append(cte_name) -%}, +{{ cte_name }} as ( + select + {% for g in group_by -%} a.{{ g }}, {%- endfor %} + a.{{ date }}, + {% for col, aggs in aggregations.items() -%} + {%- for agg in aggs %} + {%- if agg == 'ENTROPY' -%} {%- set entropy_flag = True -%} {%- endif -%} {% if normalized_offset > 0 -%} - {%- set alias = cleanse_name(agg ~ '_' ~ col ~ '_NEXT' + offset|string + date_part) %} + {%- set alias = cleanse_name(agg ~ '_' ~ col ~ '_NEXT' + offset|string + date_part) %} {%- else -%} - {%- set alias = cleanse_name(agg ~ 
'_' ~ col ~ '_PAST' + offset|string + date_part) %} + {%- set alias = cleanse_name(agg ~ '_' ~ col ~ '_PAST' + offset|string + date_part) %} {%- endif -%} {%- if not entropy_flag %} - {%- if ' DISTINCT' in agg %} - {{ agg|replace(" DISTINCT", "") }}(DISTINCT B.{{ col }}) as {{ alias }}, - {%- else %} - {{ agg }}(B.{{ col }}) as {{ alias }}, - {%- endif -%} - {%- do final_col_list.append(cte_name ~ '.' ~ alias) -%} + {%- if ' DISTINCT' in agg %} + {{ agg|replace(" DISTINCT", "") }} (distinct b.{{ col }}) as {{ alias }}, + {%- else %} {{ agg }} (b.{{ col }}) as {{ alias }}, {%- endif -%} - {%- endfor -%} -{%- endfor -%} -COUNT(1) AS AGG_ROW_COUNT -FROM {{ source_table }} A -INNER JOIN {{ source_table }} B -ON - {% for g in group_by -%} - A.{{ g }} = B.{{ g }} AND - {% endfor %} - 1=1 -WHERE - {% if normalized_offset > 0 -%} - B.{{ date }} <= DATEADD({{ date_part }}, {{ normalized_offset }}, A.{{ date }}) - AND B.{{ date }} >= A.{{ date }} - {% else -%} - B.{{ date }} >= DATEADD({{ date_part }}, {{ normalized_offset }}, A.{{ date }}) - AND B.{{ date }} <= A.{{ date }} - {% endif %} -GROUP BY -{% for g in group_by %} - A.{{ g }}, -{% endfor -%} - A.{{ date }}) + {%- do final_col_list.append(cte_name ~ '.' 
~ alias) -%} + {%- endif -%} + {%- endfor -%} + {%- endfor -%} + count(1) as agg_row_count + from {{ source_table }} a + inner join + {{ source_table }} b + on {% for g in group_by -%} a.{{ g }} = b.{{ g }} and {% endfor %} + 1 = 1 + where + {% if normalized_offset > 0 -%} + b.{{ date }} <= dateadd({{ date_part }}, {{ normalized_offset }}, a.{{ date }}) + and b.{{ date }} >= a.{{ date }} + {% else -%} + b.{{ date }} >= dateadd({{ date_part }}, {{ normalized_offset }}, a.{{ date }}) + and b.{{ date }} <= a.{{ date }} + {% endif %} + group by {% for g in group_by %} a.{{ g }}, {% endfor -%} a.{{ date }} +) {%- endmacro -%} {%- macro create_cte_entropy(group_by, offset, date, date_part, entropy_aggs) -%} {% set normalized_offset = -offset %} @@ -58,82 +48,68 @@ GROUP BY {% set cte_name3 = cleanse_name('ENTROPY_OFFSET_' ~ offset ~ date_part ~ '_ENTROPY') %} {%- do cte_list.append(cte_name3) -%} {% if normalized_offset > 0 -%} - {%- set alias = cleanse_name(col ~ '_ENTROPY_NEXT' + offset|string + date_part) %} +{%- set alias = cleanse_name(col ~ '_ENTROPY_NEXT' + offset|string + date_part) %} {%- else -%} - {%- set alias = cleanse_name(col ~ '_ENTROPY_PAST' + offset|string + date_part) %} +{%- set alias = cleanse_name(col ~ '_ENTROPY_PAST' + offset|string + date_part) %} {%- endif -%} -{%- do final_col_list.append(cte_name3 ~ '.' ~ alias) -%} - , {{ cte_name1 }} AS (SELECT - {% for g in group_by -%} - A.{{ g }}, - {%- endfor %} - A.{{ date }}, - B.{{ col }}, - COUNT(1) AS C - FROM {{ source_table }} A - INNER JOIN {{ source_table }} B - ON - {% for g in group_by -%} - A.{{ g }} = B.{{ g }} AND - {% endfor %} - 1=1 - WHERE +{%- do final_col_list.append(cte_name3 ~ '.' 
~ alias) -%}, +{{ cte_name1 }} as ( + select + {% for g in group_by -%} a.{{ g }}, {%- endfor %} + a.{{ date }}, + b.{{ col }}, + count(1) as c + from {{ source_table }} a + inner join + {{ source_table }} b + on {% for g in group_by -%} a.{{ g }} = b.{{ g }} and {% endfor %} + 1 = 1 + where {% if normalized_offset > 0 -%} - B.{{ date }} >= DATEADD({{ date_part }}, {{ normalized_offset }}, A.{{ date }}) - AND B.{{ date }} > A.{{ date }} + b.{{ date }} >= dateadd({{ date_part }}, {{ normalized_offset }}, a.{{ date }}) + and b.{{ date }} > a.{{ date }} {% else -%} - B.{{ date }} >= DATEADD({{ date_part }}, {{ normalized_offset }}, A.{{ date }}) - AND B.{{ date }} <= A.{{ date }} + b.{{ date }} >= dateadd({{ date_part }}, {{ normalized_offset }}, a.{{ date }}) + and b.{{ date }} <= a.{{ date }} {% endif %} - GROUP BY - {% for g in group_by %} - A.{{ g }}, - {% endfor -%} - B.{{ col }}, - A.{{ date }} + group by {% for g in group_by %} a.{{ g }}, {% endfor -%} b.{{ col }}, a.{{ date }} +), +{{ cte_name2 }} as ( + select + {%- for group_item in group_by %} {{ group_item }}, {%- endfor -%} + {{ date }}, + {{ col }}, + c / sum(c) over (partition by {{ group_by | join(', ') }}, {{ date }}) as p + from {{ cte_name1 }} ), - {{ cte_name2 }} AS ( - SELECT - {%- for group_item in group_by %} - {{ group_item }}, - {%- endfor -%} - {{ date }}, - {{ col }}, - C / SUM(C) OVER (PARTITION BY {{ group_by | join(', ') }}, {{ date }}) AS P - FROM {{ cte_name1 }} - ), - {{ cte_name3 }} AS ( - SELECT - {%- for group_item in group_by %} - {{ group_item }}, - {%- endfor -%} - {{ date }}, - -SUM(P*LOG(2,P)) AS {{ alias }} - FROM {{ cte_name2 }} - GROUP BY {{ group_by | join(', ') }} - ,{{ date }}) +{{ cte_name3 }} as ( + select + {%- for group_item in group_by %} {{ group_item }}, {%- endfor -%} + {{ date }}, - sum(p * log(2, p)) as {{ alias }} + from {{ cte_name2 }} + group by {{ group_by | join(', ') }},{{ date }} +) {%- endfor -%} {%- endmacro -%} {%- set cte_list = [] -%} {%- set 
final_col_list = [] -%} {%- set entropy_aggs = {} -%} {%- for col, aggs in aggregations.items() -%} - {%- if 'ENTROPY' in aggs -%} - {%- set _ = entropy_aggs.update({col: aggs}) -%} - {%- endif -%} +{%- if 'ENTROPY' in aggs -%} +{%- set _ = entropy_aggs.update({col: aggs}) -%} +{%- endif -%} {%- endfor -%} -WITH DUMMY1 AS (SELECT NULL FROM {{ source_table }} WHERE 1=0) -{%- for offset in offsets -%} +with + dummy1 as (select null from {{ source_table }} where 1 = 0) + {%- for offset in offsets -%} {{ create_cte_basic(group_by=group_by, offset=offset, date=date, date_part=date_part, aggregations=aggregations) }} {{ create_cte_entropy(group_by=group_by, offset=offset, date=date, date_part=date_part, entropy_aggs=entropy_aggs) }} -{%- endfor -%} -,DUMMY2 AS (SELECT NULL FROM {{ source_table }} WHERE 1=0) -SELECT src.*, -{{ final_col_list|join(', ') }} FROM {{ source_table }} src + {%- endfor -%}, + dummy2 as (select null from {{ source_table }} where 1 = 0) +select src.*, {{ final_col_list|join(', ') }} +from {{ source_table }} src {% for cte in cte_list -%} -LEFT OUTER JOIN {{ cte }} -ON {{ cte }}.{{ date }} = src.{{ date }} - {%- for g in group_by %} - AND {{ cte }}.{{ g }} = src.{{ g }} - {% endfor -%} -{%- endfor -%} \ No newline at end of file +left outer join + {{ cte }} on {{ cte }}.{{ date }} = src.{{ date }} + {%- for g in group_by %} and {{ cte }}.{{ g }} = src.{{ g }} {% endfor -%} +{%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/timeseries_agg/timeseries_agg.sql b/rasgotransforms/rasgotransforms/transforms/timeseries_agg/timeseries_agg.sql index 7ffe9505..9ec64fec 100644 --- a/rasgotransforms/rasgotransforms/transforms/timeseries_agg/timeseries_agg.sql +++ b/rasgotransforms/rasgotransforms/transforms/timeseries_agg/timeseries_agg.sql @@ -1,22 +1,26 @@ -SELECT * -{% for offset in offsets -%} - {% set normalized_offset = -offset %} - {% for col, aggs in aggregations.items() -%} +select + * + {% for offset in offsets %} + {% set 
normalized_offset = -offset %} + {% for col, aggs in aggregations.items() %} {% for agg in aggs %} - ,( - SELECT {{ agg }}({{ col }}) - FROM {{ source_table }} i - WHERE - {% if normalized_offset > 0 -%} - i.{{ date }} BETWEEN o.{{ date }} AND (o.{{ date }} + INTERVAL {{ normalized_offset }} {{ date_part }}) - {% else -%} - i.{{ date }} BETWEEN (o.{{ date }} - INTERVAL {{ normalized_offset|abs }} {{ date_part }})) AND o.{{ date }} - {%- endif -%} - {%- for g in group_by %} - AND o.{{ g }} = i.{{ g }} - {% endfor -%} - ) AS {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} - {%- endfor -%} - {%- endfor %} -{% endfor %} -FROM {{ source_table }} o \ No newline at end of file + , + ( + select {{ agg }} ({{ col }}) + from {{ source_table }} i + where + {% if normalized_offset > 0 %} + i.{{ date }} between o.{{ date }} and ( + o.{{ date }} + interval {{ normalized_offset }} {{ date_part }} + ) + {% else %} + i.{{ date }} between ( + o.{{ date }} - interval {{ normalized_offset|abs }} {{ date_part }} + ) and o.{{ date }} + {% endif %} + {% for g in group_by %} and o.{{ g }} = i.{{ g }} {% endfor %} + ) as {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} + {% endfor %} + {% endfor %} + {% endfor %} +from {{ source_table }} o diff --git a/rasgotransforms/rasgotransforms/transforms/to_date/to_date.sql b/rasgotransforms/rasgotransforms/transforms/to_date/to_date.sql index d0c565ec..be2995a7 100644 --- a/rasgotransforms/rasgotransforms/transforms/to_date/to_date.sql +++ b/rasgotransforms/rasgotransforms/transforms/to_date/to_date.sql @@ -1,7 +1,9 @@ {%- set untouched_cols = get_columns(source_table)|list|reject('in', dates)|join(',') if overwrite_columns else "*" -%} -SELECT {{ untouched_cols }}, -{%- for target_col, date_format in dates.items() %} - DATE({{target_col}}, '{{date_format}}') as {{target_col if overwrite_columns else target_col + "_DATE"}}{{ ", " if not loop.last else "" }} -{%- endfor %} -from {{ source_table }} \ No 
newline at end of file +select + {{ untouched_cols }}, + {%- for target_col, date_format in dates.items() %} + date({{ target_col }}, '{{date_format}}') + as {{ target_col if overwrite_columns else target_col + "_DATE" }}{{ ", " if not loop.last else "" }} + {%- endfor %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.sql b/rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.sql index 9668cb60..a3676512 100644 --- a/rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.sql +++ b/rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.sql @@ -1,7 +1,17 @@ -SELECT *, -{%- if order_by is defined %} -CASE WHEN ROW_NUMBER() OVER (ORDER BY {{order_by | join(", ")}} ) < (COUNT(1) OVER () * {{train_percent}}) THEN 'TRAIN' ELSE 'TEST' END AS TT_SPLIT -{%- else %} -CASE WHEN MOD(RANDOM(), 1/{{train_percent}}) = 0 THEN 'TEST' ELSE 'TRAIN' END AS TT_SPLIT -{%- endif %} -FROM {{ source_table }} \ No newline at end of file +select + *, + {%- if order_by is defined %} + case + when + row_number() over (order by {{ order_by | join(", ") }}) < ( + count(1) over () * {{ train_percent }} + ) + then 'TRAIN' + else 'TEST' + end as tt_split + {%- else %} + case + when mod(random(), 1 /{{ train_percent }}) = 0 then 'TEST' else 'TRAIN' + end as tt_split + {%- endif %} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/union/union.sql b/rasgotransforms/rasgotransforms/transforms/union/union.sql index adbcaa0a..0fd10029 100644 --- a/rasgotransforms/rasgotransforms/transforms/union/union.sql +++ b/rasgotransforms/rasgotransforms/transforms/union/union.sql @@ -6,9 +6,12 @@ {# Get Unique Columns Across Both Datasets #} {%- set union_cols = source_col_names.keys()|list + other_source_col_names.keys()|list -%} -{%- set union_cols = union_cols | unique | list -%} +{%- set union_cols = union_cols | unique | list -%} {# Generate Union 
Query #} -SELECT {{ union_cols | join(', ') }} FROM {{ dataset2 }} -UNION {{ 'ALL' if keep_dupes else '' }} -SELECT {{ union_cols | join(', ') }} FROM {{ source_table }} \ No newline at end of file +select {{ union_cols | join(', ') }} +from {{ dataset2 }} +union +{{ 'ALL' if keep_dupes else '' }} +select {{ union_cols | join(', ') }} +from {{ source_table }} diff --git a/rasgotransforms/rasgotransforms/transforms/unions/unions.sql b/rasgotransforms/rasgotransforms/transforms/unions/unions.sql index 5e7f7182..5d7f4dfd 100644 --- a/rasgotransforms/rasgotransforms/transforms/unions/unions.sql +++ b/rasgotransforms/rasgotransforms/transforms/unions/unions.sql @@ -1,19 +1,20 @@ {# Get all Columns in Source Table #} {%- set source_col_names = get_columns(source_table) -%} -{% set ns = namespace(union_columns=source_col_names.keys()) %} +{% set ns = namespace(union_columns=source_col_names.keys()) %} {%- for utable in union_tables -%} - {%- set utable_cols = get_columns(utable) -%} - {%- set ns.union_columns = ns.union_columns|list|select("in", utable_cols.keys()|list) -%} +{%- set utable_cols = get_columns(utable) -%} +{%- set ns.union_columns = ns.union_columns|list|select("in", utable_cols.keys()|list) -%} {%- endfor -%} {%- set columns_to_select = ns.union_columns|join(', ') -%} {# Generate Union Query #} -SELECT {{ columns_to_select }} -FROM {{ source_table }} +select {{ columns_to_select }} +from {{ source_table }} {%- for u_table in union_tables %} -UNION {{ 'ALL' if not remove_duplicates else '' }} -SELECT {{ columns_to_select }} -FROM {{ u_table }} -{%- endfor -%} \ No newline at end of file +union +{{ 'ALL' if not remove_duplicates else '' }} +select {{ columns_to_select }} +from {{ u_table }} +{%- endfor -%} diff --git a/rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.sql b/rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.sql index 143d3f68..a6bf227c 100644 --- a/rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.sql +++ 
b/rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.sql @@ -1,2 +1,6 @@ -SELECT * FROM {{ source_table }} -UNPIVOT( {{ value_column }} for {{ name_column }} in ( {{ column_list | join(', ')}} )) +select * +from + {{ source_table }} + unpivot( + {{ value_column }} for {{ name_column }} in ({{ column_list | join(', ') }}) + )