From d7654d6528e0f75dd1aee719f69a69f8d791c975 Mon Sep 17 00:00:00 2001 From: Milind Srivastava Date: Sat, 7 Mar 2026 22:24:16 -0500 Subject: [PATCH 1/2] Added quotes to asap-sketch-ingest quote to preserve case --- asap-sketch-ingest/run_arroyosketch.py | 34 +++++++++++++++++--- asap-sketch-ingest/tests/test_integration.py | 21 ++++++------ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/asap-sketch-ingest/run_arroyosketch.py b/asap-sketch-ingest/run_arroyosketch.py index 2ee7d61..36c61d1 100644 --- a/asap-sketch-ingest/run_arroyosketch.py +++ b/asap-sketch-ingest/run_arroyosketch.py @@ -391,7 +391,12 @@ def create_pipeline( """Create a pipeline JSON based on template""" # Escape newlines in SQL query for JSON compatibility - sql_queries = [sql_query.replace("\n", "\\n") for sql_query in sql_queries] + # Escape characters that would break the JSON string in pipeline.j2. + # Double quotes appear now that column names are quoted (issue #116). + sql_queries = [ + sql_query.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n") + for sql_query in sql_queries + ] sql_query = "\\n\\n".join(sql_queries) # UDFs handling @@ -575,6 +580,21 @@ def delete_pipelines(args): # ) +def _quote_col(label_prefix: str, label: str) -> str: + """Return a SQL column reference with double-quotes for case-sensitivity. + + DataFusion (Arroyo's SQL engine) normalises unquoted identifiers to + lowercase, which silently corrupts mixed-case column names. Wrapping + every user-supplied identifier in double-quotes preserves the original + casing. + + Examples: + _quote_col("", "hostName") -> '"hostName"' + _quote_col("labels.", "hostName") -> 'labels."hostName"' + """ + return f'{label_prefix}"{label}"' + + def get_sql_query( streaming_aggregation_config: StreamingAggregationConfig, schema_config, # MetricConfig or SQLTableConfig @@ -617,12 +637,14 @@ def get_sql_query( value_column = "value" label_prefix = "labels." if use_nested_labels else "" + # Double-quote all user-supplied column names so DataFusion preserves + # their original casing (issue #116). fully_qualified_group_by_columns = [ - "{}{}".format(label_prefix, label) + _quote_col(label_prefix, label) for label in streaming_aggregation_config.labels["grouping"].keys ] fully_qualified_agg_columns = [ - "{}{}".format(label_prefix, label) + _quote_col(label_prefix, label) for label in streaming_aggregation_config.labels["aggregated"].keys ] @@ -635,9 +657,13 @@ def get_sql_query( all_labels = schema_config.config[source_identifier].keys all_labels_agg_columns = [ - "{}{}".format(label_prefix, label) for label in all_labels + _quote_col(label_prefix, label) for label in all_labels ] + # Quote the scalar column references for the same reason. + time_column = f'"{time_column}"' + value_column = f'"{value_column}"' + # Determine if timestamps should be included as argument include_timestamps_as_argument = ( streaming_aggregation_config.aggregationType == "multipleincrease" diff --git a/asap-sketch-ingest/tests/test_integration.py b/asap-sketch-ingest/tests/test_integration.py index 335ddeb..f3bf82f 100644 --- a/asap-sketch-ingest/tests/test_integration.py +++ b/asap-sketch-ingest/tests/test_integration.py @@ -225,8 +225,8 @@ def test_sql_query_uses_value_column( use_nested_labels=False, ) - assert "cpu_usage" in sql_query - assert "value" not in sql_query or "cpu_usage" in sql_query + assert '"cpu_usage"' in sql_query + assert '"value"' not in sql_query or '"cpu_usage"' in sql_query def test_sql_query_no_label_prefix( self, sql_schema_config, sql_agg_config, sql_template @@ -246,12 +246,11 @@ def test_sql_query_no_label_prefix( use_nested_labels=False, ) - # Should have flat column names, not labels.host - assert "labels.host" not in sql_query - assert "labels.region" not in sql_query - # Should have host and region directly - assert "host" in sql_query - assert "region" in sql_query + # Should have flat quoted column names, not labels.host + assert "labels." not in sql_query + # Should have double-quoted host and region directly + assert '"host"' in sql_query + assert '"region"' in sql_query class TestGetSqlQueryPromQL: @@ -310,7 +309,7 @@ def test_promql_query_uses_value( use_nested_labels=True, ) - assert "value" in sql_query + assert '"value"' in sql_query def test_promql_query_uses_label_prefix( self, promql_metric_config, promql_agg_config, sql_template @@ -330,8 +329,8 @@ def test_promql_query_uses_label_prefix( use_nested_labels=True, ) - assert "labels.instance" in sql_query - assert "labels.job" in sql_query + assert 'labels."instance"' in sql_query + assert 'labels."job"' in sql_query class TestEndToEndConfigParsing: From 49464c0b6ec7106431427e255c70b708abb635f4 Mon Sep 17 00:00:00 2001 From: Milind Srivastava Date: Sat, 7 Mar 2026 22:30:42 -0500 Subject: [PATCH 2/2] formatting --- asap-sketch-ingest/run_arroyosketch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/asap-sketch-ingest/run_arroyosketch.py b/asap-sketch-ingest/run_arroyosketch.py index 36c61d1..af0a4fc 100644 --- a/asap-sketch-ingest/run_arroyosketch.py +++ b/asap-sketch-ingest/run_arroyosketch.py @@ -656,9 +656,7 @@ def get_sql_query( source_identifier = streaming_aggregation_config.metric all_labels = schema_config.config[source_identifier].keys - all_labels_agg_columns = [ - _quote_col(label_prefix, label) for label in all_labels - ] + all_labels_agg_columns = [_quote_col(label_prefix, label) for label in all_labels] # Quote the scalar column references for the same reason. time_column = f'"{time_column}"'