From 3482be30561505e664f623c9e821869488daa896 Mon Sep 17 00:00:00 2001
From: Parker Timmerman <parker.timmerman@materialize.com>
Date: Wed, 18 Oct 2023 11:46:43 -0400
Subject: [PATCH] sql: Suggest other columns names when one can't be found
 (#22446)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR updates `PlanError::UnknownColumn` to suggest similarly named
columns if the one specified cannot be found.

### Motivation

Closes https://github.com/MaterializeInc/materialize/issues/22427

### Checklist

- [ ] This PR has adequate test coverage / QA involvement has been duly
considered.
- [ ] This PR has an associated up-to-date [design
doc](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/README.md),
is a design doc
([template](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/design/00000000_template.md)),
or is sufficiently small to not require a design.
  <!-- Reference the design in the description. -->
- [ ] If this PR evolves [an existing `$T ⇔ Proto$T`
mapping](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/command-and-response-binary-encoding.md)
(possibly in a backwards-incompatible way), then it is tagged with a
`T-proto` label.
- [ ] If this PR will require changes to cloud orchestration or tests,
there is a companion cloud PR to account for those changes that is
tagged with the release-blocker label
([example](https://github.com/MaterializeInc/cloud/pull/5021)).
<!-- Ask in #team-cloud on Slack if you need help preparing the cloud
PR. -->
- [x] This PR includes the following [user-facing behavior
changes](https://github.com/MaterializeInc/materialize/blob/main/doc/developer/guide-changes.md#what-changes-require-a-release-note):
  - Updates an error message to be more helpful.
---
 src/repr/src/relation.rs                      | 14 +++++++
 src/sql/src/names.rs                          |  2 +
 src/sql/src/plan/error.rs                     | 14 ++++++-
 src/sql/src/plan/query.rs                     | 32 +++++++++-----
 src/sql/src/plan/scope.rs                     | 34 +++++++++++----
 src/sql/src/plan/statement/dml.rs             |  1 +
 src/sql/src/pure/postgres.rs                  | 12 +++++-
 test/pg-cdc/pg-cdc.td                         | 11 +++++
 .../cockroach/case_sensitive_names.slt        | 42 ++++++++++++++++++-
 9 files changed, 138 insertions(+), 24 deletions(-)
diff --git a/src/repr/src/relation.rs b/src/repr/src/relation.rs
index ba8ea1918e1bf..390d8e4a23903 100644
--- a/src/repr/src/relation.rs
+++ b/src/repr/src/relation.rs
@@ -289,6 +289,11 @@ impl ColumnName {
     pub fn as_mut_str(&mut self) -> &mut String {
         &mut self.0
     }
+
+    /// Returns if this [`ColumnName`] is similar to the provided one.
+    pub fn is_similar(&self, other: &ColumnName) -> bool {
+        self.0.to_lowercase() == other.as_str().to_lowercase()
+    }
 }
 
 impl fmt::Display for ColumnName {
@@ -496,6 +501,15 @@ impl RelationDesc {
         self.names.iter()
     }
 
+    /// Returns an iterator over the names of the columns in this relation that are "similar" to
+    /// the provided `name`.
+    pub fn iter_similar_names<'a>(
+        &'a self,
+        name: &'a ColumnName,
+    ) -> impl Iterator<Item = &'a ColumnName> {
+        self.iter_names().filter(|n| n.is_similar(name))
+    }
+
     /// Finds a column by name.
     ///
     /// Returns the index and type of the column named `name`. If no column with
diff --git a/src/sql/src/names.rs b/src/sql/src/names.rs
index ec294623e86cd..04ef3372c9022 100644
--- a/src/sql/src/names.rs
+++ b/src/sql/src/names.rs
@@ -1764,9 +1764,11 @@ impl<'a> Fold<Raw, Aug> for NameResolver<'a> {
                 let name = normalize::column_name(column_name.column);
                 let Some((index, _typ)) = desc.get_by_name(&name) else {
                     if self.status.is_ok() {
+                        let similar = desc.iter_similar_names(&name).cloned().collect();
                         self.status = Err(PlanError::UnknownColumn {
                             table: Some(full_name.clone().into()),
                             column: name,
+                            similar,
                         })
                     }
                     return ResolvedColumnName::Error;
diff --git a/src/sql/src/plan/error.rs b/src/sql/src/plan/error.rs
index 0605fb9fa31dd..11906b4db21da 100644
--- a/src/sql/src/plan/error.rs
+++ b/src/sql/src/plan/error.rs
@@ -58,6 +58,7 @@ pub enum PlanError {
     UnknownColumn {
         table: Option<PartialItemName>,
         column: ColumnName,
+        similar: Box<[ColumnName]>,
     },
     UngroupedColumn {
         table: Option<PartialItemName>,
@@ -340,6 +341,17 @@ impl PlanError {
             Self::KafkaSourcePurification(e) => e.hint(),
             Self::TestScriptSourcePurification(e) => e.hint(),
             Self::LoadGeneratorSourcePurification(e) => e.hint(),
+            Self::UnknownColumn { table, similar, .. } => {
+                let suffix = "Make sure to surround case sensitive names in double quotes.";
+                match &similar[..] {
+                    [] => None,
+                    [column] => Some(format!("The similarly named column {} does exist. {suffix}", ColumnDisplay { table, column })),
+                    names => {
+                        let similar = names.into_iter().map(|column| ColumnDisplay { table, column }).join(", ");
+                        Some(format!("There are similarly named columns that do exist: {similar}. {suffix}"))
+                    }
+                }
+            }
             _ => None,
         }
     }
@@ -362,7 +374,7 @@ impl fmt::Display for PlanError {
                 }
                 Ok(())
             }
-            Self::UnknownColumn { table, column } => write!(
+            Self::UnknownColumn { table, column, similar: _ } => write!(
                 f,
                 "column {} does not exist",
                 ColumnDisplay { table, column }
diff --git a/src/sql/src/plan/query.rs b/src/sql/src/plan/query.rs
index c168b6f9c70d9..d8817e72d80f1 100644
--- a/src/sql/src/plan/query.rs
+++ b/src/sql/src/plan/query.rs
@@ -2104,14 +2104,20 @@ fn plan_view_select(
     // Checks if an unknown column error was the result of not including that
     // column in the GROUP BY clause and produces a friendlier error instead.
     let check_ungrouped_col = |e| match e {
-        PlanError::UnknownColumn { table, column } => {
-            match from_scope.resolve(&qcx.outer_scopes, table.as_ref(), &column) {
-                Ok(ColumnRef { level: 0, column }) => {
-                    PlanError::ungrouped_column(&from_scope.items[column])
-                }
-                _ => PlanError::UnknownColumn { table, column },
+        PlanError::UnknownColumn {
+            table,
+            column,
+            similar,
+        } => match from_scope.resolve(&qcx.outer_scopes, table.as_ref(), &column) {
+            Ok(ColumnRef { level: 0, column }) => {
+                PlanError::ungrouped_column(&from_scope.items[column])
             }
-        }
+            _ => PlanError::UnknownColumn {
+                table,
+                column,
+                similar,
+            },
+        },
         e => e,
     };
 
@@ -2453,6 +2459,7 @@ fn plan_group_by_expr<'a>(
             Err(PlanError::UnknownColumn {
                 table: None,
                 column,
+                similar,
             }) => {
                 // The expression was a simple identifier that did not match an
                 // input column. See if it matches an output column.
@@ -2469,6 +2476,7 @@ fn plan_group_by_expr<'a>(
                     Err(PlanError::UnknownColumn {
                         table: None,
                         column,
+                        similar,
                     })
                 }
             }
@@ -4627,12 +4635,13 @@ fn plan_identifier(ecx: &ExprContext, names: &[Ident]) -> Result<HirScalarExpr,
         return Ok(HirScalarExpr::Column(i));
     }
 
-    // If the name is unqualified, first check if it refers to a column.
-    match ecx.scope.resolve_column(&ecx.qcx.outer_scopes, &col_name) {
+    // If the name is unqualified, first check if it refers to a column. Track any similar names
+    // that might exist for a better error message.
+    let similar_names = match ecx.scope.resolve_column(&ecx.qcx.outer_scopes, &col_name) {
         Ok(i) => return Ok(HirScalarExpr::Column(i)),
-        Err(PlanError::UnknownColumn { .. }) => (),
+        Err(PlanError::UnknownColumn { similar, .. }) => similar,
         Err(e) => return Err(e),
-    }
+    };
 
     // The name doesn't refer to a column. Check if it is a whole-row reference
     // to a table.
@@ -4649,6 +4658,7 @@ fn plan_identifier(ecx: &ExprContext, names: &[Ident]) -> Result<HirScalarExpr,
         [] => Err(PlanError::UnknownColumn {
             table: None,
             column: col_name,
+            similar: similar_names,
         }),
         // The name refers to a table that is the result of a function that
         // returned a single column. Per PostgreSQL, this is a special case
diff --git a/src/sql/src/plan/scope.rs b/src/sql/src/plan/scope.rs
index d8debca360e5d..4beecd584793c 100644
--- a/src/sql/src/plan/scope.rs
+++ b/src/sql/src/plan/scope.rs
@@ -269,6 +269,10 @@ impl Scope {
         Ok(items)
     }
 
+    /// Returns a matching [`ColumnRef`], if one exists.
+    ///
+    /// Filters all visible items against the provided `matches` closure, and then matches this
+    /// filtered set against the provided `column_name`.
     fn resolve_internal<'a, M>(
         &'a self,
         outer_scopes: &[Scope],
@@ -281,12 +285,26 @@ impl Scope {
     {
         let mut results = self
             .all_items(outer_scopes)
-            .filter(|(column, lat_level, item)| (matches)(*column, *lat_level, item));
+            .filter(|(column, lat_level, item)| {
+                (matches)(*column, *lat_level, item) && item.column_name == *column_name
+            });
         match results.next() {
-            None => Err(PlanError::UnknownColumn {
-                table: table_name.cloned(),
-                column: column_name.clone(),
-            }),
+            None => {
+                let similar = self
+                    .all_items(outer_scopes)
+                    .filter(|(column, lat_level, item)| (matches)(*column, *lat_level, item))
+                    .filter_map(|(_col, _lat, item)| {
+                        item.column_name
+                            .is_similar(column_name)
+                            .then(|| item.column_name.clone())
+                    })
+                    .collect();
+                Err(PlanError::UnknownColumn {
+                    table: table_name.cloned(),
+                    column: column_name.clone(),
+                    similar,
+                })
+            }
             Some((column, lat_level, item)) => {
                 if results
                     .find(|(_column, lat_level2, _item)| lat_level == *lat_level2)
@@ -321,9 +339,7 @@ impl Scope {
         let table_name = None;
         self.resolve_internal(
             outer_scopes,
-            |_column, _lat_level, item| {
-                item.allow_unqualified_references && item.column_name == *column_name
-            },
+            |_column, _lat_level, item| item.allow_unqualified_references,
             table_name,
             column_name,
         )
@@ -368,7 +384,7 @@ impl Scope {
                 }
                 if item.table_name.as_ref().map(|n| n.matches(table_name)) == Some(true) {
                     seen_at_level = Some(lat_level);
-                    item.column_name == *column_name
+                    true
                 } else {
                     false
                 }
diff --git a/src/sql/src/plan/statement/dml.rs b/src/sql/src/plan/statement/dml.rs
index 68d1349b6bb44..30afe6459a3bf 100644
--- a/src/sql/src/plan/statement/dml.rs
+++ b/src/sql/src/plan/statement/dml.rs
@@ -673,6 +673,7 @@ pub fn plan_subscribe(
                 Err(PlanError::UnknownColumn {
                     table: None,
                     column,
+                    similar: _,
                 }) if &column == &mz_diff => {
                     // mz_diff is being used in an expression. Since mz_diff isn't part of the table
                     // it looks like an unknown column. Instead, return a better error
diff --git a/src/sql/src/pure/postgres.rs b/src/sql/src/pure/postgres.rs
index a708edfddc189..5d3f8bad43327 100644
--- a/src/sql/src/pure/postgres.rs
+++ b/src/sql/src/pure/postgres.rs
@@ -146,6 +146,15 @@ pub(super) fn generate_text_columns(
                 })?;
 
         if !desc.columns.iter().any(|column| column.name == col) {
+            let column = mz_repr::ColumnName::from(col);
+            let similar = desc
+                .columns
+                .iter()
+                .filter_map(|c| {
+                    let c_name = mz_repr::ColumnName::from(c.name.clone());
+                    c_name.is_similar(&column).then_some(c_name)
+                })
+                .collect();
             return Err(PlanError::InvalidOptionValue {
                 option_name: option_name.to_string(),
                 err: Box::new(PlanError::UnknownColumn {
@@ -153,7 +162,8 @@ pub(super) fn generate_text_columns(
                         normalize::unresolved_item_name(fully_qualified_name)
                             .expect("known to be of valid len"),
                     ),
-                    column: mz_repr::ColumnName::from(col),
+                    column,
+                    similar,
                 }),
             });
         }
diff --git a/test/pg-cdc/pg-cdc.td b/test/pg-cdc/pg-cdc.td
index db211d3018e02..e14f5f92a0fc3 100644
--- a/test/pg-cdc/pg-cdc.td
+++ b/test/pg-cdc/pg-cdc.td
@@ -599,6 +599,17 @@ DELETE FROM conflict_schema.conflict_table;
   );
 contains: invalid TEXT COLUMNS option value: column "pk_table.col_dne" does not exist
 
+! CREATE SOURCE case_sensitive_names
+  FROM POSTGRES CONNECTION pg (
+    PUBLICATION 'mz_source',
+    TEXT COLUMNS [pk_table."F2"]
+  )
+  FOR TABLES (
+    "enum_table"
+  );
+contains: invalid TEXT COLUMNS option value: column "pk_table.F2" does not exist
+hint: The similarly named column "pk_table.f2" does exist.
+
 ! CREATE SOURCE enum_source
   FROM POSTGRES CONNECTION pg (
     PUBLICATION 'mz_source',
diff --git a/test/sqllogictest/cockroach/case_sensitive_names.slt b/test/sqllogictest/cockroach/case_sensitive_names.slt
index c1859293d3141..646a516fc75a9 100644
--- a/test/sqllogictest/cockroach/case_sensitive_names.slt
+++ b/test/sqllogictest/cockroach/case_sensitive_names.slt
@@ -204,10 +204,10 @@ SELECT x, X, "Y" FROM foo
 ----
 x x Y
 
-statement error column "X" does not exist
+statement error db error: ERROR: column "X" does not exist\nHINT: The similarly named column "x" does exist.
 SELECT "X" FROM foo
 
-statement error column "y" does not exist
+statement error column "y" does not exist\nHINT: The similarly named column "Y" does exist.
 SELECT Y FROM foo
 
 # The following should not be ambiguous.
@@ -216,6 +216,44 @@ SELECT Y, "Y" FROM (SELECT x as y, "Y" FROM foo)
 ----
 y Y
 
+statement ok
+CREATE TABLE foo_multi_case ("cAsE" INT, "CASE" INT)
+
+statement error ERROR: column "foo_multi_case.case" does not exist\nHINT: There are similarly named columns that do exist: "foo_multi_case.cAsE", "foo_multi_case.CASE". Make sure to surround case sensitive names in double quotes.
+SELECT foo_multi_case.case FROM foo_multi_case;
+
+statement ok
+CREATE TABLE j1 ("X" int);
+
+statement ok
+INSERT INTO j1 VALUES (1), (2), (3);
+
+statement ok
+CREATE TABLE j2 ("X" int, "Y" int);
+
+statement ok
+INSERT INTO j2 VALUES (1, 5), (2, 7), (8, 9);
+
+statement ok
+CREATE TABLE j3 ("X" int, "Z" int);
+
+statement ok
+INSERT INTO j3 VALUES (1, 7), (10, 11), (12, 13);
+
+statement error db error: ERROR: column "Y" does not exist\nHINT: The similarly named column "y" does exist.
+SELECT "Y" FROM ( SELECT "X" as y FROM j1 NATURAL JOIN j2 );
+
+query I
+SELECT y FROM ( SELECT "X" as y FROM j1 NATURAL JOIN j2 );
+----
+1
+2
+
+# Even though the column "Y" exists, it should not get suggested since it's not in scope.
+
+statement error db error: ERROR: column "y" does not exist
+SELECT "X", y FROM j1 LEFT JOIN ( SELECT "X", "Z" FROM j2 LEFT JOIN j3 USING ("X") ) USING ("X");
+
 # Case sensitivity of view names.
 
 mode standard