Hello,

we have the great fuzzy string match, that comes up with suggestions in the 
case of a typo of a column name.

Since underscores are the de facto standard of separating words, it would also 
make sense to also generate suggestions, if the order of words gets mixed up. 
Example: If the user types timstamp_entry instead of entry_timestamp the 
suggestion shows up.

The attached patch does that for up to three segments, that are separated by 
underscores. The permutation of two segments is treated the same way a wrongly 
typed char would be.

The permutation is skipped, if the typed column name contains more than 6 
underscores to prevent a meaningful (measured on my development machine) 
slowdown, if the user types to many underscores. In terms of underscores m and 
the length of the individual strings n_att and n_col the trivial upper bound is 
O(n_att * n_col * m^2). Considering, that strings with a lot of underscores 
have a bigger likelihood of being long as well, I simply decided to add it. I 
still wonder a bit whether it should be disabled entirely (as this patch does) 
or only the swap-three sections part as the rest would bound by O(n_att * n_col 
* m). But the utility of only swapping two sections seems a bit dubious to me, 
if I have 7 or more of them.

To me this patch seems simple (if string handling in C can be called that way) 
and self contained. Despite my calculations above, it resides in a non 
performance critical piece of code. I think of it as a quality of life thing.
Let me know what you think. Thank you!

Regards
Arne

From 2f5801abe48234fade70a7238fe2a1d1f2c5813d Mon Sep 17 00:00:00 2001
From: Arne Roland <arne.rol...@index.de>
Date: Fri, 6 Jan 2023 22:23:37 +0100
Subject: [PATCH] fuzzy_underscore_permutation

---
 src/backend/parser/parse_relation.c | 103 +++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 23 deletions(-)

diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index 5389a0eddb..f9347792eb 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -579,32 +579,13 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup)
 	return NULL;				/* keep compiler quiet */
 }
 
-/*
- * updateFuzzyAttrMatchState
- *	  Using Levenshtein distance, consider if column is best fuzzy match.
- */
 static void
-updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
-						  FuzzyAttrMatchState *fuzzystate, RangeTblEntry *rte,
-						  const char *actual, const char *match, int attnum)
+updateFuzzyAttrMatchStateSingleString(int fuzzy_rte_penalty,
+							FuzzyAttrMatchState *fuzzystate, RangeTblEntry *rte,
+							const char *actual, const char *match, int attnum, int matchlen)
 {
-	int			columndistance;
-	int			matchlen;
-
-	/* Bail before computing the Levenshtein distance if there's no hope. */
-	if (fuzzy_rte_penalty > fuzzystate->distance)
-		return;
-
-	/*
-	 * Outright reject dropped columns, which can appear here with apparent
-	 * empty actual names, per remarks within scanRTEForColumn().
-	 */
-	if (actual[0] == '\0')
-		return;
-
 	/* Use Levenshtein to compute match distance. */
-	matchlen = strlen(match);
-	columndistance =
+	int columndistance =
 		varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen,
 									  1, 1, 1,
 									  fuzzystate->distance + 1
@@ -667,6 +648,82 @@ updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
 	}
 }
 
+/*
+ * updateFuzzyAttrMatchState
+ *	  Using Levenshtein distance, consider if column is best fuzzy match.
+ */
+static void
+updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
+						FuzzyAttrMatchState *fuzzystate, RangeTblEntry *rte,
+						const char *actual, const char *match, int attnum)
+{
+	/* Memory segment to store the current permutation of the match string. */
+	char* tmp_match;
+	/*
+	 * We count the number of underscores here, because we want to know whether we should consider
+	 * permuting underscore separated sections.
+	 */
+	int underscore_count = 0;
+	int matchlen		 = strlen(match);
+	/* We check for the amounts of underscores first, since updateFuzzyAttrMatchState has already quadratic run time. */
+	for (int i = 0; i < matchlen; i++) {
+		if (match[i] == '_')
+			underscore_count++;
+	}
+
+	/* Bail before computing the Levenshtein distance if there's no hope. */
+	if (fuzzy_rte_penalty > fuzzystate->distance)
+		return;
+
+	/*
+	 * Outright reject dropped columns, which can appear here with apparent
+	 * empty actual names, per remarks within scanRTEForColumn().
+	 */
+	if (actual[0] == '\0')
+		return;
+
+	updateFuzzyAttrMatchStateSingleString(fuzzy_rte_penalty, fuzzystate, rte, actual, match, attnum, matchlen);
+	/*
+	 * If told to, check for permuting up to three sections separated by underscores.
+	 */
+	if (underscore_count && underscore_count <= 6) {
+			tmp_match = palloc(matchlen);
+			tmp_match[matchlen] = '\0';
+			for (int i = 1; i < matchlen - 1; i++) {
+				if (match[i] == '_') {
+					/* Consider swapping two sections. */
+					memcpy(tmp_match, &match[i + 1], matchlen - i - 1);
+					tmp_match[matchlen - i - 1] = '_';
+					memcpy(&tmp_match[matchlen - i + 1], match, i);
+					updateFuzzyAttrMatchStateSingleString(fuzzy_rte_penalty + 1, fuzzystate, rte, actual, tmp_match, attnum, matchlen);
+					/* Consider swapping three sections. */
+					for (int j = i + 2; j < matchlen - 1; j++) {
+						if (match[j] == '_') {
+							/*
+							 * Only consider mirroring permutations, since the three simple rotations are already
+							 * (or will be for a later i) covered above.
+							 */
+							int permutation_matrix[3][3] = {{j - i, 0, j + 1},
+									{matchlen - i, matchlen - j, 0},
+									{0, matchlen - j + i + 1, i + 1}};
+							for (int k = 0; k < 3; k++) {
+								memcpy(&tmp_match[permutation_matrix[k][0]], match, i);
+								tmp_match[permutation_matrix[k][0] + i - 1 + 1] = '_';
+								memcpy(&tmp_match[permutation_matrix[k][1]], &match[i + 1], j - i - 1);
+								tmp_match[permutation_matrix[k][1] + j - i - 1] = '_';
+								memcpy(&tmp_match[permutation_matrix[k][2]], &match[j + 1], matchlen - j - 1);
+								tmp_match[permutation_matrix[k][2] + matchlen - j - 1] = '_';
+								tmp_match[matchlen] = '\0';
+								updateFuzzyAttrMatchStateSingleString(fuzzy_rte_penalty + 1, fuzzystate, rte, actual, tmp_match, attnum, matchlen);
+							}
+						}
+					}
+				}
+			}
+			pfree(tmp_match);
+		}
+}
+
 /*
  * scanNSItemForColumn
  *	  Search the column names of a single namespace item for the given name.
-- 
2.35.3

Reply via email to