From 325f8918b5fc03097ce6397f023738fbbe41546b Mon Sep 17 00:00:00 2001
From: Hunaid Sohail <hunaid2000@gmail.com>
Date: Sat, 7 Sep 2024 14:14:11 +0500
Subject: [PATCH v3] Add RN/rn support for to_number function

---
 doc/src/sgml/func.sgml                |  11 ++-
 src/backend/utils/adt/formatting.c    | 135 +++++++++++++++++++++++++-
 src/backend/utils/errcodes.txt        |   1 +
 src/test/regress/expected/numeric.out |  27 ++++++
 src/test/regress/sql/numeric.sql      |  14 +++
 5 files changed, 183 insertions(+), 5 deletions(-)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e8adf28940 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8628,7 +8628,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (valid for numbers 1 to 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8754,6 +8754,15 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+    <listitem>
+      <para>
+        In <function>to_number</function>, <literal>RN</literal> pattern converts
+        roman numerals to standard numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>,
+        <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>).
+        When using <literal>RN</literal>, other format elements are ignored.
+      </para>
+    </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68069fcfd3..fb12bf91d0 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -270,6 +269,29 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1074,6 +1096,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -5236,6 +5259,99 @@ int_to_roman(int number)
 }
 
 
+static int
+roman_to_int(char* s, int len)
+{
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	if (len == 0 || len > 15)
+		return 0;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return 0;
+
+		/* Ensure no character greater than or equal to the subtracted
+		 * character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return 0;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return 0;
+
+		if (i < len - 1)
+		{
+			char nextChar = toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return 0;
+
+			/* If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V') vCount++;
+				if (nextChar == 'L') lCount++;
+				if (nextChar == 'D') dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return 0;
+
+				/* for cases where the same character is repeated
+				 * with subtraction. Like 'MCCM' or 'DCCCD'.
+				 */
+				if (repeatCount > 1)
+					return 0;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return 0;
+
+				/* Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* for same characters, check for repetition */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return 0;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* add the value of the last character */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5787,6 +5903,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 			   *Np = &_Np;
 	const char *pattern;
 	int			pattern_len;
+	int			roman_result;
 
 	MemSet(Np, 0, sizeof(NUMProc));
 
@@ -5817,9 +5934,19 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			roman_result = roman_to_int(inout, input_len);
+			if (roman_result == 0)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ROMAN_NUMERAL),
+					 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt
index b43a24d4bc..a64188da82 100644
--- a/src/backend/utils/errcodes.txt
+++ b/src/backend/utils/errcodes.txt
@@ -201,6 +201,7 @@ Section: Class 22 - Data Exception
 22P03    E    ERRCODE_INVALID_BINARY_REPRESENTATION                          invalid_binary_representation
 22P04    E    ERRCODE_BAD_COPY_FILE_FORMAT                                   bad_copy_file_format
 22P05    E    ERRCODE_UNTRANSLATABLE_CHARACTER                               untranslatable_character
+22P07    E    ERRCODE_INVALID_ROMAN_NUMERAL                                  invalid_roman_numeral
 2200L    E    ERRCODE_NOT_AN_XML_DOCUMENT                                    not_an_xml_document
 2200M    E    ERRCODE_INVALID_XML_DOCUMENT                                   invalid_xml_document
 2200N    E    ERRCODE_INVALID_XML_CONTENT                                    invalid_xml_content
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index f30ac236f5..b29747034a 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2297,6 +2297,33 @@ SELECT to_number('42nd', '99th');
         42
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('viv', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('', 'RN');
+ERROR:  invalid roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index c86395209a..b3a09758ff 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1070,6 +1070,20 @@ SELECT to_number('$1,234.56','L99,999.99');
 SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('', 'RN');
 RESET lc_numeric;
 
 --
-- 
2.34.1