In the ansic* functions, if HANDLE_MULTIBYTE is defined, we end up
calling mbrtowc for non-"basic" characters even in single-byte locales.
While this is not incorrect (Android's broken implementation aside),
we can optimize by being more selective about when to call mbrtowc.
For the right combination of input and encoding, these changes can
speed up ansicstr by ~45% and ansic_quote by ~20%.
lib/sh/strtrans.c
- ansicstr: skip mbrtowc if we know that the encoding does not have
backslash as part of a multibyte character
- ansic_quote, ansic_shouldquote: tighten check for potential start
of a multibyte character
---
lib/sh/strtrans.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/lib/sh/strtrans.c b/lib/sh/strtrans.c
index 586dc9f7..b2ab314e 100644
--- a/lib/sh/strtrans.c
+++ b/lib/sh/strtrans.c
@@ -79,8 +79,11 @@ ansicstr (const char *string, size_t len, int flags, int *sawc, size_t *rlen)
{
clen = 1;
#if defined (HANDLE_MULTIBYTE)
- if ((locale_utf8locale && (c & 0x80)) ||
- (locale_utf8locale == 0 && mb_cur_max > 0 && is_basic (c) == 0))
+ /* We only care about multibyte character boundaries so that we do
+ not treat as the start of an escape sequence a '\\' byte that is
+ part of a multibyte character. If we know this is not possible
+ in the current encoding, skip calling mbrtowc. */
+ if (locale_utf8locale == 0 && mb_cur_max > 1 && is_basic (c) == 0)
{
clen = mbrtowc (&wc, s - 1, mb_cur_max, 0);
if (MB_NULLWCH (clen))
@@ -262,9 +265,10 @@ ansic_quote (const char *str, int flags, int *rlen)
break;
default:
#if defined (HANDLE_MULTIBYTE)
- if (is_basic (c) == 0)
+ if ((locale_utf8locale && (c & 0x80)) ||
+ (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0))
{
- clen = mbrtowc (&wc, s, MB_CUR_MAX, &state);
+ clen = mbrtowc (&wc, s, locale_mb_cur_max, &state);
if (MB_NULLWCH (clen))
goto quote_end;
if (MB_INVALIDCH (clen))
@@ -346,7 +350,8 @@ ansic_shouldquote (const char *string)
for (s = string; c = *s; s++)
{
#if defined (HANDLE_MULTIBYTE)
- if (is_basic (c) == 0)
+ if ((locale_utf8locale && (c & 0x80)) ||
+ (locale_utf8locale == 0 && locale_mb_cur_max > 1 && is_basic (c) == 0))
return (ansic_wshouldquote (s));
#endif
if (ISPRINT (c) == 0)
--
2.51.1