[ https://issues.apache.org/jira/browse/DRILL-5450?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15992350#comment-15992350 ]
ASF GitHub Bot commented on DRILL-5450: --------------------------------------- Github user paul-rogers commented on a diff in the pull request: https://github.com/apache/drill/pull/821#discussion_r114249116 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionHelpers.java --- @@ -144,41 +144,28 @@ public static int varTypesToInt(final int start, final int end, DrillBuf buffer) return result; } - // Assumes Alpha as [A-Za-z0-9] - // white space is treated as everything else. + /** + * Capitalizes first letter in each word. + * Any symbol except digits and letters is considered as word delimiter. + * + * @param start start position in input buffer + * @param end end position in input buffer + * @param inBuf buffer with input characters + * @param outBuf buffer with output characters + */ public static void initCap(int start, int end, DrillBuf inBuf, DrillBuf outBuf) { - boolean capNext = true; + boolean capitalizeNext = true; int out = 0; for (int id = start; id < end; id++, out++) { - byte currentByte = inBuf.getByte(id); - - // 'A - Z' : 0x41 - 0x5A - // 'a - z' : 0x61 - 0x7A - // '0-9' : 0x30 - 0x39 - if (capNext) { // curCh is whitespace or first character of word. - if (currentByte >= 0x30 && currentByte <= 0x39) { // 0-9 - capNext = false; - } else if (currentByte >= 0x41 && currentByte <= 0x5A) { // A-Z - capNext = false; - } else if (currentByte >= 0x61 && currentByte <= 0x7A) { // a-z - capNext = false; - currentByte -= 0x20; // Uppercase this character - } - // else {} whitespace - } else { // Inside of a word or white space after end of word. 
- if (currentByte >= 0x30 && currentByte <= 0x39) { // 0-9 - // noop - } else if (currentByte >= 0x41 && currentByte <= 0x5A) { // A-Z - currentByte -= 0x20; // Lowercase this character - } else if (currentByte >= 0x61 && currentByte <= 0x7A) { // a-z - // noop - } else { // whitespace - capNext = true; - } + int currentByte = inBuf.getByte(id); --- End diff -- This code works only for ASCII, but not for UTF-8. UTF-8 is a multi-byte encoding that requires special encoding/decoding to convert to Unicode characters. Without that decoding, this method won't work for Cyrillic, Greek or any other character set with upper/lower distinctions. Since this method never worked, it is probably OK to make it a bit less broken than before: at least now it works for ASCII. Please add unit tests below, then file a JIRA for the fact that this function does not work with UTF-8 despite the fact that Drill claims it supports UTF-8. > Fix initcap function to convert upper case characters correctly > --------------------------------------------------------------- > > Key: DRILL-5450 > URL: https://issues.apache.org/jira/browse/DRILL-5450 > Project: Apache Drill > Issue Type: Bug > Components: Functions - Drill > Affects Versions: 1.10.0 > Reporter: Arina Ielchiieva > Assignee: Arina Ielchiieva > > The initcap function incorrectly converts subsequent upper case characters after the > first character. > {noformat} > 0: jdbc:drill:zk=local> select initcap('aaa') from (values(1)); > +---------+ > | EXPR$0 | > +---------+ > | Aaa | > +---------+ > 1 row selected (0.275 seconds) > 0: jdbc:drill:zk=local> select initcap('AAA') from (values(1)); > +---------+ > | EXPR$0 | > +---------+ > | A!! | > +---------+ > 1 row selected (0.27 seconds) > 0: jdbc:drill:zk=local> select initcap('aAa') from (values(1)); > +---------+ > | EXPR$0 | > +---------+ > | A!a | > +---------+ > 1 row selected (0.229 seconds) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.15#6346)