Re: [PATCH v2 7/7] udf: Merge linux specific translation into CS0 conversion function

Jan Kara Mon, 04 Jan 2016 05:25:23 -0800

On Thu 24-12-15 10:25:38, Andrew Gabbasov wrote:
> Current implementation of udf_translate_to_linux function does not
> support multi-bytes characters at all: it counts bytes while calculating
> extension length, when inserting CRC inside the name it doesn't
> take into account inter-character boundaries and can break into
> the middle of the character.
> 
> The most efficient way to properly support multi-bytes characters is
> merging of translation operations directly into conversion function.
> This can help to avoid extra passes along the string or parsing
> the multi-bytes character back into unicode to find out it's length.
> 
> Signed-off-by: Andrew Gabbasov <[email protected]>


Looks mostly good. A few comments below.

> ---
>  fs/udf/unicode.c | 260 
> ++++++++++++++++++++++++++++++-------------------------
>  1 file changed, 141 insertions(+), 119 deletions(-)
> 
> diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
> index f1cdeac..1dc967d 100644
> --- a/fs/udf/unicode.c
> +++ b/fs/udf/unicode.c
> @@ -28,9 +28,6 @@
>  
>  #include "udf_sb.h"
>  
> -static int udf_translate_to_linux(uint8_t *, int, const uint8_t *, int,
> -                               const uint8_t *, int);
> -
>  static int udf_uni2char_utf8(wchar_t uni,
>                            unsigned char *out,
>                            int boundlen)
> @@ -114,13 +111,32 @@ static int udf_char2uni_utf8(const unsigned char *in,
>       return u_len;
>  }
>  
> +#define ILLEGAL_CHAR_MARK    '_'
> +#define EXT_MARK             '.'
> +#define CRC_MARK             '#'
> +#define EXT_SIZE             5
> +/* Number of chars we need to store generated CRC to make filename unique */
> +#define CRC_LEN                      5
> +
>  static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
>                            const uint8_t *ocu, int ocu_len,
> -                          int (*conv_f)(wchar_t, unsigned char *, int))
> +                          int (*conv_f)(wchar_t, unsigned char *, int),
> +                          int translate)
>  {
> +     uint32_t c;
>       uint8_t cmp_id;
>       int i, len;
> -     int str_o_len = 0;
> +     int u_ch;
> +     int firstDots = 0, needsCRC = 0, illChar;
> +     int ext_i_len, ext_max_len;
> +     int str_o_len = 0;      /* Length of resulting output */
> +     int ext_o_len = 0;      /* Extension output length */
> +     int ext_crc_len = 0;    /* Extension output length if used with CRC */
> +     int i_ext = -1;         /* Extension position in input buffer */
> +     int o_crc = 0;          /* Rightmost possible output pos for CRC+ext */
> +     unsigned short valueCRC;
> +     uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
> +     uint8_t crc[CRC_LEN];
>  
>       if (str_max_len <= 0)
>               return 0;
> @@ -133,22 +149,134 @@ static int udf_name_from_CS0(uint8_t *str_o, int 
> str_max_len,
>       cmp_id = ocu[0];
>       if (cmp_id != 8 && cmp_id != 16) {
>               memset(str_o, 0, str_max_len);
> -             pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu);
> +             pr_err("unknown compression code (%d)\n", cmp_id);
>               return -EINVAL;
>       }
> +     u_ch = cmp_id >> 3;
> +
> +     ocu++;
> +     ocu_len--;
> +

Can we just check whether ocu_len is a multiple of u_ch here and bail out
if not? That would save us some rounding below and also fix possible access
beyond the end of the string in case fs is corrupted.

> +     if (translate) {
> +             /* Look for extension */
> +             for (i = (ocu_len & ~(u_ch - 1)) - u_ch, ext_i_len = 0;
> +                  (i >= 0) && (ext_i_len < EXT_SIZE);
> +                  i -= u_ch, ext_i_len++) {
> +

Just remove the empty line above please.

> +                     c = ocu[i];
> +                     if (u_ch > 1)
> +                             c = (c << 8) | ocu[i + 1];
> +
> +                     if (c == EXT_MARK) {
> +                             if (ext_i_len)
> +                                     i_ext = i;
> +                             break;
> +                     }
> +             }
> +             if (i_ext >= 0) {
> +                     /* Convert extension */
> +                     ext_max_len = min_t(int, sizeof(ext), str_max_len);
> +                     ext[ext_o_len++] = EXT_MARK;
> +                     illChar = 0;
> +                     for (i = i_ext + u_ch; i < ocu_len;) {
> +

Remove the empty line here please. Also how about using i += u_ch in the
for() above instead of i++ in the two lines below?

> +                             c = ocu[i++];
> +                             if (u_ch > 1)
> +                                     c = (c << 8) | ocu[i++];
> +
> +                             if (c == '/' || c == 0) {
> +                                     if (illChar)
> +                                             continue;
> +                                     illChar = 1;
> +                                     needsCRC = 1;
> +                                     c = ILLEGAL_CHAR_MARK;
> +                             } else {
> +                                     illChar = 0;
> +                             }
> +
> +                             len = conv_f(c, &ext[ext_o_len],
> +                                          ext_max_len - ext_o_len);
> +                             /* Valid character? */
> +                             if (len >= 0) {
> +                                     ext_o_len += len;
> +                             } else {
> +                                     ext[ext_o_len++] = '?';
> +                                     needsCRC = 1;
> +                             }
> +                             if ((ext_o_len + CRC_LEN) < str_max_len)
> +                                     ext_crc_len = ext_o_len;
> +                     }
> +             }
> +     }
> +
> +     illChar = 0;
> +     for (i = 0; i < ocu_len;) {
> +

Again, remove the empty line, i += u_ch.

> +             if (str_o_len >= str_max_len) {
> +                     needsCRC = 1;
> +                     break;
> +             }
> +
> +             if (translate && (i == i_ext)) {
> +                     if (str_o_len > (str_max_len - ext_o_len))
> +                             needsCRC = 1;
> +                     break;
> +             }
>  
> -     for (i = 1; (i < ocu_len) && (str_o_len < str_max_len);) {
>               /* Expand OSTA compressed Unicode to Unicode */
> -             uint32_t c = ocu[i++];
> -             if (cmp_id == 16)
> +             c = ocu[i++];
> +             if (u_ch > 1)
>                       c = (c << 8) | ocu[i++];
>  
> +             if (translate) {
> +                     if ((c == '.') && (firstDots >= 0))
> +                             firstDots++;
> +                     else
> +                             firstDots = -1;
> +
> +                     if (c == '/' || c == 0) {
> +                             if (illChar)
> +                                     continue;
> +                             illChar = 1;
> +                             needsCRC = 1;
> +                             c = ILLEGAL_CHAR_MARK;
> +                     } else {
> +                             illChar = 0;
> +                     }
> +             }
> +
>               len = conv_f(c, &str_o[str_o_len], str_max_len - str_o_len);
>               /* Valid character? */
> -             if (len >= 0)
> +             if (len >= 0) {
>                       str_o_len += len;
> -             else
> +             } else {
>                       str_o[str_o_len++] = '?';
> +                     needsCRC = 1;
> +             }

Umm, can you try factoring out body of this loop into a helper function and
using it both during extension parsing and full name parsing? Also I think
that checking whether the name is '.' or '..' could be moved just into the
if (translate) below where you could just directly check it like:
        if (str_o_len <= 2 && str_o[0] == '.' &&
            (str_o_len == 1 || str_o[1] == '.'))

> +             if (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))
> +                     o_crc = str_o_len;
> +     }
> +
> +     if (translate) {
> +             if ((firstDots == 1) || (firstDots == 2))
> +                     needsCRC = 1;
> +             if (needsCRC) {
> +                     str_o_len = o_crc;
> +                     valueCRC = crc_itu_t(0, ocu, ocu_len);
> +                     crc[0] = CRC_MARK;
> +                     crc[1] = hex_asc_upper_hi(valueCRC >> 8);
> +                     crc[2] = hex_asc_upper_lo(valueCRC >> 8);
> +                     crc[3] = hex_asc_upper_hi(valueCRC);
> +                     crc[4] = hex_asc_upper_lo(valueCRC);
> +                     len = min_t(int, CRC_LEN, str_max_len - str_o_len);
> +                     memcpy(&str_o[str_o_len], crc, len);
> +                     str_o_len += len;
> +                     ext_o_len = ext_crc_len;
> +             }
> +             if (ext_o_len > 0) {
> +                     memcpy(&str_o[str_o_len], ext, ext_o_len);
> +                     str_o_len += ext_o_len;
> +             }
>       }
>  
>       return str_o_len;

                                                                Honza
-- 
Jan Kara <[email protected]>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 7/7] udf: Merge linux specific translation into CS0 conversion function

Reply via email to