byte lengths.

marvin Thu, 05 May 2016 13:57:07 -0700

Address -Wconversion for string/byte lengths.

For text lengths, Unicode code point counts, and sometimes arbitrary
byte lengths: add casts and address potential overflow issues with
checks.



Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/a4b0b3b2
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/a4b0b3b2
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/a4b0b3b2

Branch: refs/heads/master
Commit: a4b0b3b252f4bf253039756bad02b2fe80077114
Parents: 5ba1525
Author: Marvin Humphrey <mar...@rectangular.com>
Authored: Mon May 2 17:08:32 2016 -0700
Committer: Marvin Humphrey <mar...@rectangular.com>
Committed: Wed May 4 19:21:37 2016 -0700

----------------------------------------------------------------------
 core/Lucy/Analysis/Normalizer.c        | 18 +++++++++++++-----
 core/Lucy/Analysis/SnowballStemmer.c   | 11 +++++++++--
 core/Lucy/Analysis/StandardTokenizer.c | 10 ++++++----
 core/Lucy/Highlight/Highlighter.c      |  4 ++--
 core/Lucy/Index/HighlightWriter.c      |  8 ++++----
 core/Lucy/Index/Posting/MatchPosting.c |  2 +-
 core/Lucy/Plan/TextType.c              |  6 +++---
 core/Lucy/Util/Json.c                  |  4 ++--
 core/Lucy/Util/MemoryPool.c            |  2 +-
 9 files changed, 41 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Analysis/Normalizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/Normalizer.c b/core/Lucy/Analysis/Normalizer.c
index 4258374..d569141 100644
--- a/core/Lucy/Analysis/Normalizer.c
+++ b/core/Lucy/Analysis/Normalizer.c
@@ -81,7 +81,7 @@ Normalizer_Transform_IMP(Normalizer *self, Inversion 
*inversion) {
         TokenIVARS *const token_ivars = Token_IVARS(token);
         ssize_t len
             = utf8proc_decompose((uint8_t*)token_ivars->text,
-                                 token_ivars->len, buffer, bufsize,
+                                 (ssize_t)token_ivars->len, buffer, bufsize,
                                  ivars->options);
 
         if (len > bufsize) {
@@ -91,9 +91,13 @@ Normalizer_Transform_IMP(Normalizer *self, Inversion 
*inversion) {
             }
             // allocate additional INITIAL_BUFSIZE items
             bufsize = len + INITIAL_BUFSIZE;
-            buffer = (int32_t*)MALLOCATE((bufsize + 1) * sizeof(int32_t));
+            if ((size_t)bufsize >= SIZE_MAX / sizeof(int32_t) - 
sizeof(int32_t)) {
+                THROW(ERR, "Requested bufsize too large: %u64",
+                      (uint64_t)bufsize);
+            }
+            buffer = (int32_t*)MALLOCATE(((size_t)bufsize + 1) * 
sizeof(int32_t));
             len = utf8proc_decompose((uint8_t*)token_ivars->text,
-                                     token_ivars->len, buffer, bufsize,
+                                     (ssize_t)token_ivars->len, buffer, 
bufsize,
                                      ivars->options);
         }
         if (len < 0) {
@@ -104,11 +108,15 @@ Normalizer_Transform_IMP(Normalizer *self, Inversion 
*inversion) {
 
         if (len >= 0) {
             if (len > (ssize_t)token_ivars->len) {
+                if (len >= INT32_MAX - 1) {
+                    THROW(ERR, "Normalized result over 2 GB: %u64",
+                          (uint64_t)len);
+                }
                 FREEMEM(token_ivars->text);
-                token_ivars->text = (char*)MALLOCATE(len + 1);
+                token_ivars->text = (char*)MALLOCATE((size_t)len + 1);
             }
             memcpy(token_ivars->text, buffer, len + 1);
-            token_ivars->len = len;
+            token_ivars->len = (size_t)len;
         }
     }
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Analysis/SnowballStemmer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/SnowballStemmer.c 
b/core/Lucy/Analysis/SnowballStemmer.c
index 982e4c1..8e6870c 100644
--- a/core/Lucy/Analysis/SnowballStemmer.c
+++ b/core/Lucy/Analysis/SnowballStemmer.c
@@ -79,9 +79,16 @@ SnowStemmer_Transform_IMP(SnowballStemmer *self, Inversion 
*inversion) {
         TokenIVARS *const token_ivars = Token_IVARS(token);
         const sb_symbol *stemmed_text 
             = sb_stemmer_stem(snowstemmer, (sb_symbol*)token_ivars->text,
-                              token_ivars->len);
-        size_t len = sb_stemmer_length(snowstemmer);
+                              (int)token_ivars->len);
+        int length = sb_stemmer_length(snowstemmer);
+        if (length < 0) {
+            THROW(ERR, "Unexpected value for sb_stemmer_length: %d", length);
+        }
+        size_t len = (size_t)length;
         if (len > token_ivars->len) {
+            if (len >= INT32_MAX - 1) {
+                THROW(ERR, "String over 2Gb: %u64", (uint64_t)len);
+            }
             FREEMEM(token_ivars->text);
             token_ivars->text = (char*)MALLOCATE(len + 1);
         }

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Analysis/StandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.c 
b/core/Lucy/Analysis/StandardTokenizer.c
index 4a90700..012d428 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -147,9 +147,10 @@ S_parse_single(const char *text, size_t len, 
lucy_StringIter *iter,
     lucy_StringIter start = *iter;
     int wb = S_skip_extend_format(text, len, iter);
 
-    Token *token = Token_new(text + start.byte_pos,
-                             iter->byte_pos - start.byte_pos,
-                             start.char_pos, iter->char_pos, 1.0f, 1);
+    Token *token
+        = Token_new(text + start.byte_pos, iter->byte_pos - start.byte_pos,
+                    (uint32_t)start.char_pos, (uint32_t)iter->char_pos,
+                    1.0f, 1);
     Inversion_Append(inversion, token);
 
     return wb;
@@ -251,7 +252,8 @@ S_parse_word(const char *text, size_t len, lucy_StringIter 
*iter,
     Token *token;
 word_break:
     token = Token_new(text + start.byte_pos, end.byte_pos - start.byte_pos,
-                      start.char_pos, end.char_pos, 1.0f, 1);
+                      (uint32_t)start.char_pos, (uint32_t)end.char_pos,
+                      1.0f, 1);
     Inversion_Append(inversion, token);
 
     return wb;

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Highlight/Highlighter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Highlight/Highlighter.c 
b/core/Lucy/Highlight/Highlighter.c
index 875d019..6db17dd 100644
--- a/core/Lucy/Highlight/Highlighter.c
+++ b/core/Lucy/Highlight/Highlighter.c
@@ -389,10 +389,10 @@ Highlighter_Raw_Excerpt_IMP(Highlighter *self, String 
*field_val,
         // around the hottest point in the field, start the fragment at the
         // beginning.
         start    = 0;
-        max_skip = best_location;
+        max_skip = (uint32_t)best_location;
     }
     else {
-        start    = best_location - ivars->slop;
+        start    = best_location - (int32_t)ivars->slop;
         max_skip = ivars->slop;
         StrIter_Advance(top, (size_t)start);
     }

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Index/HighlightWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/HighlightWriter.c 
b/core/Lucy/Index/HighlightWriter.c
index b580dbc..64b1a4c 100644
--- a/core/Lucy/Index/HighlightWriter.c
+++ b/core/Lucy/Index/HighlightWriter.c
@@ -156,9 +156,9 @@ HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion 
*inversion) {
     while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
         Token *token = *tokens;
         char *const   token_text = Token_Get_Text(token);
-        const int32_t token_len  = Token_Get_Len(token);
+        const size_t  token_len  = Token_Get_Len(token);
 
-        int32_t overlap = StrHelp_overlap(last_text, token_text,
+        size_t overlap  = StrHelp_overlap(last_text, token_text,
                                           last_len, token_len);
         char *ptr;
         char *orig;
@@ -179,8 +179,8 @@ HLWriter_TV_Buf_IMP(HighlightWriter *self, Inversion 
*inversion) {
         num_postings += 1;
 
         // Append the string diff to the tv_buf.
-        NumUtil_encode_ci32(overlap, &ptr);
-        NumUtil_encode_ci32((token_len - overlap), &ptr);
+        NumUtil_encode_ci32((int32_t)overlap, &ptr);
+        NumUtil_encode_ci32((int32_t)(token_len - overlap), &ptr);
         memcpy(ptr, (token_text + overlap), (token_len - overlap));
         ptr += token_len - overlap;
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Index/Posting/MatchPosting.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Posting/MatchPosting.c 
b/core/Lucy/Index/Posting/MatchPosting.c
index c1530ea..a428fac 100644
--- a/core/Lucy/Index/Posting/MatchPosting.c
+++ b/core/Lucy/Index/Posting/MatchPosting.c
@@ -133,7 +133,7 @@ MatchPost_Add_Inversion_To_Pool_IMP(MatchPosting *self,
     Inversion_Reset(inversion);
     while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) {
         TokenIVARS *const token_ivars = Token_IVARS(*tokens);
-        uint32_t raw_post_bytes
+        size_t raw_post_bytes
             = MAX_RAW_POSTING_LEN(base_size, token_ivars->len);
         RawPosting *raw_posting
             = RawPost_new(MemPool_Grab(mem_pool, raw_post_bytes), doc_id,

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Plan/TextType.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Plan/TextType.c b/core/Lucy/Plan/TextType.c
index 318fa47..5a3e323 100644
--- a/core/Lucy/Plan/TextType.c
+++ b/core/Lucy/Plan/TextType.c
@@ -129,13 +129,13 @@ TextTermStepper_Write_Delta_IMP(TextTermStepper *self, 
OutStream *outstream,
     }
 
     // Count how many bytes the strings share at the top.
-    const int32_t overlap = StrHelp_overlap(last_text, new_text,
-                                            last_size, new_size);
+    const size_t overlap = StrHelp_overlap(last_text, new_text,
+                                           last_size, new_size);
     const char *const diff_start_str = new_text + overlap;
     const size_t diff_len            = new_size - overlap;
 
     // Write number of common bytes and common bytes.
-    OutStream_Write_CI32(outstream, overlap);
+    OutStream_Write_CI32(outstream, (int32_t)overlap);
     OutStream_Write_String(outstream, diff_start_str, diff_len);
 
     // Update value.

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Util/Json.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c
index c74f64f..6c7473d 100644
--- a/core/Lucy/Util/Json.c
+++ b/core/Lucy/Util/Json.c
@@ -566,7 +566,7 @@ S_parse_string(const char **json_ptr, const char *limit) {
     }
     else {
         // Optimize common case where there are no escapes.
-        size_t len = end - top;
+        size_t len = (size_t)(end - top);
         if (!StrHelp_utf8_valid(top, len)) {
             String *mess = MAKE_MESS("Bad UTF-8 in JSON");
             Err_set_error(Err_new(mess));
@@ -582,7 +582,7 @@ S_unescape_text(const char *top, const char *end) {
     // because only a \u escape can theoretically be too long and
     // StrHelp_encode_utf8_char guards against sequences over 4 bytes.
     // Therefore we can allocate once and not worry about reallocating.
-    size_t cap = end - top + 1;
+    size_t cap = (size_t)(end - top) + 1;
     char *target_buf = (char*)MALLOCATE(cap);
     size_t target_size = 0;
     for (const char *text = top; text < end; text++) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/a4b0b3b2/core/Lucy/Util/MemoryPool.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/MemoryPool.c b/core/Lucy/Util/MemoryPool.c
index d1d394b..ac89873 100644
--- a/core/Lucy/Util/MemoryPool.c
+++ b/core/Lucy/Util/MemoryPool.c
@@ -126,7 +126,7 @@ MemPool_Grab_IMP(MemoryPool *self, size_t amount) {
 void
 MemPool_Resize_IMP(MemoryPool *self, void *ptr, size_t new_amount) {
     MemoryPoolIVARS *const ivars = MemPool_IVARS(self);
-    const size_t last_amount = ivars->buf - ivars->last_buf;
+    const size_t last_amount = (size_t)(ivars->buf - ivars->last_buf);
     INCREASE_TO_WORD_MULTIPLE(new_amount);
 
     if (ptr != ivars->last_buf) {

[6/7] lucy git commit: Address -Wconversion for string/byte lengths.

Reply via email to