Hi, The problem I mentioned in the last mail is solved by replacing the bool `cmap->unsorted' by `cmap->flags.' So here is the patch.
For "bad" fonts, the improvement is great, since we can do binary search now: bkai00mp.ttf ------------- (original) Get_Char_Index : 31.956 us/op Charmap iteration : 454657.805 us/op (new) Get_Char_Index : 0.376 us/op Charmap iteration : 1274.315 us/op As for "good" fonts, there is no big impact on them. FT_Get_Char_Index is slightly slower partly because of function call overhead, i.e. `char_index' and `char_next' are wrappers to the same function. I have not yet check how other cmap classes implment `char_index' and `char_next'. If it turns out only one function is needed, we can change the cmap class interface and the overhead disappears. Charmap iteration is faster for unknown reason :-) Vera.ttf ------------- (original) Get_Char_Index : 0.191 us/op Charmap iteration : 25.127 us/op (new) Get_Char_Index : 0.209 us/op Charmap iteration : 23.035 us/op Please test if fonts still work correctly, "good" or "bad". Also, please let me know if you find other fonts which is "bad" too. (according to the comments in the code, they are Asian fonts. I am kind of afraid it only refers to bkai00mp.ttf.) -- Regards, olv
Major update to distinguish between unsorted and overlapped segments for cmap format 4. For overlapped but sorted segments, which is previously considered unsorted, we still use binary search. * src/sfnt/ttcmap.h (struct TT_CMapRec_): Replace `unsorted' by `flags'. (TT_CMAP_FLAG_UNSORTED, TT_CMAP_FLAG_OVERLAPPED): New macros. * src/sfnt/ttcmap.c (OPT_CMAP4): Removed as it is always defined. (struct TT_CMap4Rec_): Remove `old_charcode' and `table_length'. (tt_cmap4_reset): Removed. (tt_cmap4_init): Update accordingly. (tt_cmap4_next): Update accordingly. Take care of overlapped segments. (tt_cmap4_validate): Make sure the subtable is large enough. Do not check glyph_ids because some fonts set the length wrongly. Also when all segments have offset 0, glyph_ids is always invalid. It does not cause any problem so far because the check misses equality. Distinguish between unsorted and overlapped segments. (tt_cmap4_char_map_linear, tt_cmap4_char_map_binary): New functions to do "charcode => glyph index" by linear/binary search. (tt_cmap4_char_index, tt_cmap4_char_next): Use tt_cmap4_char_map_linear and tt_cmap4_char_map_binary. (tt_face_build_cmaps): Treat the return value of validator as flags for cmap. === src/sfnt/ttcmap.c ================================================================== --- src/sfnt/ttcmap.c (revision 2937) +++ src/sfnt/ttcmap.c (local) @@ -625,18 +625,12 @@ #ifdef TT_CONFIG_CMAP_FORMAT_4 -#define OPT_CMAP4 - -#ifdef OPT_CMAP4 - typedef struct TT_CMap4Rec_ { TT_CMapRec cmap; - FT_UInt32 old_charcode; /* old charcode */ FT_UInt32 cur_charcode; /* current charcode */ FT_UInt cur_gindex; /* current glyph index */ - FT_UInt table_length; FT_UInt num_ranges; FT_UInt cur_range; FT_UInt cur_start; @@ -656,14 +650,9 @@ cmap->cmap.data = table; - p = table + 2; - cmap->table_length = FT_PEEK_USHORT( p ); - p = table + 6; cmap->num_ranges = FT_PEEK_USHORT( p ) >> 1; - cmap->cur_range = cmap->num_ranges; - cmap->old_charcode = 0xFFFFFFFFUL; - cmap->cur_charcode = 0; + cmap->cur_charcode = 0xFFFFFFFFUL; cmap->cur_gindex = 0; return SFNT_Err_Ok; @@ -707,22 +696,28 @@ range_index++; } - cmap->old_charcode = 0xFFFFFFFFUL; - cmap->cur_charcode = 0; - cmap->cur_gindex = 0; - cmap->cur_range = num_ranges; return -1; } + /* find the index of the charcode next to cmap->cur_charcode; */ + /* caller should call tt_cmap4_set_range with proper range */ + /* before calling this function */ + /* */ static void tt_cmap4_next( TT_CMap4 cmap ) { - FT_UInt charcode = cmap->cur_charcode + 1; + FT_UInt charcode; + + if ( cmap->cur_charcode >= 0xFFFFUL ) + goto Fail; - cmap->old_charcode = cmap->cur_charcode; + charcode = cmap->cur_charcode + 1; + if ( charcode < cmap->cur_start ) + charcode = cmap->cur_start; + for ( ;; ) { FT_Byte* values = cmap->cur_values; @@ -775,27 +770,16 @@ if ( tt_cmap4_set_range( cmap, cmap->cur_range + 1 ) < 0 ) break; - charcode = cmap->cur_start; + if ( charcode < cmap->cur_start ) + charcode = cmap->cur_start; } - } - - static void - tt_cmap4_reset( TT_CMap4 cmap, - FT_UInt code, - FT_UInt range_index ) - { - if ( tt_cmap4_set_range( cmap, range_index ) >= 0 ) - { - cmap->cur_charcode = code; - tt_cmap4_next( cmap ); - } + Fail: + cmap->cur_charcode = 0xFFFFFFFFUL; + cmap->cur_gindex = 0; } -#endif /* OPT_CMAP4 */ - - FT_CALLBACK_DEF( FT_Error ) tt_cmap4_validate( FT_Byte* table, FT_Validator valid ) @@ -807,11 +791,11 @@ FT_Error error = SFNT_Err_Ok; - /* in certain fonts, the `length' field is invalid and goes */ - /* out of bound. We try to correct this here... */ if ( length < 16 ) FT_INVALID_TOO_SHORT; + /* in certain fonts, the `length' field is invalid and goes */ + /* out of bound. We try to correct this here... */ if ( table + length > valid->limit ) { if ( valid->level >= FT_VALIDATE_TIGHT ) @@ -832,6 +816,9 @@ num_segs /= 2; + if ( length < 16 + num_segs * 2 * 4 ) + FT_INVALID_TOO_SHORT; + /* check the search parameters - even though we never use them */ /* */ if ( valid->level >= FT_VALIDATE_PARANOID ) @@ -863,9 +850,6 @@ offsets = deltas + num_segs * 2; glyph_ids = offsets + num_segs * 2; - if ( glyph_ids > table + length ) - FT_INVALID_TOO_SHORT; - /* check last segment, its end count must be FFFF */ if ( valid->level >= FT_VALIDATE_PARANOID ) { @@ -874,10 +858,9 @@ FT_INVALID_DATA; } - /* check that segments are sorted in increasing order and do not */ - /* overlap; check also the offsets */ { - FT_UInt start, end, last = 0, offset, n; + FT_UInt start, end, offset, n; + FT_UInt last_start = 0, last_end = 0; FT_Int delta; @@ -899,12 +882,20 @@ /* unfortunately, some popular Asian fonts present overlapping */ /* ranges in their charmaps */ /* */ - if ( n > 0 && start <= last ) + if ( start <= last_end && n > 0 ) { if ( valid->level >= FT_VALIDATE_TIGHT ) FT_INVALID_DATA; else - error = SFNT_Err_Invalid_CharMap_Format; + { + /* allow overlapping segments, provided their start points */ + /* and end points, respectively, are in ascending order. */ + /* */ + if ( last_start > start || last_end > end ) + error |= TT_CMAP_FLAG_UNSORTED; + else + error |= TT_CMAP_FLAG_OVERLAPPED; + } } if ( offset && offset != 0xFFFFU ) @@ -955,7 +946,8 @@ FT_INVALID_DATA; } - last = end; + last_start = start; + last_end = end; } } @@ -963,327 +955,333 @@ } - FT_CALLBACK_DEF( FT_UInt ) - tt_cmap4_char_index( TT_CMap cmap, - FT_UInt32 char_code ) + static FT_UInt + tt_cmap4_char_map_linear( TT_CMap cmap, + FT_UInt* pcharcode, + FT_Bool next ) { - FT_Byte* table = cmap->data; - FT_UInt result = 0; + FT_UInt num_segs2, start, end, offset; + FT_Int delta; + FT_UInt i, num_segs; + FT_UInt32 charcode = *pcharcode; + FT_UInt gindex = 0; + FT_Byte* p; - if ( char_code < 0x10000UL ) - { - FT_UInt idx, num_segs2; - FT_Int delta; - FT_UInt code = (FT_UInt)char_code; - FT_Byte* p; + p = cmap->data + 6; + num_segs2 = FT_PAD_FLOOR( TT_PEEK_USHORT( p ), 2 ); - p = table + 6; - num_segs2 = FT_PAD_FLOOR( TT_PEEK_USHORT( p ), 2 ); /* be paranoid! */ + num_segs = num_segs2 >> 1; - if ( !cmap->unsorted ) - { - /* Some fonts have more than 170 segments in their charmaps! */ - /* We changed this function to use a more efficient binary */ - /* search for improving performance */ - FT_UInt min = 0; - FT_UInt max = num_segs2 >> 1; - FT_UInt mid, start, end, offset; + if ( !num_segs ) + return 0; + if ( next ) + charcode++; - while ( min < max ) - { - mid = ( min + max ) >> 1; - p = table + 14 + mid * 2; - end = TT_NEXT_USHORT( p ); - p += num_segs2; - start = TT_PEEK_USHORT( p); + /* linear search */ + for ( ; charcode <= 0xFFFFU; charcode++ ) + { + FT_Byte* q; + - if ( code < start ) - max = mid; - else if ( code > end ) - min = mid + 1; - else - { - /* we found the segment */ - idx = code; + p = cmap->data + 14; /* ends table */ + q = cmap->data + 16 + num_segs2; /* starts table */ - p += num_segs2; - delta = TT_PEEK_SHORT( p ); + for ( i = 0; i < num_segs; i++ ) + { + end = TT_NEXT_USHORT( p ); + start = TT_NEXT_USHORT( q ); - p += num_segs2; - offset = TT_PEEK_USHORT( p ); + if ( charcode >= start && charcode <= end ) + { + p = q - 2 + num_segs2; + delta = TT_PEEK_SHORT( p ); + p += num_segs2; + offset = TT_PEEK_USHORT( p ); - if ( offset == 0xFFFFU ) - goto Exit; + if ( offset == 0xFFFFU ) + continue; - if ( offset != 0 ) - { - p += offset + 2 * ( idx - start ); - idx = TT_PEEK_USHORT( p ); - } - - if ( idx != 0 ) - result = (FT_UInt)( idx + delta ) & 0xFFFFU; - - goto Exit; + if ( offset ) + { + p += offset + ( charcode - start ) * 2; + gindex = TT_PEEK_USHORT( p ); + if ( gindex != 0 ) + gindex = (FT_UInt)( gindex + delta ) & 0xFFFFU; } + else + gindex = (FT_UInt)( charcode + delta ) & 0xFFFFU; + + break; } } - else - { - FT_UInt n; - FT_Byte* q; + if ( !next || gindex ) + break; + } - p = table + 14; /* ends table */ - q = table + 16 + num_segs2; /* starts table */ + if ( next && gindex ) + *pcharcode = charcode; + return gindex; + } - for ( n = 0; n < num_segs2; n += 2 ) - { - FT_UInt end = TT_NEXT_USHORT( p ); - FT_UInt start = TT_NEXT_USHORT( q ); - FT_UInt offset; + static FT_UInt + tt_cmap4_char_map_binary( TT_CMap cmap, + FT_UInt* pcharcode, + FT_Bool next ) + { + FT_UInt num_segs2, start, end, offset; + FT_Int delta; + FT_UInt max, min, mid, num_segs; + FT_UInt charcode = *pcharcode; + FT_UInt gindex = 0; + FT_Byte* p; + - if ( code < start ) - break; + p = cmap->data + 6; + num_segs2 = FT_PAD_FLOOR( TT_PEEK_USHORT( p ), 2 ); - if ( code <= end ) - { - idx = code; + num_segs = num_segs2 >> 1; - p = q + num_segs2 - 2; - delta = TT_PEEK_SHORT( p ); - p += num_segs2; - offset = TT_PEEK_USHORT( p ); + if ( !num_segs ) + return 0; - if ( offset == 0xFFFFU ) - goto Exit; + if ( next ) + charcode++; - if ( offset != 0 ) - { - p += offset + 2 * ( idx - start ); - idx = TT_PEEK_USHORT( p ); - } + min = 0; + max = num_segs; - if ( idx != 0 ) - result = (FT_UInt)( idx + delta ) & 0xFFFFU; - } - } - } - } + /* binary search */ + while ( min < max ) + { + mid = ( min + max ) >> 1; + p = cmap->data + 14 + mid * 2; + end = TT_PEEK_USHORT( p ); + p += 2 + num_segs2; + start = TT_PEEK_USHORT( p ); - Exit: - return result; - } + if ( charcode < start ) + max = mid; + else if ( charcode > end ) + min = mid + 1; + else + { + p += num_segs2; + delta = TT_PEEK_SHORT( p ); + p += num_segs2; + offset = TT_PEEK_USHORT( p ); + /* find the first segment containing `charcode' */ + if ( cmap->flags & TT_CMAP_FLAG_OVERLAPPED ) + { + FT_UInt i; - FT_CALLBACK_DEF( FT_UInt ) - tt_cmap4_char_next( TT_CMap cmap, - FT_UInt32 *pchar_code ) - { - FT_Byte* table = cmap->data; - FT_UInt32 result = 0; - FT_UInt gindex = 0; - FT_UInt32 char_code = *pchar_code; - FT_Byte* p; - FT_UInt code, num_segs2; + /* call the current segment `max' */ + max = mid; - if ( char_code >= 0xFFFFUL ) - goto Exit; + if ( offset == 0xFFFFU ) + mid = max + 1; -#ifdef OPT_CMAP4 - { - TT_CMap4 cmap4 = (TT_CMap4)cmap; - - - if ( char_code == cmap4->old_charcode ) - { - result = cmap4->cur_charcode; - gindex = cmap4->cur_gindex; - if ( result != 0 ) + /* find in segments before the current segment */ + for ( i = max ; i > 0; i-- ) { - tt_cmap4_next( cmap4 ); - goto Exit; - } - } - } -#endif /* OPT_CMAP4 */ + FT_UInt prev_end; - code = (FT_UInt)char_code + 1; - p = table + 6; - num_segs2 = FT_PAD_FLOOR( TT_PEEK_USHORT( p ), 2 ); /* ensure even-ness */ - if ( !cmap->unsorted ) - { - for (;;) - { - /* Some fonts have more than 170 segments in their charmaps! */ - /* We changed this function to use a more efficient binary */ - /* search */ - FT_UInt offset; - FT_Int delta; - FT_UInt min = 0; - FT_UInt max = num_segs2 >> 1; - FT_UInt mid, start, end; - FT_UInt hi; + p = cmap->data + 14 + ( i - 1 ) * 2; + prev_end = TT_PEEK_USHORT( p ); + if ( charcode > prev_end ) + break; - /* we begin by finding the segment which end is - closer to our code point */ - hi = max + 1; - while ( min < max ) - { - mid = ( min + max ) >> 1; - p = table + 14 + mid * 2; - end = TT_PEEK_USHORT( p ); + end = prev_end; + p += 2 + num_segs2; + start = TT_PEEK_USHORT( p ); + p += num_segs2; + delta = TT_PEEK_SHORT( p ); + p += num_segs2; + offset = TT_PEEK_USHORT( p ); - if ( end < code ) - min = mid + 1; - else - { - hi = mid; - max = mid; + if ( offset != 0xFFFFU ) + mid = i - 1; } - } - if ( hi > max ) - { - /* the point is behind the last segment; - we will exit right now */ - goto Exit; - } + /* no luck */ + if ( mid == max + 1 ) + { + if ( i != max ) + { + p = cmap->data + 14 + max * 2; + end = TT_PEEK_USHORT( p ); + p += 2 + num_segs2; + start = TT_PEEK_USHORT( p ); + p += num_segs2; + delta = TT_PEEK_SHORT( p ); + p += num_segs2; + offset = TT_PEEK_USHORT( p ); + } - p = table + 14 + hi * 2; - end = TT_PEEK_USHORT( p ); + mid = max; - p += 2 + num_segs2; - start = TT_PEEK_USHORT( p ); + /* find in segments after the current segment */ + for ( i = max + 1; i < num_segs; i++ ) + { + FT_UInt next_end, next_start; - if ( code < start ) - code = start; - p += num_segs2; - delta = TT_PEEK_USHORT( p ); + p = cmap->data + 14 + i * 2; + next_end = TT_PEEK_USHORT( p ); + p += 2 + num_segs2; + next_start = TT_PEEK_USHORT( p ); - p += num_segs2; - offset = TT_PEEK_USHORT( p ); + if ( charcode < next_start ) + break; - if ( offset != 0 && offset != 0xFFFFU ) - { - /* parse the glyph ids array for non-zero index */ - p += offset + ( code - start ) * 2; - while ( code <= end ) - { - gindex = TT_NEXT_USHORT( p ); - if ( gindex != 0 ) + end = next_end; + start = next_start; + p += num_segs2; + delta = TT_PEEK_SHORT( p ); + p += num_segs2; + offset = TT_PEEK_USHORT( p ); + + if ( offset != 0xFFFFU ) + mid = i; + } + i--; + + /* still no luck */ + if ( mid == max ) { - gindex = (FT_UInt)( gindex + delta ) & 0xFFFFU; - if ( gindex != 0 ) - { - result = code; -#ifdef OPT_CMAP4 - tt_cmap4_reset( (TT_CMap4)cmap, code, hi ); -#endif - goto Exit; - } + mid = i; + + break; } - code++; } + + /* end, start, delta and offset are for the i'th segment */ + if ( mid != i ) + { + p = cmap->data + 14 + mid * 2; + end = TT_PEEK_USHORT( p ); + p += 2 + num_segs2; + start = TT_PEEK_USHORT( p ); + p += num_segs2; + delta = TT_PEEK_SHORT( p ); + p += num_segs2; + offset = TT_PEEK_USHORT( p ); + } } - else if ( offset == 0xFFFFU ) + else { - /* an offset of 0xFFFF means an empty segment in certain fonts! */ - code = end + 1; + if ( offset == 0xFFFFU ) + break; } - else /* offset == 0 */ + + if ( offset ) { - gindex = (FT_UInt)( code + delta ) & 0xFFFFU; + p += offset + ( charcode - start ) * 2; + gindex = TT_PEEK_USHORT( p ); if ( gindex != 0 ) - { - result = code; -#ifdef OPT_CMAP4 - tt_cmap4_reset( (TT_CMap4)cmap, code, hi ); -#endif - goto Exit; - } - code++; + gindex = (FT_UInt)( gindex + delta ) & 0xFFFFU; } + else + gindex = (FT_UInt)( charcode + delta ) & 0xFFFFU; + + break; } } - else + + if ( next ) { - for ( ;; ) - { - FT_UInt offset, n; - FT_Int delta; - FT_Byte* q; + TT_CMap4 cmap4 = (TT_CMap4)cmap; - p = table + 14; /* ends table */ - q = table + 16 + num_segs2; /* starts table */ + /* if `charcode' is not in any segment, then `mid' is */ + /* the segment nearest to `charcode' */ + /* */ - for ( n = 0; n < num_segs2; n += 2 ) + if ( charcode > end ) + { + mid++; + if ( mid == num_segs ) + return 0; + } + + if ( tt_cmap4_set_range( cmap4, mid ) ) + { + if ( gindex ) + *pcharcode = charcode; + } + else + { + cmap4->cur_charcode = charcode; + + if ( gindex ) + cmap4->cur_gindex = gindex; + else { - FT_UInt end = TT_NEXT_USHORT( p ); - FT_UInt start = TT_NEXT_USHORT( q ); + cmap4->cur_charcode = charcode; + tt_cmap4_next( cmap4 ); + gindex = cmap4->cur_gindex; + } + if ( gindex ) + *pcharcode = cmap4->cur_charcode; + } + } - if ( code < start ) - code = start; + return gindex; + } - if ( code <= end ) - { - p = q + num_segs2 - 2; - delta = TT_PEEK_SHORT( p ); - p += num_segs2; - offset = TT_PEEK_USHORT( p ); - if ( offset != 0 && offset != 0xFFFFU ) - { - /* parse the glyph ids array for non-0 index */ - p += offset + ( code - start ) * 2; - while ( code <= end ) - { - gindex = TT_NEXT_USHORT( p ); - if ( gindex != 0 ) - { - gindex = (FT_UInt)( gindex + delta ) & 0xFFFFU; - if ( gindex != 0 ) - break; - } - code++; - } - } - else if ( offset == 0xFFFFU ) - { - /* an offset of 0xFFFF means an empty glyph in certain fonts! */ - code = end; - break; - } - else - gindex = (FT_UInt)( code + delta ) & 0xFFFFU; + FT_CALLBACK_DEF( FT_UInt ) + tt_cmap4_char_index( TT_CMap cmap, + FT_UInt32 char_code ) + { + if ( char_code >= 0x10000U ) + return 0; - if ( gindex == 0 ) - break; + if ( cmap->flags & TT_CMAP_FLAG_UNSORTED ) + return tt_cmap4_char_map_linear( cmap, &char_code, 0 ); + else + return tt_cmap4_char_map_binary( cmap, &char_code, 0 ); + } - result = code; - goto Exit; - } - } - /* loop to next trial charcode */ - if ( code >= 0xFFFFU ) - break; - code++; + FT_CALLBACK_DEF( FT_UInt ) + tt_cmap4_char_next( TT_CMap cmap, + FT_UInt32 *pchar_code ) + { + FT_UInt gindex; + + + if ( *pchar_code >= 0xFFFFU ) + return 0; + + if ( cmap->flags & TT_CMAP_FLAG_UNSORTED ) + gindex = tt_cmap4_char_map_linear( cmap, pchar_code, 1 ); + else + { + TT_CMap4 cmap4 = (TT_CMap4)cmap; + + + /* no need to search */ + if ( *pchar_code == cmap4->cur_charcode ) + { + tt_cmap4_next( cmap4 ); + gindex = cmap4->cur_gindex; + if ( gindex ) + *pchar_code = cmap4->cur_charcode; } + else + gindex = tt_cmap4_char_map_binary( cmap, pchar_code, 1 ); } - Exit: - *pchar_code = result; return gindex; } @@ -1305,13 +1303,8 @@ const TT_CMap_ClassRec tt_cmap4_class_rec = { { -#ifdef OPT_CMAP4 sizeof ( TT_CMap4Rec ), (FT_CMap_InitFunc) tt_cmap4_init, -#else - sizeof ( TT_CMapRec ), - (FT_CMap_InitFunc) tt_cmap_init, -#endif (FT_CMap_DoneFunc) NULL, (FT_CMap_CharIndexFunc)tt_cmap4_char_index, (FT_CMap_CharNextFunc) tt_cmap4_char_next @@ -2300,13 +2293,15 @@ if ( valid.validator.error == 0 ) { - (void)FT_CMap_New( (FT_CMap_Class)clazz, cmap, &charmap, NULL ); + FT_CMap ttcmap; - /* it is simpler to directly set the `unsorted' flag instead */ - /* of adding a parameter to FT_CMap_New */ - ((TT_CMap)(face->root.charmaps - [face->root.num_charmaps - 1]))->unsorted = - FT_BOOL( error ); + + if ( !FT_CMap_New( (FT_CMap_Class)clazz, cmap, &charmap, &ttcmap ) ) + { + /* it is simpler to directly set `flags' than adding */ + /* a parameter to FT_CMap_New */ + ((TT_CMap)ttcmap)->flags = (FT_Int)error; + } } else { === src/sfnt/ttcmap.h ================================================================== --- src/sfnt/ttcmap.h (revision 2937) +++ src/sfnt/ttcmap.h (local) @@ -27,11 +27,15 @@ FT_BEGIN_HEADER + +#define TT_CMAP_FLAG_UNSORTED 1 +#define TT_CMAP_FLAG_OVERLAPPED 2 + typedef struct TT_CMapRec_ { FT_CMapRec cmap; FT_Byte* data; /* pointer to in-memory cmap table */ - FT_Bool unsorted; /* for format 4 only */ + FT_Int flags; /* for format 4 only */ } TT_CMapRec, *TT_CMap;
_______________________________________________ Freetype-devel mailing list Freetype-devel@nongnu.org http://lists.nongnu.org/mailman/listinfo/freetype-devel