El dilluns, 16 d’abril de 2018, a les 19:11:01 CEST, Adam Reichold va escriure: > Hello again, > > I really think that poppler::ustring defined as > > std::basic_string<unsigned short> > > is already broken where sizeof(unsigned short) != 2 and hence should be > redefined to > > std::basic_string<std::uint16_t> > > without considering that an API break.
Sure, is there any system where sizeof(unsigned short) != 2 anyway? Cheers, Albert > This then also allows to just use > these buffers acquired using ustring::data to be used with the > conversion functions from UTF.h. > > Best regards, Adam. > > Am 16.04.2018 um 18:44 schrieb suzuki toshiya: > > Adam Reichold wrote: > >>> A) use the functions in UTF.h, and change ustring class > >>> to fit them. > >>> > >>> B) rewrite the functions in UTF.h by template programming > >>> and have the interfaces fitting to current ustring(). > >>> > >>> C) write diversions of the functions in UTF.h in cpp frontend. > >> > >> I think what is best here is hard to decide without seeing actual code > >> that at least starts in one of these directions. > > > > Indeed. In fact, yesterday I was trying to write a sample > > patch for B) direction, but I got stuck. Because current UTF.h > > is written in C-style (*p++ = c style copy/concatenation). > > Thus, if I try to pass std::basic_string or std::vector > > object, I have to make yet another template (and different > > implementations) for the concatenation. > > > > Following is current my proposal, which still use reinterpret_cast<>(), > > but if it is not good (e.g. unsigned short != uint16_t), it allocates > > the buffer and free it internally. By replacing iconv() by UTF.h > > functions, I keep from the cast between unsigned short (with unknown > > byte order) array and char array. > > > > Please let me explain... > > > >> --- a/cpp/poppler-global.cpp > >> +++ b/cpp/poppler-global.cpp > >> @@ -26,6 +26,7 @@ > >> > >> #include "poppler-private.h" > >> > >> #include "DateInfo.h" > >> > >> +#include "UTF.h" > >> > >> #include <algorithm> > > > > Include "UTF.h" to use UTF-8 <-> UTF-16 helpers in there. > > > >> @@ -34,31 +35,8 @@ > >> > >> #include <ios> > >> #include <iostream> > >> > >> -#include <iconv.h> > >> - > >> > >> #include "config.h" > >> > >> -namespace > >> -{ > >> - > >> -struct MiniIconv > >> -{ > >> - MiniIconv(const char *to_code, const char *from_code) > >> - : i_(iconv_open(to_code, from_code)) > >> - {} > >> - ~MiniIconv() > >> - { if (is_valid()) iconv_close(i_); } > >> - MiniIconv(const MiniIconv &) = delete; > >> - MiniIconv& operator=(const MiniIconv &) = delete; > >> - bool is_valid() const > >> - { return i_ != (iconv_t)-1; } > >> - operator iconv_t() const > >> - { return i_; } > >> - iconv_t i_; > >> -}; > >> - > >> -} > >> - > > > > Remove iconv() related functions. > > > >> @@ -226,28 +204,31 @@ byte_array ustring::to_utf8() const > >> > >> return byte_array(); > >> > >> } > >> > >> - MiniIconv ic("UTF-8", "UTF-16"); > >> - if (!ic.is_valid()) { > >> - return byte_array(); > > > > Almost replacement of ustring::to_utf8(). See below. > > > >> +#if 0xFFFFU == USHRT_MAX > >> + const uint16_t* utf16_buff = reinterpret_cast<const uint16_t > >> *>(data()); +#else > > > > above is the popular case that uint16_t == unsigned int. > > simple casting is enough to get uint16_t array from > > ustring object. > > > >> + const uint16_t* utf16_begin = new uint16_t[utf16_len]; > >> + const uint16_t* utf16_buff = utf16_begin; > >> + const unsigned short *me = data(); > >> + > >> + for (int i = 0; i < size(); i++) { > >> + utf16_buff[i] = (uint16_t)me[i]; > >> > >> } > > > > [snip the removal of existing code] > > > >> +#endif > > > > above is the irregular case that uint16_t != unsigned int. > > allocate temporal array of uint16_t and copy the contents. > > for skipping of BOM in the beginning, the heaf of allocated > > buffer is recorded in utf16_begin, but the UTF-16 data to > > be converted starts from utf16_buff. > > > >> + const uint16_t* utf16_end = utf16_buff + size(); > >> + if (utf16_buff[0] == 0xFEFF) { > >> + utf16_buff ++; > >> > >> } > > > > in abovem utf16_buff is shifted if the beginning of the > > buffer is BOM. > > > >> - str.resize(str.size() - str_len_left); > >> - return str; > >> + > >> + const int utf8_len = utf16CountUtf8Bytes(utf16_buff); > >> + byte_array ret(utf8_len + 1, '\0'); > > > > count the required buffer size by utf16CountUtf8Bytes in UTF.h, > > and allocate the byte_array to be returned. To assue the null > > termination, the size is incremented and filled by '\0'. > > > >> + utf16ToUtf8(utf16_buff, reinterpret_cast<char *>(ret.data()), > >> utf8_len + 1, utf16_end - utf16_buff); + > >> ret.resize(std::strlen(ret.data())); > > > > convert by utf16ToUtf8(), and shrink to fit converted result. > > > >> + > >> +#if 0xFFFFU != USHRT_MAX > >> + delete utf16_begin; > >> +#endif > >> + return ret; > >> > >> } > > > > if irregular case, the temporal buffer should be freed. > > > >> @@ -256,15 +237,29 @@ std::string ustring::to_latin1() const > >> > >> return std::string(); > >> > >> } > >> > >> - const size_type mylength = size(); > >> - std::string ret(mylength, '\0'); > >> > >> const value_type *me = data(); > >> > >> - for (size_type i = 0; i < mylength; ++i) { > >> + const value_type *me_end = me + size(); > >> + if (*me == 0xFEFF) { > >> + me ++; > >> + } > >> + std::string ret(me_end - me, '\0'); > >> + for (size_type i = 0; me < me_end; ++i) { > >> > >> ret[i] = (char)*me++; > >> > >> } > >> return ret; > >> > >> } > > > > original implementation of ustring::to_latin() does not > > care for BOM. now BOM is checked and skipped if there is. > > > >> +static bool has_bom_utf8(const char *c, int len) > >> +{ > >> + if ( 3 > len ) { > >> + return false; > >> + } > >> + if (c[0] == 0xEF && c[1] == 0xBB && c[2] == 0xBF) { > >> + return true; > >> + } > >> + return false; > >> +} > >> + > >> > >> ustring ustring::from_utf8(const char *str, int len) > >> { > >> > >> if (len <= 0) { > > > > even if ustring::from_utf8 receives UTF-8 string with BOM, > > created ustring object should not include BOM. has_bom_utf8() > > is an utility function for it. > > > >> @@ -273,31 +268,27 @@ ustring ustring::from_utf8(const char *str, int > >> len) > >> > >> return ustring(); > >> > >> } > >> > >> } > >> > >> - > >> - MiniIconv ic("UTF-16", "UTF-8"); > >> - if (!ic.is_valid()) { > >> - return ustring(); > > > > remove iconv() related function. > > > >> + int utf16_count = utf8CountUtf16CodeUnits(str); > >> + if (has_bom_utf8(str, len)) { > >> + str += 3; // skip BOM > >> + len -= 3; // skip BOM > >> + utf16_count --; // skip BOM > >> > >> } > > > > if UTF-8 starts with BOM, it is skiped, the length of > > UTF-8 buffer is decremented (3 octets), the length of > > UTF-16 buffer is decremented (16bit). > > > >> + ustring ret(utf16_count + 1, 0); > > > > allocate ustring object to be returned. to assure the > > U+0000l termination, the size is incremented. > > > >> +#if 0xFFFFU == USHRT_MAX > >> + const uint16_t* utf16_buff = reinterpret_cast<const uint16_t > >> *>(ret.data());> > > this is for popular case where uint16_t == unsigned short. > > simple cast is enough to make the array of uint16_t. > > > >> +#else > >> + const uint16_t* utf16_buff = new uint16_t[utf16_count + 1](0); > >> +#endif > > > > this is for irregular case. allocate temporal buffer to store > > uint16_t data. > > > >> + utf8ToUtf16(str, (uint16_t*)utf16_buff, utf16_count + 1, len); > > > > fill utf16_buff by converted UTF-16 data. > > > >> +#if 0xFFFFU != USHRT_MAX > >> + for (int i = 0; i < utf16_count; i ++) { > >> + ret[i] = (unsigned short)utf16_buff[i]; > >> > >> } > > > > in irregular case, the content should be copied to the object > > to be returned. > > > >> + delete utf16_buff; > >> +#endif > > > > the buffer for irregular case is freed here. > > > >> return ret; > >> > >> } > >> > >> --- a/cpp/poppler-page.cpp > >> +++ b/cpp/poppler-page.cpp > >> @@ -355,7 +365,7 @@ std::vector<text_box> page::text_list() const > >> > >> TextWord *word = word_list->get(i); > >> > >> std::unique_ptr<GooString> gooWord{word->getText()}; > >> > >> - ustring ustr = > >> detail::unicode_GooString_to_ustring(gooWord.get()); + > >> ustring ustr = ustring::from_utf8(gooWord->getCString());> > > here, gooWord is the object created by TextOutputDev, > > so it is always UTF-8. it should be passed to ustring::from_utf8(). > > > >> --- a/cpp/poppler-private.cpp > >> +++ b/cpp/poppler-private.cpp > >> @@ -23,6 +23,7 @@ > >> > >> #include "poppler-private.h" > >> > >> #include "GooString.h" > >> > >> +#include "PDFDocEncoding.h" > >> > >> #include "Page.h" > >> > >> #include <ctime> > > > > the strings for PDF metadata is UTF-16BE with BOM or PDFDocEncoding. > > to use PDFDocEncoding helper, include PDFDocEncoding.h > > > >> @@ -58,16 +59,16 @@ rectf detail::pdfrectangle_to_rectf(const > >> PDFRectangle &pdfrect)>> > >> ustring detail::unicode_GooString_to_ustring(const GooString *str) > >> { > >> > >> - const char *data = str->getCString(); > >> > >> const int len = str->getLength(); > >> > >> + const char *data = str->getCString(); > >> + const char *data_end = data + len; > > > > input data may or may not have BOM, so the length of UTF-8 string > > to be converted may or may not be len. to clarify the end of buffer, > > data_end is calculated, instead of the index "i" in the existing > > implementation. > > > >> - int i = 0; > >> > >> bool is_unicode = false; > >> if ((data[0] & 0xff) == 0xfe && (len > 1 && (data[1] & 0xff) == > >> 0xff)) { > >> > >> is_unicode = true; > >> > >> - i = 2; > >> + data += 2; // skip BOM > >> > >> } > > > > instead of the index i, the beginning of UTF-16BE buffer is shifted > > to skip BOM. > > > >> - ustring::size_type ret_len = len - i; > >> + ustring::size_type ret_len = data_end - data; > >> > >> if (is_unicode) { > >> > >> ret_len >>= 1; > >> > >> } > > > > ditto. > > > >> @@ -75,15 +76,15 @@ ustring detail::unicode_GooString_to_ustring(const > >> GooString *str)>> > >> size_t ret_index = 0; > >> ustring::value_type u; > >> if (is_unicode) { > >> > >> - while (i < len) { > >> - u = ((data[i] & 0xff) << 8) | (data[i + 1] & 0xff); > >> - i += 2; > >> + while (data < data_end) { > >> + u = ((*data & 0xff) << 8) | (*(data + 1) & 0xff); > >> + data += 2; > >> > >> ret[ret_index++] = u; > >> > >> } > >> > >> } else { > >> > >> - while (i < len) { > >> - u = data[i] & 0xff; > >> - ++i; > >> + while (data < data_end) { > >> + u = pdfDocEncoding[ *data & 0xff ]; > >> + data ++; > >> > >> ret[ret_index++] = u; > >> > >> } > >> > >> } > > > > use *data instead of data[i]. _______________________________________________ poppler mailing list poppler@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/poppler