cpp/CMakeLists.txt | 1 cpp/poppler-font-private.h | 82 +++++++++++++++++++++++++++++++ cpp/poppler-font.cpp | 56 ++------------------- cpp/poppler-font.h | 3 + cpp/poppler-page-private.h | 5 + cpp/poppler-page.cpp | 118 ++++++++++++++++++++++++++++++++++++++++++--- cpp/poppler-page.h | 78 +++++++++++++++++++++++++++++ cpp/poppler-private.h | 31 +++++++++++ cpp/tests/poppler-dump.cpp | 20 +++++-- poppler/TextOutputDev.cc | 4 + poppler/TextOutputDev.h | 1 11 files changed, 338 insertions(+), 61 deletions(-)
New commits: commit 3189332012ca46998f8ffb872e7ed81c630c4c7a Author: suzuki toshiya <mpsuz...@hiroshima-u.ac.jp> Date: Sat May 16 04:54:55 2020 +0000 [cpp] separate the font info in text_box to another struct. * add new API, page::text_list(int opt_flag). The old one taking no argument is kept for ABI compatibility. The opt_flag is a bitmask-multiple of the new enum, page::text_list_option_enum. * text_box.m_data->text_box_font is an unique pointer to the storage (if text_list() requests the font info), or just a null pointer (if text_list() does not request the font info). * new option "--show-text-list-with-font" showing font info, to tests/poppler-dump.cpp. "--show-text-list" does not load the font info at all. Co-authored-by: Adam Reichold <adam.reich...@t-online.de> Co-authored-by: Albert Astals Cid <aa...@kde.org> diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index f274ca5b..01b0409d 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -299,7 +299,7 @@ static void appendToGooString(void *stream, const char *text, int len) { ustring page::text(const rectf &r, text_layout_enum layout_mode) const { std::unique_ptr<GooString> out(new GooString()); - const bool use_raw_order = (layout_mode == raw_order_layout); + const bool use_raw_order = (layout_mode == raw_order_layout); const bool use_physical_layout = (layout_mode == physical_layout); TextOutputDev td(&appendToGooString, out.get(), use_physical_layout, 0, use_raw_order, false); if (r.is_empty()) { @@ -311,6 +311,11 @@ ustring page::text(const rectf &r, text_layout_enum layout_mode) const return ustring::from_utf8(out->c_str()); } +/* + * text_box_font_info object for text_box + */ +text_box_font_info_data::~text_box_font_info_data() = default; + /* * text_box object for page::text_list() */ @@ -352,30 +357,41 @@ bool text_box::has_space_after() const return m_data->has_space_after; } +bool text_box::has_font_info() const +{ + return (m_data->text_box_font != nullptr); +} + 
text_box::writing_mode_enum text_box::get_wmode(int i) const { - return m_data->wmodes[i]; + if (this->has_font_info()) + return m_data->text_box_font->wmodes[i]; + else + return text_box::invalid_wmode; } double text_box::get_font_size() const { - return m_data->font_size; + if (this->has_font_info()) + return m_data->text_box_font->font_size; + else + return -1; } std::string text_box::get_font_name(int i) const { - int j = m_data->glyph_to_cache_index[i]; + if (!this->has_font_info()) + return std::string("*ignored*"); + + int j = m_data->text_box_font->glyph_to_cache_index[i]; if (j < 0) { return std::string(""); } - return m_data->font_info_cache[j].name(); + return m_data->text_box_font->font_info_cache[j].name(); } - -std::vector<text_box> page::text_list() const +std::vector<text_box> page::text_list(int opt_flag) const { - d->init_font_info_cache(); - std::vector<text_box> output_list; /* config values are same with Qt5 Page::TextList() */ @@ -419,41 +435,55 @@ std::vector<text_box> page::text_list() const word->getRotation(), {}, word->hasSpaceAfter() == true, - {}, - word->getFontSize(), - d->font_info_cache, - {} + nullptr }}; + std::unique_ptr<text_box_font_info_data> tb_font_info = nullptr; + if (opt_flag & page::text_list_include_font) { + d->init_font_info_cache(); + + std::unique_ptr<text_box_font_info_data> tb_font{new text_box_font_info_data{ + word->getFontSize(), // double font_size + {}, // std::vector<text_box::writing_mode> wmodes; + d->font_info_cache, // std::vector<font_info> font_info_cache; + {} // std::vector<int> glyph_to_cache_index; + }}; + + tb_font_info = std::move(tb_font); + }; + tb.m_data->char_bboxes.reserve(word->getLength()); for (int j = 0; j < word->getLength(); j ++) { word->getCharBBox(j, &xMin, &yMin, &xMax, &yMax); tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin); } - tb.m_data->glyph_to_cache_index.reserve(word->getLength()); - for (int j = 0; j < word->getLength(); j++) { - const TextFontInfo* 
cur_text_font_info = word->getFontInfo(j); - - // filter-out the invalid WMode value here. - switch (cur_text_font_info->getWMode()) { - case 0: - tb.m_data->wmodes.push_back(text_box::horizontal_wmode); - break; - case 1: - tb.m_data->wmodes.push_back(text_box::vertical_wmode); - break; - default: - tb.m_data->wmodes.push_back(text_box::invalid_wmode); - }; - - tb.m_data->glyph_to_cache_index[j] = -1; - for (size_t k = 0; k < d->font_info_cache.size(); k++) { - if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) { - tb.m_data->glyph_to_cache_index[j] = k; + if (tb_font_info && d->font_info_cache_initialized) { + tb_font_info->glyph_to_cache_index.reserve(word->getLength()); + for (int j = 0; j < word->getLength(); j++) { + const TextFontInfo* cur_text_font_info = word->getFontInfo(j); + + // filter-out the invalid WMode value here. + switch (cur_text_font_info->getWMode()) { + case 0: + tb_font_info->wmodes.push_back(text_box::horizontal_wmode); break; + case 1: + tb_font_info->wmodes.push_back(text_box::vertical_wmode); + break; + default: + tb_font_info->wmodes.push_back(text_box::invalid_wmode); + }; + + tb_font_info->glyph_to_cache_index[j] = -1; + for (size_t k = 0; k < tb_font_info->font_info_cache.size(); k++) { + if (cur_text_font_info->matches(&(tb_font_info->font_info_cache[k].d->ref))) { + tb_font_info->glyph_to_cache_index[j] = k; + break; + } } } + tb.m_data->text_box_font = std::move(tb_font_info); } output_list.push_back(std::move(tb)); @@ -462,3 +492,8 @@ std::vector<text_box> page::text_list() const return output_list; } + +std::vector<text_box> page::text_list() const +{ + return text_list(0); +} diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index ca5be2fd..dd6ebf2c 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -66,6 +66,12 @@ public: rectf char_bbox(size_t i) const; bool has_space_after() const; + + /** + \since 0.89 + */ + bool has_font_info() const; + /** Get a writing mode for the i-th glyph @@ -186,6 
+192,22 @@ public: */ std::vector<text_box> text_list() const; + /* + * text_list_option_enum is a bitmask-style flags for text_list(), + * 0 means the default & simplest behaviour. + */ + enum text_list_option_enum { + text_list_include_font = 1 // \since 0.89 + }; + + /** + Extended version of text_list() taking an option flag. + The option flag should be the multiple of text_list_option_enum. + + \since 0.89 + */ + std::vector<text_box> text_list(int opt_flag) const; + private: page(document_private *doc, int index); diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h index 83e46319..4ec159a8 100644 --- a/cpp/poppler-private.h +++ b/cpp/poppler-private.h @@ -73,23 +73,17 @@ void delete_all(const Collection &c) } class font_info; -struct text_box_data +struct text_box_font_info_data { - ~text_box_data(); - - ustring text; - rectf bbox; - int rotation; - std::vector<rectf> char_bboxes; - bool has_space_after; + ~text_box_font_info_data(); - std::vector<text_box::writing_mode_enum> wmodes; double font_size; + std::vector<text_box::writing_mode_enum> wmodes; /* * a duplication of the font_info_cache created by the * poppler::font_iterator and owned by the poppler::page - * object. Its lifetime might differ from that of text_box + * object. Its lifetime might differ from that of text_box * object (think about collecting all text_box objects * from all pages), so we have to duplicate it into all * text_box instances. @@ -97,7 +91,7 @@ struct text_box_data std::vector<font_info> font_info_cache; /* - * a std::vector from the glyph index in the current + * a std::vector from the glyph index in the owner * text_box to the font_info index in font_info_cache. * The "-1" means no corresponding fonts found in the * cache. 
@@ -105,6 +99,20 @@ struct text_box_data std::vector<int> glyph_to_cache_index; }; +class font_info; +struct text_box_data +{ + ~text_box_data(); + + ustring text; + rectf bbox; + int rotation; + std::vector<rectf> char_bboxes; + bool has_space_after; + + std::unique_ptr<text_box_font_info_data> text_box_font; +}; + } #endif diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index 7864979e..ef391d78 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -60,6 +60,7 @@ bool show_help = false; bool show_version = false; char show_text[32]; bool show_text_list = false; +bool show_text_list_with_font = false; poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout; static const ArgDesc the_args[] = { @@ -85,6 +86,8 @@ static const ArgDesc the_args[] = { "show text (physical|raw|none) extracted from all pages" }, { "--show-text-list", argFlag, &show_text_list, 0, "show text list (experimental)" }, + { "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0, + "show text list with font info (experimental)" }, { "-h", argFlag, &show_help, 0, "print usage information" }, { "--help", argFlag, &show_help, 0, @@ -417,14 +420,14 @@ static void print_page_text(poppler::page *p) std::cout << std::endl; } -static void print_page_text_list(poppler::page *p) +static void print_page_text_list(poppler::page *p, int opt_flag = 0) { if (!p) { std::cout << std::setw(out_width) << "Broken Page. 
Could not be parsed" << std::endl; std::cout << std::endl; return; } - auto text_list = p->text_list(); + auto text_list = p->text_list(opt_flag); std::cout << "---" << std::endl; for (const poppler::text_box &text : text_list) { @@ -435,9 +438,9 @@ static void print_page_text_list(poppler::page *p) std::string font_name = text.get_font_name(); std::cout << "[" << ustr << "] @ "; std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )"; - std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )"; + if (text.has_font_info()) + std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )"; std::cout << std::endl; - } std::cout << "---" << std::endl; } @@ -538,12 +541,15 @@ int main(int argc, char *argv[]) print_page_text(p.get()); } } - if (show_text_list) { + if (show_text_list || show_text_list_with_font) { const int pages = doc->pages(); for (int i = 0; i < pages; ++i) { std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl; std::unique_ptr<poppler::page> p(doc->create_page(i)); - print_page_text_list(p.get()); + if (show_text_list_with_font) + print_page_text_list(p.get(), poppler::page::text_list_include_font); + else + print_page_text_list(p.get(), 0); } } commit 437553ecb26948f77c3dbf7ad29bca86ffff7f6e Author: Albert Astals Cid <aa...@kde.org> Date: Fri May 15 12:57:32 2020 +0000 [cpp] change page_private::init_font_info_cache() to a void method. We already have a boolean font_info_cache_initialized, no need to guess the initialization result by the size of initialized cache. 
diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h index d4954e9d..442f8bb1 100644 --- a/cpp/poppler-page-private.h +++ b/cpp/poppler-page-private.h @@ -50,7 +50,7 @@ public: std::vector<font_info> font_info_cache; bool font_info_cache_initialized; - size_t init_font_info_cache(); + void init_font_info_cache(); }; } diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index e44ef26e..f274ca5b 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -57,10 +57,10 @@ page_private::~page_private() delete transition; } -size_t page_private::init_font_info_cache() +void page_private::init_font_info_cache() { if (font_info_cache_initialized) - return font_info_cache.size(); + return; poppler::font_iterator it(index, doc); @@ -69,7 +69,7 @@ size_t page_private::init_font_info_cache() } font_info_cache_initialized = true; - return font_info_cache.size(); + return; } /** commit 57de32198a4406eae18b80eed42e6050e2b48cca Author: Albert Astals Cid <aa...@kde.org> Date: Fri May 15 12:23:50 2020 +0000 [cpp] in poppler-page.h, add "since 0.89" comment to 3 new methods. diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 50ccdb06..ca5be2fd 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -66,10 +66,6 @@ public: rectf char_bbox(size_t i) const; bool has_space_after() const; - /** - \since 0.8x - */ - /** Get a writing mode for the i-th glyph @@ -85,6 +81,10 @@ public: horizontal_wmode = 0, vertical_wmode = 1 }; + + /** + \since 0.89 + */ writing_mode_enum get_wmode(int i = 0) const; /** @@ -93,6 +93,10 @@ public: This method return a double floating value of the font size from the text_box instance. */ + + /** + \since 0.89 + */ double get_font_size() const; /** @@ -111,6 +115,10 @@ public: Latin1 or UTF-8. Some legacy PDF producers used in CJK market use GBK, Big5, Wansung or Shift-JIS. 
*/ + + /** + \since 0.89 + */ std::string get_font_name(int i = 0) const; private: commit 507027de297f43146f5bbebe8d098dededffc577 Author: suzuki toshiya <mpsuz...@hiroshima-u.ac.jp> Date: Tue May 5 10:11:49 2020 +0000 [cpp] introduce a boolean font_info_cache_initialized, to distinguish an initialized-but-empty cache from the uninitialized cache Co-authored-by: Adam Reichold <adam.reich...@t-online.de> diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h index 3e2ee914..d4954e9d 100644 --- a/cpp/poppler-page-private.h +++ b/cpp/poppler-page-private.h @@ -49,6 +49,7 @@ public: { return const_cast<poppler::page *>(p)->d; } std::vector<font_info> font_info_cache; + bool font_info_cache_initialized; size_t init_font_info_cache(); }; diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index b0bf847e..e44ef26e 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -48,6 +48,7 @@ page_private::page_private(document_private *_doc, int _index) , page(doc->doc->getCatalog()->getPage(_index + 1)) , index(_index) , transition(nullptr) + , font_info_cache_initialized(false) { } @@ -58,7 +59,7 @@ page_private::~page_private() size_t page_private::init_font_info_cache() { - if (font_info_cache.size() > 0) + if (font_info_cache_initialized) return font_info_cache.size(); poppler::font_iterator it(index, doc); @@ -67,6 +68,7 @@ size_t page_private::init_font_info_cache() font_info_cache = it.next(); } + font_info_cache_initialized = true; return font_info_cache.size(); } commit 2cd79c7382888559d5d8dcc56a84572ac8a77086 Author: Adam Reichold <adam.reich...@t-online.de> Date: Tue May 5 01:22:29 2020 +0000 [cpp] construct a font_iterator instance in the local storage of page_private::init_font_info_cache() method, instead of the heap diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 3aa6222b..b0bf847e 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -61,14 +61,12 @@ size_t page_private::init_font_info_cache() if 
(font_info_cache.size() > 0) return font_info_cache.size(); - poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc); + poppler::font_iterator it(index, doc); - if (font_iterator->has_next()) { - font_info_cache = font_iterator->next(); + if (it.has_next()) { + font_info_cache = it.next(); } - delete font_iterator; - return font_info_cache.size(); } commit 7279b4eb397667cd4553f5852286b3f3d73a1a83 Author: Adam Reichold <adam.reich...@t-online.de> Date: Mon May 4 11:51:55 2020 +0000 [cpp] remove the wrong warning note about the std::string object returned by text_box::get_font_name() diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 9db6f87b..50ccdb06 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -110,10 +110,6 @@ public: encoding of the font name is one of the ASCII, Latin1 or UTF-8. Some legacy PDF producers used in CJK market use GBK, Big5, Wansung or Shift-JIS. - - \warning The returned std::string is owned by the - text_box instance, it should not be used in the - other objects or should not be destroyed directly. 
*/ std::string get_font_name(int i = 0) const; commit af3805f0b60289c7f522da29f9375119a1cd778a Author: Albert Astals Cid <aa...@kde.org> Date: Mon May 4 04:32:27 2020 +0000 [cpp] new enum poppler::text_box::writing_mode_enum diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 715c5ec4..3aa6222b 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -352,7 +352,7 @@ bool text_box::has_space_after() const return m_data->has_space_after; } -int text_box::get_wmode(int i) const +text_box::writing_mode_enum text_box::get_wmode(int i) const { return m_data->wmodes[i]; } @@ -434,7 +434,18 @@ std::vector<text_box> page::text_list() const tb.m_data->glyph_to_cache_index.reserve(word->getLength()); for (int j = 0; j < word->getLength(); j++) { const TextFontInfo* cur_text_font_info = word->getFontInfo(j); - tb.m_data->wmodes.push_back(cur_text_font_info->getWMode()); + + // filter-out the invalid WMode value here. + switch (cur_text_font_info->getWMode()) { + case 0: + tb.m_data->wmodes.push_back(text_box::horizontal_wmode); + break; + case 1: + tb.m_data->wmodes.push_back(text_box::vertical_wmode); + break; + default: + tb.m_data->wmodes.push_back(text_box::invalid_wmode); + }; tb.m_data->glyph_to_cache_index[j] = -1; for (size_t k = 0; k < d->font_info_cache.size(); k++) { diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 6f9e755d..9db6f87b 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -73,17 +73,19 @@ public: /** Get a writing mode for the i-th glyph - This method returns an integer of the writing mode + This method returns an enum of the writing mode for the i-th glyph in the text_box. - 0 means the horizontal writing mode. - 1 means the vertical writing mode. - \note Usually all glyphs in one text_box have the same writing mode. Thus the default value of the glyph index is 0. 
*/ - int get_wmode(int i = 0) const; + enum writing_mode_enum { + invalid_wmode = -1, + horizontal_wmode = 0, + vertical_wmode = 1 + }; + writing_mode_enum get_wmode(int i = 0) const; /** Get a font size of this text_box instance. diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h index 0fe33d3f..83e46319 100644 --- a/cpp/poppler-private.h +++ b/cpp/poppler-private.h @@ -28,6 +28,7 @@ #include "poppler-global.h" #include "poppler-rectangle.h" +#include "poppler-page.h" // to use text_box::writing_mode_enum #include "Error.h" #include "CharTypes.h" @@ -82,7 +83,7 @@ struct text_box_data std::vector<rectf> char_bboxes; bool has_space_after; - std::vector<int> wmodes; + std::vector<text_box::writing_mode_enum> wmodes; double font_size; /* commit 65053f43dbb83b66302bddda27732168fc74cca1 Author: Albert Astals Cid <aa...@kde.org> Date: Sun May 3 16:21:38 2020 +0000 [TextOutputDev] simplify TextFontInfo::matches(const Ref *ref) diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index 84af8af2..993a37da 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -342,7 +342,7 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const { } bool TextFontInfo::matches(const Ref *ref) const { - return (gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen); + return (*(gfxFont->getID()) == *ref); } double TextFontInfo::getAscent() const { commit 4ea2e879d4e0e9a5d899adb82bbdaab9e505532c Author: Albert Astals Cid <aa...@kde.org> Date: Sun May 3 16:17:11 2020 +0000 [cpp] simplify the initialization of poppler::font_info_private.ref and .emb_ref diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h index b24cbaf0..aa26e2f9 100644 --- a/cpp/poppler-font-private.h +++ b/cpp/poppler-font-private.h @@ -48,10 +48,8 @@ public: font_file = fi->getFile()->c_str(); } - ref.num = fi->getRef().num; - ref.gen = fi->getRef().gen; - emb_ref.num = fi->getEmbRef().num; - emb_ref.gen = fi->getEmbRef().gen; + ref = 
fi->getRef(); + emb_ref = fi->getEmbRef(); } std::string font_name; commit 60400514324d6e5d0a1c50ce4af84320d350e967 Author: suzuki toshiya <mpsuz...@hiroshima-u.ac.jp> Date: Fri May 1 08:04:14 2020 +0000 [cpp] Add the font infos to the text_box object. diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 627920ff..32b3ef88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -38,6 +38,7 @@ install(FILES poppler-document.h poppler-embedded-file.h poppler-font.h + poppler-font-private.h poppler-global.h poppler-image.h poppler-page.h diff --git a/cpp/poppler-font-private.h b/cpp/poppler-font-private.h new file mode 100644 index 00000000..b24cbaf0 --- /dev/null +++ b/cpp/poppler-font-private.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2009, Pino Toscano <p...@kde.org> + * Copyright (C) 2015, Tamas Szekeres <szeker...@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include "poppler-font.h" + +#include "poppler-document-private.h" + +#include "FontInfo.h" + +#include <algorithm> + +using namespace poppler; + +class poppler::font_info_private +{ +public: + font_info_private() + : type(font_info::unknown) + , is_embedded(false) + , is_subset(false) + { + } + font_info_private(FontInfo *fi) + : type((font_info::type_enum)fi->getType()) + , is_embedded(fi->getEmbedded()) + , is_subset(fi->getSubset()) + { + if (fi->getName()) { + font_name = fi->getName()->c_str(); + } + if (fi->getFile()) { + font_file = fi->getFile()->c_str(); + } + + ref.num = fi->getRef().num; + ref.gen = fi->getRef().gen; + emb_ref.num = fi->getEmbRef().num; + emb_ref.gen = fi->getEmbRef().gen; + } + + std::string font_name; + std::string font_file; + font_info::type_enum type : 5; + bool is_embedded : 1; + bool is_subset : 1; + + Ref ref; + Ref emb_ref; +}; + + +class poppler::font_iterator_private +{ +public: + font_iterator_private(int start_page, document_private *dd) + : font_info_scanner(dd->doc, start_page) + , total_pages(dd->doc->getNumPages()) + , current_page((std::max)(start_page, 0)) + { + } + ~font_iterator_private() + { + } + + FontInfoScanner font_info_scanner; + int total_pages; + int current_page; +}; diff --git a/cpp/poppler-font.cpp b/cpp/poppler-font.cpp index 6d833c19..e8a4076f 100644 --- a/cpp/poppler-font.cpp +++ b/cpp/poppler-font.cpp @@ -24,6 +24,8 @@ */ #include "poppler-font.h" +#include "poppler-font-private.h" + #include "poppler-document-private.h" #include "FontInfo.h" @@ -32,54 +34,6 @@ using namespace poppler; -class poppler::font_info_private -{ -public: - font_info_private() - : type(font_info::unknown) - , is_embedded(false) - , is_subset(false) - { - } - font_info_private(FontInfo *fi) - : type((font_info::type_enum)fi->getType()) - , is_embedded(fi->getEmbedded()) - , is_subset(fi->getSubset()) - { - if (fi->getName()) { - font_name = fi->getName()->c_str(); - } - if (fi->getFile()) { - font_file = 
fi->getFile()->c_str(); - } - } - - std::string font_name; - std::string font_file; - font_info::type_enum type : 5; - bool is_embedded : 1; - bool is_subset : 1; -}; - - -class poppler::font_iterator_private -{ -public: - font_iterator_private(int start_page, document_private *dd) - : font_info_scanner(dd->doc, start_page) - , total_pages(dd->doc->getNumPages()) - , current_page((std::max)(start_page, 0)) - { - } - ~font_iterator_private() - { - } - - FontInfoScanner font_info_scanner; - int total_pages; - int current_page; -}; - /** \class poppler::font_info poppler-font.h "poppler/cpp/poppler-font.h" @@ -208,7 +162,7 @@ font_iterator::~font_iterator() } /** - Returns the fonts of the current page and advances to the next one. + \returns the fonts of the current page and advances to the next one. */ std::vector<font_info> font_iterator::next() { @@ -218,6 +172,10 @@ std::vector<font_info> font_iterator::next() ++d->current_page; + /* FontInfoScanner::scan() receives a number how many pages to + * be scanned from the *current page*, not from the beginning. + * We restrict the font scanning to the current page only. 
+ */ const std::vector<FontInfo*> items = d->font_info_scanner.scan(1); std::vector<font_info> fonts; fonts.reserve(items.size()); diff --git a/cpp/poppler-font.h b/cpp/poppler-font.h index 854b7a40..27667e78 100644 --- a/cpp/poppler-font.h +++ b/cpp/poppler-font.h @@ -67,6 +67,7 @@ private: font_info_private *d; friend class font_iterator; + friend class page; }; @@ -84,6 +85,8 @@ private: font_iterator_private *d; friend class document; + friend class page; + friend class page_private; }; } diff --git a/cpp/poppler-page-private.h b/cpp/poppler-page-private.h index e0c3446d..3e2ee914 100644 --- a/cpp/poppler-page-private.h +++ b/cpp/poppler-page-private.h @@ -29,6 +29,7 @@ namespace poppler class document_private; class page_transition; +class font_info; class page_private { @@ -46,6 +47,9 @@ public: static inline page_private* get(const poppler::page *p) { return const_cast<poppler::page *>(p)->d; } + + std::vector<font_info> font_info_cache; + size_t init_font_info_cache(); }; } diff --git a/cpp/poppler-page.cpp b/cpp/poppler-page.cpp index 7150cd78..715c5ec4 100644 --- a/cpp/poppler-page.cpp +++ b/cpp/poppler-page.cpp @@ -32,6 +32,8 @@ #include "poppler-document-private.h" #include "poppler-page-private.h" #include "poppler-private.h" +#include "poppler-font-private.h" +#include "poppler-font.h" #include "TextOutputDev.h" @@ -54,6 +56,22 @@ page_private::~page_private() delete transition; } +size_t page_private::init_font_info_cache() +{ + if (font_info_cache.size() > 0) + return font_info_cache.size(); + + poppler::font_iterator* font_iterator = new poppler::font_iterator(index, doc); + + if (font_iterator->has_next()) { + font_info_cache = font_iterator->next(); + } + + delete font_iterator; + + return font_info_cache.size(); +} + /** \class poppler::page poppler-page.h "poppler/cpp/poppler-page.h" @@ -334,17 +352,39 @@ bool text_box::has_space_after() const return m_data->has_space_after; } +int text_box::get_wmode(int i) const +{ + return m_data->wmodes[i]; 
+} + +double text_box::get_font_size() const +{ + return m_data->font_size; +} + +std::string text_box::get_font_name(int i) const +{ + int j = m_data->glyph_to_cache_index[i]; + if (j < 0) { + return std::string(""); + } + return m_data->font_info_cache[j].name(); +} + + std::vector<text_box> page::text_list() const { + d->init_font_info_cache(); + std::vector<text_box> output_list; /* config values are same with Qt5 Page::TextList() */ auto output_dev = std::make_unique<TextOutputDev>( - nullptr, /* char* fileName */ - false, /* bool physLayoutA */ + nullptr, /* char* fileName */ + false, /* bool physLayoutA */ 0, /* double fixedPitchA */ - false, /* bool rawOrderA */ - false /* bool append */ + false, /* bool rawOrderA */ + false /* bool append */ ); /* @@ -378,7 +418,11 @@ std::vector<text_box> page::text_list() const {xMin, yMin, xMax-xMin, yMax-yMin}, word->getRotation(), {}, - word->hasSpaceAfter() == true + word->hasSpaceAfter() == true, + {}, + word->getFontSize(), + d->font_info_cache, + {} }}; tb.m_data->char_bboxes.reserve(word->getLength()); @@ -387,6 +431,20 @@ std::vector<text_box> page::text_list() const tb.m_data->char_bboxes.emplace_back(xMin, yMin, xMax-xMin, yMax-yMin); } + tb.m_data->glyph_to_cache_index.reserve(word->getLength()); + for (int j = 0; j < word->getLength(); j++) { + const TextFontInfo* cur_text_font_info = word->getFontInfo(j); + tb.m_data->wmodes.push_back(cur_text_font_info->getWMode()); + + tb.m_data->glyph_to_cache_index[j] = -1; + for (size_t k = 0; k < d->font_info_cache.size(); k++) { + if (cur_text_font_info->matches(&(d->font_info_cache[k].d->ref))) { + tb.m_data->glyph_to_cache_index[j] = k; + break; + } + } + } + output_list.push_back(std::move(tb)); } } diff --git a/cpp/poppler-page.h b/cpp/poppler-page.h index 30ede302..6f9e755d 100644 --- a/cpp/poppler-page.h +++ b/cpp/poppler-page.h @@ -65,6 +65,56 @@ public: */ rectf char_bbox(size_t i) const; bool has_space_after() const; + + /** + \since 0.8x + */ + + /** + Get 
a writing mode for the i-th glyph + + This method returns an integer of the writing mode + for the i-th glyph in the text_box. + + 0 means the horizontal writing mode. + 1 means the vertical writing mode. + + \note Usually all glyphs in one text_box have the + same writing mode. Thus the default value of the + glyph index is 0. + */ + int get_wmode(int i = 0) const; + + /** + Get a font size of this text_box instance. + + This method return a double floating value of the + font size from the text_box instance. + */ + double get_font_size() const; + + /** + Get a font name for the i-th glyph + + This method returns a std::string object holding + the font name for the i-th glyph. + + \note The randomization prefix of the embedded fonts + are not removed. The font names including these + prefixes are insuffucient to determine whether the + two fonts are same or different. + + \note The clients should not assume that the + encoding of the font name is one of the ASCII, + Latin1 or UTF-8. Some legacy PDF producers used + in CJK market use GBK, Big5, Wansung or Shift-JIS. + + \warning The returned std::string is owned by the + text_box instance, it should not be used in the + other objects or should not be destroyed directly. + */ + std::string get_font_name(int i = 0) const; + private: text_box(text_box_data *data); diff --git a/cpp/poppler-private.h b/cpp/poppler-private.h index b9bc9b52..0fe33d3f 100644 --- a/cpp/poppler-private.h +++ b/cpp/poppler-private.h @@ -71,6 +71,7 @@ void delete_all(const Collection &c) delete_all(c.begin(), c.end()); } +class font_info; struct text_box_data { ~text_box_data(); @@ -80,6 +81,27 @@ struct text_box_data int rotation; std::vector<rectf> char_bboxes; bool has_space_after; + + std::vector<int> wmodes; + double font_size; + + /* + * a duplication of the font_info_cache created by the + * poppler::font_iterator and owned by the poppler::page + * object. 
Its lifetime might differ from that of text_box + * object (think about collecting all text_box objects + * from all pages), so we have to duplicate it into all + * text_box instances. + */ + std::vector<font_info> font_info_cache; + + /* + * a std::vector from the glyph index in the current + * text_box to the font_info index in font_info_cache. + * The "-1" means no corresponding fonts found in the + * cache. + */ + std::vector<int> glyph_to_cache_index; }; } diff --git a/cpp/tests/poppler-dump.cpp b/cpp/tests/poppler-dump.cpp index 6196b675..7864979e 100644 --- a/cpp/tests/poppler-dump.cpp +++ b/cpp/tests/poppler-dump.cpp @@ -430,8 +430,12 @@ static void print_page_text_list(poppler::page *p) for (const poppler::text_box &text : text_list) { poppler::rectf bbox = text.bbox(); poppler::ustring ustr = text.text(); + int wmode = text.get_wmode(); + double font_size = text.get_font_size(); + std::string font_name = text.get_font_name(); std::cout << "[" << ustr << "] @ "; std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )"; + std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )"; std::cout << std::endl; } diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc index c7c8f852..84af8af2 100644 --- a/poppler/TextOutputDev.cc +++ b/poppler/TextOutputDev.cc @@ -341,6 +341,10 @@ bool TextFontInfo::matches(const TextFontInfo *fontInfo) const { return gfxFont == fontInfo->gfxFont; } +bool TextFontInfo::matches(const Ref *ref) const { + return (gfxFont->getID()->num == ref->num && gfxFont->getID()->gen == ref->gen); +} + double TextFontInfo::getAscent() const { return gfxFont ? 
gfxFont->getAscent() : 0.95; } diff --git a/poppler/TextOutputDev.h b/poppler/TextOutputDev.h index 0d008b3d..62c95b0f 100644 --- a/poppler/TextOutputDev.h +++ b/poppler/TextOutputDev.h @@ -91,6 +91,7 @@ public: bool matches(const GfxState *state) const; bool matches(const TextFontInfo *fontInfo) const; + bool matches(const Ref *ref) const; // Get the font ascent, or a default value if the font is not set double getAscent() const; _______________________________________________ poppler mailing list poppler@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/poppler