Kelson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/296911 )
Change subject: Port zimwriterfs to the new API. ...................................................................... Port zimwriterfs to the new API. No more ArticleSource::getData. Change-Id: I76cd6f3e7e4a390ed6a58cf9815dda2a2f1bfde5 --- M zimwriterfs/Makefile.am M zimwriterfs/article.cpp M zimwriterfs/article.h M zimwriterfs/articlesource.cpp M zimwriterfs/articlesource.h A zimwriterfs/mimetypecounter.cpp A zimwriterfs/mimetypecounter.h M zimwriterfs/zimwriterfs.cpp 8 files changed, 376 insertions(+), 248 deletions(-) Approvals: Kelson: Verified; Looks good to me, approved diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am index 6e46553..92641d9 100644 --- a/zimwriterfs/Makefile.am +++ b/zimwriterfs/Makefile.am @@ -5,4 +5,5 @@ zimwriterfs.cpp \ tools.cpp \ article.cpp \ - articlesource.cpp + articlesource.cpp \ + mimetypecounter.cpp diff --git a/zimwriterfs/article.cpp b/zimwriterfs/article.cpp index 98ec882..3840f7b 100644 --- a/zimwriterfs/article.cpp +++ b/zimwriterfs/article.cpp @@ -21,11 +21,61 @@ #include "article.h" #include "tools.h" +#include <iomanip> +#include <sstream> + extern std::string directoryPath; -Article::Article(ArticleSource* source, const std::string& path, const bool detectRedirects): - source(source) +std::string Article::getAid() const +{ + return aid; +} + +bool Article::isInvalid() const +{ + return invalid; +} + +char Article::getNamespace() const +{ + return ns; +} + +std::string Article::getUrl() const +{ + return url; +} + +std::string Article::getTitle() const +{ + return title; +} + +bool Article::isRedirect() const +{ + return !redirectAid.empty(); +} + +std::string Article::getMimeType() const +{ + return mimeType; +} + +std::string Article::getRedirectAid() const +{ + return redirectAid; +} + +bool Article::shouldCompress() const { + return (getMimeType().find("text") == 0 || + getMimeType() == "application/javascript" || + getMimeType() == "application/json" || + getMimeType() == "image/svg+xml" ? true : false); +} + +FileArticle::FileArticle(const std::string& path, const bool detectRedirects): + dataRead(false) { invalid = false; @@ -109,57 +159,125 @@ } } + /* Update links in the html to let them still be valid */ + std::map<std::string, bool> links; + getLinks(root, links); + std::map<std::string, bool>::iterator it; + + /* If a link appearch to be duplicated in the HTML, it will + occurs only one time in the links variable */ + for(it = links.begin(); it != links.end(); it++) { + if (!it->first.empty() + && it->first[0] != '#' + && it->first[0] != '?' + && it->first.substr(0, 5) != "data:") { + replaceStringInPlace(html, "\"" + it->first + "\"", "\"" + computeNewUrl(aid, it->first) + "\""); + } + } + + data = html; + dataRead = true; + gumbo_destroy_output(&kGumboDefaultOptions, output); } } -std::string Article::getAid() const +zim::Blob FileArticle::getData() const { + if ( dataRead ) + return zim::Blob(data.data(), data.size());; + + std::string aidPath = directoryPath + "/" + aid; + std::string fileContent = getFileContent(aidPath); + + if (getMimeType().find("text/css") == 0) { + /* Rewrite url() values in the CSS */ + size_t startPos = 0; + size_t endPos = 0; + std::string url; + + while ((startPos = fileContent.find("url(", endPos)) && startPos != std::string::npos) { + /* URL delimiters */ + endPos = fileContent.find(")", startPos); + startPos = startPos + (fileContent[startPos+4] == '\'' || fileContent[startPos+4] == '"' ? 5 : 4); + endPos = endPos - (fileContent[endPos-1] == '\'' || fileContent[endPos-1] == '"' ? 1 : 0); + url = fileContent.substr(startPos, endPos - startPos); + std::string startDelimiter = fileContent.substr(startPos-1, 1); + std::string endDelimiter = fileContent.substr(endPos, 1); + + if (url.substr(0, 5) != "data:") { + /* Deal with URL with arguments (using '? ') */ + std::string path = url; + size_t markPos = url.find("?"); + if (markPos != std::string::npos) { + path = url.substr(0, markPos); + } + + /* Embeded fonts need to be inline because Kiwix is + otherwise not able to load same because of the + same-origin security */ + std::string mimeType = getMimeTypeForFile(path); + if ( mimeType == "application/font-ttf" + || mimeType == "application/font-woff" + || mimeType == "application/vnd.ms-opentype" + || mimeType == "application/vnd.ms-fontobject") { + try { + std::string fontContent = getFileContent(directoryPath + + "/" + + computeAbsolutePath(aid, path)); + replaceStringInPlaceOnce(fileContent, + startDelimiter + url + endDelimiter, + startDelimiter + "data:" + mimeType + ";base64," + + base64_encode(reinterpret_cast<const unsigned char*>(fontContent.c_str()), + fontContent.length()) + endDelimiter); + } catch (...) {} + } else { + /* Deal with URL with arguments (using '? ') */ + if (markPos != std::string::npos) { + endDelimiter = url.substr(markPos, 1); + } + + replaceStringInPlaceOnce(fileContent, + startDelimiter + url + endDelimiter, + startDelimiter + computeNewUrl(aid, path) + endDelimiter); + } + } + } + } + + data = fileContent; + dataRead = true; + return zim::Blob(data.data(), data.size()); +} + + +MetadataArticle::MetadataArticle(const std::string &id) { + aid = "/M/"+id; + mimeType = "text/plain"; + ns = 'M'; + url = id; +} + +SimpleMetadataArticle::SimpleMetadataArticle(const std::string &id, const std::string &value): + MetadataArticle(id), + value(value) +{} + +MetadataDateArticle::MetadataDateArticle(): + MetadataArticle("Date") +{} + +zim::Blob MetadataDateArticle::getData() const { - return aid; + if ( data.size() == 0 ) + { + time_t t = time(0); + struct tm * now = localtime( & t ); + std::stringstream stream; + stream << (now->tm_year + 1900) << '-' + << std::setw(2) << std::setfill('0') << (now->tm_mon + 1) << '-' + << std::setw(2) << std::setfill('0') << now->tm_mday; + data = stream.str(); + } + return zim::Blob(data.data(), data.size()); } -bool Article::isInvalid() const -{ - return invalid; -} - -char Article::getNamespace() const -{ - return ns; -} - -std::string Article::getUrl() const -{ - return url; -} - -std::string Article::getTitle() const -{ - return title; -} - -bool Article::isRedirect() const -{ - return !redirectAid.empty(); -} - -std::string Article::getMimeType() const -{ - return mimeType; -} - -std::string Article::getRedirectAid() const -{ - return redirectAid; -} - -bool Article::shouldCompress() const { - return (getMimeType().find("text") == 0 || - getMimeType() == "application/javascript" || - getMimeType() == "application/json" || - getMimeType() == "image/svg+xml" ? true : false); -} - -zim::Blob Article::getData() { - return source->getData(getAid()); -} \ No newline at end of file diff --git a/zimwriterfs/article.h b/zimwriterfs/article.h index 92f7ac4..49f7d25 100644 --- a/zimwriterfs/article.h +++ b/zimwriterfs/article.h @@ -36,13 +36,8 @@ std::string title; std::string mimeType; std::string redirectAid; - mutable ArticleSource* source; public: - Article() { - invalid = false; - } - explicit Article(ArticleSource* source, const std::string& id, const bool detectRedirects = true); virtual std::string getAid() const; virtual char getNamespace() const; virtual std::string getUrl() const; @@ -52,26 +47,53 @@ virtual std::string getMimeType() const; virtual std::string getRedirectAid() const; virtual bool shouldCompress() const; - virtual zim::Blob getData(); }; class MetadataArticle : public Article { public: - MetadataArticle(std::string &id) { - if (id == "Favicon") { - aid = "/-/" + id; - mimeType="image/png"; - redirectAid = favicon; + explicit MetadataArticle(const std::string &id); +}; + +class SimpleMetadataArticle : public MetadataArticle { + private: + std::string value; + public: + explicit SimpleMetadataArticle(const std::string &id, const std::string& value); + virtual zim::Blob getData() const { return zim::Blob(value.c_str(), value.size()); } +}; + +class MetadataFaviconArticle : public Article { + public: + explicit MetadataFaviconArticle(std::string value) { + aid = "/-/Favicon"; + mimeType = "image/png"; + redirectAid = value; ns = '-'; url = "favicon"; - } else { - aid = "/M/" + id; - mimeType="text/plain"; - ns = 'M'; - url = id; } - } + virtual zim::Blob getData() const { return zim::Blob(); } }; + +class MetadataDateArticle : public MetadataArticle +{ + private: + mutable std::string data; + public: + MetadataDateArticle(); + virtual zim::Blob getData() const; +}; + + +class FileArticle : public Article { + private: + mutable std::string data; + mutable bool dataRead; + + public: + explicit FileArticle(const std::string& id, const bool detectRedirects = true); + virtual zim::Blob getData() const; +}; + class RedirectArticle : public Article { public: @@ -88,6 +110,7 @@ aid = "/" + line.substr(0, 1) + "/" + url; mimeType = "text/plain"; } + virtual zim::Blob getData() const { return zim::Blob(); } }; #endif // OPENZIM_ZIMWRITERFS_ARTICLE_H diff --git a/zimwriterfs/articlesource.cpp b/zimwriterfs/articlesource.cpp index d2b7156..06a773d 100644 --- a/zimwriterfs/articlesource.cpp +++ b/zimwriterfs/articlesource.cpp @@ -24,37 +24,17 @@ #include <zim/blob.h> -#include <iomanip> -#include <sstream> #include <map> bool isVerbose(); extern std::string welcome; -extern std::string language; -extern std::string creator; -extern std::string publisher; -extern std::string title; -extern std::string description; -extern std::string directoryPath; - -std::map<std::string, unsigned int> counters; -char *data = NULL; -unsigned int dataSize = 0; ArticleSource::ArticleSource(Queue<std::string>& filenameQueue): - filenameQueue(filenameQueue) + filenameQueue(filenameQueue), + loopOverHandlerStarted(false) { - /* Prepare metadata */ - metadataQueue.push("Language"); - metadataQueue.push("Publisher"); - metadataQueue.push("Creator"); - metadataQueue.push("Title"); - metadataQueue.push("Description"); - metadataQueue.push("Date"); - metadataQueue.push("Favicon"); - metadataQueue.push("Counter"); } void ArticleSource::init_redirectsQueue_from_file(const std::string& path){ @@ -77,180 +57,56 @@ std::string path; if (article != NULL) { - delete(article); + delete article; } if (!metadataQueue.empty()) { - path = metadataQueue.front(); + article = metadataQueue.front(); metadataQueue.pop(); - article = new MetadataArticle(this, path); } else if (!redirectsQueue.empty()) { std::string line = redirectsQueue.front(); redirectsQueue.pop(); - article = new RedirectArticle(this, line); + article = new RedirectArticle(line); } else if (filenameQueue.popFromQueue(path)) { - do { - article = new Article(this, path); - } while (article && article->isInvalid() && filenameQueue.popFromQueue(path)); + article = new FileArticle(path); + while (article && article->isInvalid() && filenameQueue.popFromQueue(path)) { + delete article; + article = new FileArticle(path); + }; } else { article = NULL; + if ( !loopOverHandlerStarted ) + { + currentLoopHandler = articleHandlers.begin(); + loopOverHandlerStarted = true; + } else { + currentLoopHandler++; + } + if ( currentLoopHandler != articleHandlers.end() ) + { + article = (*currentLoopHandler)->getMetaArticle(); + } } - /* Count mimetypes */ - if (article != NULL && !article->isRedirect()) { - - if (isVerbose()) - std::cout << "Creating entry for " << article->getAid() << std::endl; - - std::string mimeType = article->getMimeType(); - if (counters.find(mimeType) == counters.end()) { - counters[mimeType] = 1; - } else { - counters[mimeType]++; + if (article != NULL) + { + for (std::vector<IHandler*>::iterator it = articleHandlers.begin(); + it != articleHandlers.end(); + ++it) + { + (*it)->handleArticle(article); } } return article; } -zim::Blob ArticleSource::getData(const std::string& aid) { - - if (isVerbose()) - std::cout << "Packing data for " << aid << std::endl; - - if (data != NULL) { - delete(data); - data = NULL; - } - - if (aid.substr(0, 3) == "/M/") { - std::string value; - - if ( aid == "/M/Language") { - value = language; - } else if (aid == "/M/Creator") { - value = creator; - } else if (aid == "/M/Publisher") { - value = publisher; - } else if (aid == "/M/Title") { - value = title; - } else if (aid == "/M/Description") { - value = description; - } else if ( aid == "/M/Date") { - time_t t = time(0); - struct tm * now = localtime( & t ); - std::stringstream stream; - stream << (now->tm_year + 1900) << '-' - << std::setw(2) << std::setfill('0') << (now->tm_mon + 1) << '-' - << std::setw(2) << std::setfill('0') << now->tm_mday; - value = stream.str(); - } else if ( aid == "/M/Counter") { - std::stringstream stream; - for (std::map<std::string, unsigned int>::iterator it = counters.begin(); it != counters.end(); ++it) { - stream << it->first << "=" << it->second << ";"; - } - value = stream.str(); - } - - dataSize = value.length(); - data = new char[dataSize]; - memcpy(data, value.c_str(), dataSize); - } else { - std::string aidPath = directoryPath + "/" + aid; - - if (getMimeTypeForFile(aid).find("text/html") == 0) { - std::string html = getFileContent(aidPath); - - /* Rewrite links (src|href|...) attributes */ - GumboOutput* output = gumbo_parse(html.c_str()); - GumboNode* root = output->root; - - std::map<std::string, bool> links; - getLinks(root, links); - std::map<std::string, bool>::iterator it; - std::string aidDirectory = removeLastPathElement(aid, false, false); - - /* If a link appearch to be duplicated in the HTML, it will - occurs only one time in the links variable */ - for(it = links.begin(); it != links.end(); it++) { - if (!it->first.empty() && it->first[0] != '#' && it->first[0] != '?' && it->first.substr(0, 5) != "data:") { - replaceStringInPlace(html, "\"" + it->first + "\"", "\"" + computeNewUrl(aid, it->first) + "\""); - } - } - gumbo_destroy_output(&kGumboDefaultOptions, output); - - dataSize = html.length(); - data = new char[dataSize]; - memcpy(data, html.c_str(), dataSize); - } else if (getMimeTypeForFile(aid).find("text/css") == 0) { - std::string css = getFileContent(aidPath); - - /* Rewrite url() values in the CSS */ - size_t startPos = 0; - size_t endPos = 0; - std::string url; - - while ((startPos = css.find("url(", endPos)) && startPos != std::string::npos) { - - /* URL delimiters */ - endPos = css.find(")", startPos); - startPos = startPos + (css[startPos+4] == '\'' || css[startPos+4] == '"' ? 5 : 4); - endPos = endPos - (css[endPos-1] == '\'' || css[endPos-1] == '"' ? 1 : 0); - url = css.substr(startPos, endPos - startPos); - std::string startDelimiter = css.substr(startPos-1, 1); - std::string endDelimiter = css.substr(endPos, 1); - - if (url.substr(0, 5) != "data:") { - /* Deal with URL with arguments (using '? ') */ - std::string path = url; - size_t markPos = url.find("?"); - if (markPos != std::string::npos) { - path = url.substr(0, markPos); - } - - /* Embeded fonts need to be inline because Kiwix is - otherwise not able to load same because of the - same-origin security */ - std::string mimeType = getMimeTypeForFile(path); - if (mimeType == "application/font-ttf" || - mimeType == "application/font-woff" || - mimeType == "application/vnd.ms-opentype" || - mimeType == "application/vnd.ms-fontobject") { - - try { - std::string fontContent = getFileContent(directoryPath + "/" + computeAbsolutePath(aid, path)); - replaceStringInPlaceOnce(css, - startDelimiter + url + endDelimiter, - startDelimiter + "data:" + mimeType + ";base64," + - base64_encode(reinterpret_cast<const unsigned char*>(fontContent.c_str()), fontContent.length()) + - endDelimiter - ); - } catch (...) { - } - } else { - - /* Deal with URL with arguments (using '? ') */ - if (markPos != std::string::npos) { - endDelimiter = url.substr(markPos, 1); - } - - replaceStringInPlaceOnce(css, - startDelimiter + url + endDelimiter, - startDelimiter + computeNewUrl(aid, path) + endDelimiter); - } - } - } - - dataSize = css.length(); - data = new char[dataSize]; - memcpy(data, css.c_str(), dataSize); - } else { - dataSize = getFileSize(aidPath); - data = new char[dataSize]; - memcpy(data, getFileContent(aidPath).c_str(), dataSize); - } - } - - return zim::Blob(data, dataSize); +void ArticleSource::add_customHandler(IHandler* handler) +{ + articleHandlers.push_back(handler); } +void ArticleSource::add_metadataArticle(Article* article) +{ + metadataQueue.push(article); +} diff --git a/zimwriterfs/articlesource.h b/zimwriterfs/articlesource.h index 1ad6524..c019893 100644 --- a/zimwriterfs/articlesource.h +++ b/zimwriterfs/articlesource.h @@ -27,20 +27,34 @@ #include "queue.h" #include <zim/writer/zimcreator.h> +#include <zim/blob.h> + +class Article; + +class IHandler +{ + public: + virtual void handleArticle(Article* article) = 0; + virtual Article* getMetaArticle() = 0; +}; class ArticleSource : public zim::writer::ArticleSource { public: explicit ArticleSource(Queue<std::string>& filenameQueue); + void add_metadataArticle(Article* article); virtual const zim::writer::Article* getNextArticle(); - virtual zim::Blob getData(const std::string& aid); virtual std::string getMainPage(); + virtual void add_customHandler(IHandler* handler); virtual void init_redirectsQueue_from_file(const std::string& path); private: - std::queue<std::string> metadataQueue; + std::queue<Article*> metadataQueue; std::queue<std::string> redirectsQueue; Queue<std::string>& filenameQueue; + std::vector<IHandler*> articleHandlers; + std::vector<IHandler*>::iterator currentLoopHandler; + bool loopOverHandlerStarted; }; #endif //OPENZIM_ZIMWRITERFS_ARTICLESOURCE_H diff --git a/zimwriterfs/mimetypecounter.cpp b/zimwriterfs/mimetypecounter.cpp new file mode 100644 index 0000000..94b6194 --- /dev/null +++ b/zimwriterfs/mimetypecounter.cpp @@ -0,0 +1,48 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org> + * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "mimetypecounter.h" +#include <sstream> + +MetadataCounterArticle::MetadataCounterArticle(MimetypeCounter* counter): + MetadataArticle("Counter"), + counter(counter) +{} + +zim::Blob MetadataCounterArticle::getData() const +{ + std::stringstream stream; + for (std::map<std::string, unsigned int>::iterator it = counter->counters.begin(); it != counter->counters.end(); ++it) { + stream << it->first << "=" << it->second << ";"; + } + data = stream.str(); + return zim::Blob(data.data(), data.size()); +} + +void MimetypeCounter::handleArticle(Article* article) { + if (!article->isRedirect()) { + std::string mimeType = article->getMimeType(); + if (counters.find(mimeType) == counters.end()) { + counters[mimeType] = 1; + } else { + counters[mimeType]++; + } + } +} diff --git a/zimwriterfs/mimetypecounter.h b/zimwriterfs/mimetypecounter.h new file mode 100644 index 0000000..2e1ab96 --- /dev/null +++ b/zimwriterfs/mimetypecounter.h @@ -0,0 +1,54 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org> + * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_ZIMWRITERFS_MIMETYPECOUNTER_H +#define OPENZIM_ZIMWRITERFS_MIMETYPECOUNTER_H + +#include "articlesource.h" +#include "article.h" + +class MimetypeCounter; + +class MetadataCounterArticle : public MetadataArticle +{ + private: + MimetypeCounter* counter; + mutable std::string data; + + public: + MetadataCounterArticle(MimetypeCounter* counter); + virtual zim::Blob getData() const; +}; + +class MimetypeCounter : public IHandler { + public: + void handleArticle(Article* article); + MetadataCounterArticle* getMetaArticle() { return new MetadataCounterArticle(this); } + + private: + std::map<std::string, unsigned int> counters; + + friend class MetadataCounterArticle; +}; + + + + +#endif //OPENZIM_ZIMWRITERFS_MIMETYPECOUNTER_H diff --git a/zimwriterfs/zimwriterfs.cpp b/zimwriterfs/zimwriterfs.cpp index 09a62af..1826c31 100644 --- a/zimwriterfs/zimwriterfs.cpp +++ b/zimwriterfs/zimwriterfs.cpp @@ -36,6 +36,8 @@ #include "article.h" #include "articlesource.h" #include "queue.h" +#include "mimetypecounter.h" + std::string language; std::string creator; @@ -330,6 +332,14 @@ exit(1); } + source.add_metadataArticle(new SimpleMetadataArticle("Language", language)); + source.add_metadataArticle(new SimpleMetadataArticle("Publisher", publisher)); + source.add_metadataArticle(new SimpleMetadataArticle("Creator", creator)); + source.add_metadataArticle(new SimpleMetadataArticle("Title", title)); + source.add_metadataArticle(new SimpleMetadataArticle("Description", description)); + source.add_metadataArticle(new MetadataDateArticle()); + source.add_metadataArticle(new MetadataFaviconArticle(favicon)); + /* Check redirects file and read it if necessary*/ if (!redirectsPath.empty() && !fileExists(redirectsPath)) { std::cerr << "zimwriterfs: unable to find redirects CSV file at '" << redirectsPath << "'. Verify --redirects path/value." << std::endl; @@ -352,6 +362,10 @@ pthread_create(&(directoryVisitor), NULL, visitDirectoryPath, (void*)NULL); pthread_detach(directoryVisitor); + + MimetypeCounter mimetypeCounter; + source.add_customHandler(&mimetypeCounter); + /* ZIM creation */ setenv("ZIM_LZMA_LEVEL", "9e", 1); try { -- To view, visit https://gerrit.wikimedia.org/r/296911 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I76cd6f3e7e4a390ed6a58cf9815dda2a2f1bfde5 Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr> Gerrit-Reviewer: Kelson <kel...@kiwix.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits