Kelson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/295517 )
Change subject: Move articleSource's related stuffs in articlesource.(h|cpp). ...................................................................... Move articleSource's related stuffs in articlesource.(h|cpp). Change-Id: Iee91484679bf401a693af1ca7e1c7e34f2c741d0 --- M zimwriterfs/Makefile.am A zimwriterfs/articlesource.cpp A zimwriterfs/articlesource.h M zimwriterfs/zimwriterfs.cpp 4 files changed, 305 insertions(+), 229 deletions(-) Approvals: Kelson: Verified; Looks good to me, approved diff --git a/zimwriterfs/Makefile.am b/zimwriterfs/Makefile.am index 3383e35..6e46553 100644 --- a/zimwriterfs/Makefile.am +++ b/zimwriterfs/Makefile.am @@ -4,4 +4,5 @@ zimwriterfs_SOURCES= \ zimwriterfs.cpp \ tools.cpp \ - article.cpp + article.cpp \ + articlesource.cpp diff --git a/zimwriterfs/articlesource.cpp b/zimwriterfs/articlesource.cpp new file mode 100644 index 0000000..8b0b34c --- /dev/null +++ b/zimwriterfs/articlesource.cpp @@ -0,0 +1,256 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org> + * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "articlesource.h" +#include "article.h" +#include "tools.h" + +#include <zim/blob.h> + +#include <iomanip> +#include <sstream> +#include <map> + +bool popFromFilenameQueue(std::string &filename); +bool isVerbose(); + +extern std::string welcome; +extern std::string language; +extern std::string creator; +extern std::string publisher; +extern std::string title; +extern std::string description; +extern std::string directoryPath; + +std::map<std::string, unsigned int> counters; +char *data = NULL; +unsigned int dataSize = 0; + + + +ArticleSource::ArticleSource() { + /* Prepare metadata */ + metadataQueue.push("Language"); + metadataQueue.push("Publisher"); + metadataQueue.push("Creator"); + metadataQueue.push("Title"); + metadataQueue.push("Description"); + metadataQueue.push("Date"); + metadataQueue.push("Favicon"); + metadataQueue.push("Counter"); +} + +void ArticleSource::init_redirectsQueue_from_file(const std::string& path){ + std::ifstream in_stream; + std::string line; + + in_stream.open(path.c_str()); + while (std::getline(in_stream, line)) { + redirectsQueue.push(line); + } + in_stream.close(); +} + +std::string ArticleSource::getMainPage() { + return welcome; +} + +Article *article = NULL; +const zim::writer::Article* ArticleSource::getNextArticle() { + std::string path; + + if (article != NULL) { + delete(article); + } + + if (!metadataQueue.empty()) { + path = metadataQueue.front(); + metadataQueue.pop(); + article = new MetadataArticle(path); + } else if (!redirectsQueue.empty()) { + std::string line = redirectsQueue.front(); + redirectsQueue.pop(); + article = new RedirectArticle(line); + } else if (popFromFilenameQueue(path)) { + do { + article = new Article(path); + } while (article && article->isInvalid() && popFromFilenameQueue(path)); + } else { + article = NULL; + } + + /* Count mimetypes */ + if (article != NULL && !article->isRedirect()) { + + if (isVerbose()) + std::cout << "Creating entry for " << article->getAid() << std::endl; + + std::string mimeType = article->getMimeType(); + if (counters.find(mimeType) == counters.end()) { + counters[mimeType] = 1; + } else { + counters[mimeType]++; + } + } + + return article; +} + +zim::Blob ArticleSource::getData(const std::string& aid) { + + if (isVerbose()) + std::cout << "Packing data for " << aid << std::endl; + + if (data != NULL) { + delete(data); + data = NULL; + } + + if (aid.substr(0, 3) == "/M/") { + std::string value; + + if ( aid == "/M/Language") { + value = language; + } else if (aid == "/M/Creator") { + value = creator; + } else if (aid == "/M/Publisher") { + value = publisher; + } else if (aid == "/M/Title") { + value = title; + } else if (aid == "/M/Description") { + value = description; + } else if ( aid == "/M/Date") { + time_t t = time(0); + struct tm * now = localtime( & t ); + std::stringstream stream; + stream << (now->tm_year + 1900) << '-' + << std::setw(2) << std::setfill('0') << (now->tm_mon + 1) << '-' + << std::setw(2) << std::setfill('0') << now->tm_mday; + value = stream.str(); + } else if ( aid == "/M/Counter") { + std::stringstream stream; + for (std::map<std::string, unsigned int>::iterator it = counters.begin(); it != counters.end(); ++it) { + stream << it->first << "=" << it->second << ";"; + } + value = stream.str(); + } + + dataSize = value.length(); + data = new char[dataSize]; + memcpy(data, value.c_str(), dataSize); + } else { + std::string aidPath = directoryPath + "/" + aid; + + if (getMimeTypeForFile(aid).find("text/html") == 0) { + std::string html = getFileContent(aidPath); + + /* Rewrite links (src|href|...) attributes */ + GumboOutput* output = gumbo_parse(html.c_str()); + GumboNode* root = output->root; + + std::map<std::string, bool> links; + getLinks(root, links); + std::map<std::string, bool>::iterator it; + std::string aidDirectory = removeLastPathElement(aid, false, false); + + /* If a link appearch to be duplicated in the HTML, it will + occurs only one time in the links variable */ + for(it = links.begin(); it != links.end(); it++) { + if (!it->first.empty() && it->first[0] != '#' && it->first[0] != '?' && it->first.substr(0, 5) != "data:") { + replaceStringInPlace(html, "\"" + it->first + "\"", "\"" + computeNewUrl(aid, it->first) + "\""); + } + } + gumbo_destroy_output(&kGumboDefaultOptions, output); + + dataSize = html.length(); + data = new char[dataSize]; + memcpy(data, html.c_str(), dataSize); + } else if (getMimeTypeForFile(aid).find("text/css") == 0) { + std::string css = getFileContent(aidPath); + + /* Rewrite url() values in the CSS */ + size_t startPos = 0; + size_t endPos = 0; + std::string url; + + while ((startPos = css.find("url(", endPos)) && startPos != std::string::npos) { + + /* URL delimiters */ + endPos = css.find(")", startPos); + startPos = startPos + (css[startPos+4] == '\'' || css[startPos+4] == '"' ? 5 : 4); + endPos = endPos - (css[endPos-1] == '\'' || css[endPos-1] == '"' ? 1 : 0); + url = css.substr(startPos, endPos - startPos); + std::string startDelimiter = css.substr(startPos-1, 1); + std::string endDelimiter = css.substr(endPos, 1); + + if (url.substr(0, 5) != "data:") { + /* Deal with URL with arguments (using '? ') */ + std::string path = url; + size_t markPos = url.find("?"); + if (markPos != std::string::npos) { + path = url.substr(0, markPos); + } + + /* Embeded fonts need to be inline because Kiwix is + otherwise not able to load same because of the + same-origin security */ + std::string mimeType = getMimeTypeForFile(path); + if (mimeType == "application/font-ttf" || + mimeType == "application/font-woff" || + mimeType == "application/vnd.ms-opentype" || + mimeType == "application/vnd.ms-fontobject") { + + try { + std::string fontContent = getFileContent(directoryPath + "/" + computeAbsolutePath(aid, path)); + replaceStringInPlaceOnce(css, + startDelimiter + url + endDelimiter, + startDelimiter + "data:" + mimeType + ";base64," + + base64_encode(reinterpret_cast<const unsigned char*>(fontContent.c_str()), fontContent.length()) + + endDelimiter + ); + } catch (...) { + } + } else { + + /* Deal with URL with arguments (using '? ') */ + if (markPos != std::string::npos) { + endDelimiter = url.substr(markPos, 1); + } + + replaceStringInPlaceOnce(css, + startDelimiter + url + endDelimiter, + startDelimiter + computeNewUrl(aid, path) + endDelimiter); + } + } + } + + dataSize = css.length(); + data = new char[dataSize]; + memcpy(data, css.c_str(), dataSize); + } else { + dataSize = getFileSize(aidPath); + data = new char[dataSize]; + memcpy(data, getFileContent(aidPath).c_str(), dataSize); + } + } + + return zim::Blob(data, dataSize); +} + diff --git a/zimwriterfs/articlesource.h b/zimwriterfs/articlesource.h new file mode 100644 index 0000000..adbdbda --- /dev/null +++ b/zimwriterfs/articlesource.h @@ -0,0 +1,44 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart <kel...@kiwix.org> + * Copyright 2016 Matthieu Gautier <mgaut...@kymeria.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_ZIMWRITERFS_ARTICLESOURCE_H +#define OPENZIM_ZIMWRITERFS_ARTICLESOURCE_H + +#include <string> +#include <queue> +#include <fstream> + +#include <zim/writer/zimcreator.h> + +class ArticleSource : public zim::writer::ArticleSource { + public: + explicit ArticleSource(); + virtual const zim::writer::Article* getNextArticle(); + virtual zim::Blob getData(const std::string& aid); + virtual std::string getMainPage(); + + virtual void init_redirectsQueue_from_file(const std::string& path); + + private: + std::queue<std::string> metadataQueue; + std::queue<std::string> redirectsQueue; +}; + +#endif //OPENZIM_ZIMWRITERFS_ARTICLESOURCE_H diff --git a/zimwriterfs/zimwriterfs.cpp b/zimwriterfs/zimwriterfs.cpp index 93987b4..de44cb8 100644 --- a/zimwriterfs/zimwriterfs.cpp +++ b/zimwriterfs/zimwriterfs.cpp @@ -26,20 +26,15 @@ #include <unistd.h> #include <pthread.h> -#include <iomanip> -#include <fstream> -#include <sstream> #include <queue> -#include <map> #include <cstdio> #include <magic.h> #include <zim/writer/zimcreator.h> -#include <zim/blob.h> - #include "tools.h" #include "article.h" +#include "articlesource.h" #define MAX_QUEUE_SIZE 100 @@ -57,8 +52,6 @@ pthread_t directoryVisitor; pthread_mutex_t filenameQueueMutex; std::queue<std::string> filenameQueue; -std::queue<std::string> metadataQueue; -std::queue<std::string> redirectsQueue; bool isDirectoryVisitorRunningFlag = false; pthread_mutex_t directoryVisitorRunningMutex; @@ -68,9 +61,6 @@ bool uniqueNamespace = false; magic_t magic; -std::map<std::string, unsigned int> counters; -char *data = NULL; -unsigned int dataSize = 0; void directoryVisitorRunning(bool value) { @@ -136,204 +126,6 @@ } while (isDirectoryVisitorRunning() || !isFilenameQueueEmpty()); return retVal; -} - -/* ArticleSource class */ -class ArticleSource : public zim::writer::ArticleSource { - public: - explicit ArticleSource(); - virtual const zim::writer::Article* getNextArticle(); - virtual zim::Blob getData(const std::string& aid); - virtual std::string getMainPage(); -}; - -ArticleSource::ArticleSource() { -} - -std::string ArticleSource::getMainPage() { - return welcome; -} - -Article *article = NULL; -const zim::writer::Article* ArticleSource::getNextArticle() { - std::string path; - - if (article != NULL) { - delete(article); - } - - if (!metadataQueue.empty()) { - path = metadataQueue.front(); - metadataQueue.pop(); - article = new MetadataArticle(path); - } else if (!redirectsQueue.empty()) { - std::string line = redirectsQueue.front(); - redirectsQueue.pop(); - article = new RedirectArticle(line); - } else if (popFromFilenameQueue(path)) { - do { - article = new Article(path); - } while (article && article->isInvalid() && popFromFilenameQueue(path)); - } else { - article = NULL; - } - - /* Count mimetypes */ - if (article != NULL && !article->isRedirect()) { - - if (isVerbose()) - std::cout << "Creating entry for " << article->getAid() << std::endl; - - std::string mimeType = article->getMimeType(); - if (counters.find(mimeType) == counters.end()) { - counters[mimeType] = 1; - } else { - counters[mimeType]++; - } - } - - return article; -} - -zim::Blob ArticleSource::getData(const std::string& aid) { - - if (isVerbose()) - std::cout << "Packing data for " << aid << std::endl; - - if (data != NULL) { - delete(data); - data = NULL; - } - - if (aid.substr(0, 3) == "/M/") { - std::string value; - - if ( aid == "/M/Language") { - value = language; - } else if (aid == "/M/Creator") { - value = creator; - } else if (aid == "/M/Publisher") { - value = publisher; - } else if (aid == "/M/Title") { - value = title; - } else if (aid == "/M/Description") { - value = description; - } else if ( aid == "/M/Date") { - time_t t = time(0); - struct tm * now = localtime( & t ); - std::stringstream stream; - stream << (now->tm_year + 1900) << '-' - << std::setw(2) << std::setfill('0') << (now->tm_mon + 1) << '-' - << std::setw(2) << std::setfill('0') << now->tm_mday; - value = stream.str(); - } else if ( aid == "/M/Counter") { - std::stringstream stream; - for (std::map<std::string, unsigned int>::iterator it = counters.begin(); it != counters.end(); ++it) { - stream << it->first << "=" << it->second << ";"; - } - value = stream.str(); - } - - dataSize = value.length(); - data = new char[dataSize]; - memcpy(data, value.c_str(), dataSize); - } else { - std::string aidPath = directoryPath + "/" + aid; - - if (getMimeTypeForFile(aid).find("text/html") == 0) { - std::string html = getFileContent(aidPath); - - /* Rewrite links (src|href|...) attributes */ - GumboOutput* output = gumbo_parse(html.c_str()); - GumboNode* root = output->root; - - std::map<std::string, bool> links; - getLinks(root, links); - std::map<std::string, bool>::iterator it; - std::string aidDirectory = removeLastPathElement(aid, false, false); - - /* If a link appearch to be duplicated in the HTML, it will - occurs only one time in the links variable */ - for(it = links.begin(); it != links.end(); it++) { - if (!it->first.empty() && it->first[0] != '#' && it->first[0] != '?' && it->first.substr(0, 5) != "data:") { - replaceStringInPlace(html, "\"" + it->first + "\"", "\"" + computeNewUrl(aid, it->first) + "\""); - } - } - gumbo_destroy_output(&kGumboDefaultOptions, output); - - dataSize = html.length(); - data = new char[dataSize]; - memcpy(data, html.c_str(), dataSize); - } else if (getMimeTypeForFile(aid).find("text/css") == 0) { - std::string css = getFileContent(aidPath); - - /* Rewrite url() values in the CSS */ - size_t startPos = 0; - size_t endPos = 0; - std::string url; - - while ((startPos = css.find("url(", endPos)) && startPos != std::string::npos) { - - /* URL delimiters */ - endPos = css.find(")", startPos); - startPos = startPos + (css[startPos+4] == '\'' || css[startPos+4] == '"' ? 5 : 4); - endPos = endPos - (css[endPos-1] == '\'' || css[endPos-1] == '"' ? 1 : 0); - url = css.substr(startPos, endPos - startPos); - std::string startDelimiter = css.substr(startPos-1, 1); - std::string endDelimiter = css.substr(endPos, 1); - - if (url.substr(0, 5) != "data:") { - /* Deal with URL with arguments (using '? ') */ - std::string path = url; - size_t markPos = url.find("?"); - if (markPos != std::string::npos) { - path = url.substr(0, markPos); - } - - /* Embeded fonts need to be inline because Kiwix is - otherwise not able to load same because of the - same-origin security */ - std::string mimeType = getMimeTypeForFile(path); - if (mimeType == "application/font-ttf" || - mimeType == "application/font-woff" || - mimeType == "application/vnd.ms-opentype" || - mimeType == "application/vnd.ms-fontobject") { - - try { - std::string fontContent = getFileContent(directoryPath + "/" + computeAbsolutePath(aid, path)); - replaceStringInPlaceOnce(css, - startDelimiter + url + endDelimiter, - startDelimiter + "data:" + mimeType + ";base64," + - base64_encode(reinterpret_cast<const unsigned char*>(fontContent.c_str()), fontContent.length()) + - endDelimiter - ); - } catch (...) { - } - } else { - - /* Deal with URL with arguments (using '? ') */ - if (markPos != std::string::npos) { - endDelimiter = url.substr(markPos, 1); - } - - replaceStringInPlaceOnce(css, - startDelimiter + url + endDelimiter, - startDelimiter + computeNewUrl(aid, path) + endDelimiter); - } - } - } - - dataSize = css.length(); - data = new char[dataSize]; - memcpy(data, css.c_str(), dataSize); - } else { - dataSize = getFileSize(aidPath); - data = new char[dataSize]; - memcpy(data, getFileContent(aidPath).c_str(), dataSize); - } - } - - return zim::Blob(data, dataSize); } /* Non ZIM related code */ @@ -554,16 +346,6 @@ directoryPath = directoryPath.substr(0, directoryPath.length()-1); } - /* Prepare metadata */ - metadataQueue.push("Language"); - metadataQueue.push("Publisher"); - metadataQueue.push("Creator"); - metadataQueue.push("Title"); - metadataQueue.push("Description"); - metadataQueue.push("Date"); - metadataQueue.push("Favicon"); - metadataQueue.push("Counter"); - /* Check metadata */ if (!fileExists(directoryPath + "/" + welcome)) { std::cerr << "zimwriterfs: unable to find welcome page at '" << directoryPath << "/" << welcome << "'. --welcome path/value must be relative to HTML_DIRECTORY." << std::endl; @@ -582,15 +364,8 @@ } else { if (isVerbose()) std::cout << "Reading redirects CSV file " << redirectsPath << "..." << std::endl; - - std::ifstream in_stream; - std::string line; - - in_stream.open(redirectsPath.c_str()); - while (std::getline(in_stream, line)) { - redirectsQueue.push(line); - } - in_stream.close(); + + source.init_redirectsQueue_from_file(redirectsPath); } /* Init */ -- To view, visit https://gerrit.wikimedia.org/r/295517 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Iee91484679bf401a693af1ca7e1c7e34f2c741d0 Gerrit-PatchSet: 1 Gerrit-Project: openzim Gerrit-Branch: master Gerrit-Owner: Mgautierfr <mgaut...@kymeria.fr> Gerrit-Reviewer: Kelson <kel...@kiwix.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits