Petr Onderka has uploaded a new change for review. https://gerrit.wikimedia.org/r/83785
Change subject: Don't save namespace as part of title ...................................................................... Don't save namespace as part of title Change-Id: I794c19db9e36d11d6d05e3bbd7f6f3fe26fb8b0a --- M DumpWriters/ArticlesWriterWrapper.cpp M DumpWriters/ArticlesWriterWrapper.h M DumpWriters/CompositeWriter.cpp M DumpWriters/CompositeWriter.h M DumpWriters/DumpWriter.cpp M DumpWriters/DumpWriter.h M DumpWriters/IDumpWriter.h M DumpWriters/WriterWrapper.cpp M DumpWriters/WriterWrapper.h M XmlInput/XmlPageProcessor.cpp M XmlWriter.cpp 11 files changed, 43 insertions(+), 14 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/85/83785/1 diff --git a/DumpWriters/ArticlesWriterWrapper.cpp b/DumpWriters/ArticlesWriterWrapper.cpp index a17c385..4554784 100644 --- a/DumpWriters/ArticlesWriterWrapper.cpp +++ b/DumpWriters/ArticlesWriterWrapper.cpp @@ -3,12 +3,12 @@ const std::int16_t UserNamespace = 2; -void ArticlesWriterWrapper::StartPage(const std::shared_ptr<const Page> page) +void ArticlesWriterWrapper::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) { pageInlcuded = page->Namespace % 2 == 0 && page->Namespace != UserNamespace; if (pageInlcuded) - wrapped->StartPage(page); + wrapped->StartPage(page, titleWithNamespace); } void ArticlesWriterWrapper::AddRevision(const std::shared_ptr<const Revision> revision) diff --git a/DumpWriters/ArticlesWriterWrapper.h b/DumpWriters/ArticlesWriterWrapper.h index 366f11e..fd7c99b 100644 --- a/DumpWriters/ArticlesWriterWrapper.h +++ b/DumpWriters/ArticlesWriterWrapper.h @@ -11,7 +11,7 @@ : WriterWrapper(std::move(wrapped)), pageInlcuded(false) {} - virtual void StartPage(const std::shared_ptr<const Page> page) override; + virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override; virtual void AddRevision(const std::shared_ptr<const Revision> revision) override; virtual void EndPage() override; virtual void SetDumpKind(DumpKind dumpKind) override; diff --git a/DumpWriters/CompositeWriter.cpp b/DumpWriters/CompositeWriter.cpp index c93dde4..ad75049 100644 --- a/DumpWriters/CompositeWriter.cpp +++ b/DumpWriters/CompositeWriter.cpp @@ -1,10 +1,10 @@ #include "CompositeWriter.h" #include "../DumpException.h" -void CompositeWriter::StartPage(const std::shared_ptr<const Page> page) +void CompositeWriter::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) { for (auto &writer : writers) - writer->StartPage(page); + writer->StartPage(page, titleWithNamespace); } void CompositeWriter::AddRevision(const std::shared_ptr<const Revision> revision) diff --git a/DumpWriters/CompositeWriter.h b/DumpWriters/CompositeWriter.h index 5352e80..8966f52 100644 --- a/DumpWriters/CompositeWriter.h +++ b/DumpWriters/CompositeWriter.h @@ -15,7 +15,7 @@ : writers(std::move(writers)), getTextFunction(getTextFunction) {} - virtual void StartPage(const std::shared_ptr<const Page> page) override; + virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override; virtual void AddRevision(const std::shared_ptr<const Revision> revision) override; virtual void EndPage() override; virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override; diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp index d1423fe..6183c4c 100644 --- a/DumpWriters/DumpWriter.cpp +++ b/DumpWriters/DumpWriter.cpp @@ -5,6 +5,21 @@ #include "../Indexes/Index.h" #include "../format.h" +void DumpWriter::RemoveNamespace(Page& page) +{ + std::string namespapceName = dump->siteInfo->siteInfo.Namespaces.at(page.Namespace).second; + + if (namespapceName.empty()) + return; + + namespapceName.append(":"); + + if (page.Title.substr(0, namespapceName.length()) != namespapceName) + throw DumpException(); + + page.Title.erase(0, namespapceName.length()); +} + DumpWriter::DumpWriter(std::shared_ptr<WritableDump> dump, bool withText, std::unique_ptr<DiffWriter> diffWriter) : dump(dump), withText(withText), diffWriter(std::move(diffWriter)) { @@ -24,12 +39,17 @@ diffWriter->SetSiteInfo(*siteInfo, dump->fileHeader.Kind); } -void DumpWriter::StartPage(const std::shared_ptr<const Page> page) +void DumpWriter::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) { std::uint32_t pageId = page->PageId; + this->page = std::unique_ptr<DumpPage>(new DumpPage(dump, pageId)); oldPage = this->page->page; this->page->page = *page; + + if (titleWithNamespace) + RemoveNamespace(this->page->page); + unset(unvisitedPageIds, pageId); if (diffWriter != nullptr) diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h index 233f9ea..a1fee36 100644 --- a/DumpWriters/DumpWriter.h +++ b/DumpWriters/DumpWriter.h @@ -19,10 +19,11 @@ std::unordered_set<std::uint32_t> newRevisionIds; bool withText; + void RemoveNamespace(Page& page); public: DumpWriter(std::shared_ptr<WritableDump> dump, bool withText, std::unique_ptr<DiffWriter> diffWriter = nullptr); - virtual void StartPage(const std::shared_ptr<const Page> page) override; + virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override; virtual void AddRevision(const std::shared_ptr<const Revision> revision) override; virtual void EndPage() override; virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override; diff --git a/DumpWriters/IDumpWriter.h b/DumpWriters/IDumpWriter.h index 22c717f..42cde90 100644 --- a/DumpWriters/IDumpWriter.h +++ b/DumpWriters/IDumpWriter.h @@ -9,7 +9,7 @@ class IDumpWriter { public: - virtual void StartPage(const std::shared_ptr<const Page> page) = 0; + virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) = 0; virtual void AddRevision(const std::shared_ptr<const Revision> revision) = 0; virtual void EndPage() = 0; virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) = 0; diff --git a/DumpWriters/WriterWrapper.cpp b/DumpWriters/WriterWrapper.cpp index 1cc81b4..697f89f 100644 --- a/DumpWriters/WriterWrapper.cpp +++ b/DumpWriters/WriterWrapper.cpp @@ -1,8 +1,8 @@ #include "WriterWrapper.h" -void WriterWrapper::StartPage(const std::shared_ptr<const Page> page) +void WriterWrapper::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) { - wrapped->StartPage(page); + wrapped->StartPage(page, titleWithNamespace); } void WriterWrapper::AddRevision(const std::shared_ptr<const Revision> revision) diff --git a/DumpWriters/WriterWrapper.h b/DumpWriters/WriterWrapper.h index af648e2..7718133 100644 --- a/DumpWriters/WriterWrapper.h +++ b/DumpWriters/WriterWrapper.h @@ -11,7 +11,7 @@ : wrapped(std::move(wrapped)) {} - virtual void StartPage(const std::shared_ptr<const Page> page) override; + virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override; virtual void AddRevision(const std::shared_ptr<const Revision> revision) override; virtual void EndPage() override; virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override; diff --git a/XmlInput/XmlPageProcessor.cpp b/XmlInput/XmlPageProcessor.cpp index 75c1907..36c6855 100644 --- a/XmlInput/XmlPageProcessor.cpp +++ b/XmlInput/XmlPageProcessor.cpp @@ -9,7 +9,7 @@ { if (!pageWritten) { - dumpWriter->StartPage(page); + dumpWriter->StartPage(page, true); pageWritten = true; } } diff --git a/XmlWriter.cpp b/XmlWriter.cpp index a7e5cf1..f89027a 100644 --- a/XmlWriter.cpp +++ b/XmlWriter.cpp @@ -69,7 +69,12 @@ output.BeginElement("page"); - output.WriteElement("title", escapeElementText(page.Title)); + std::string title = page.Title; + std::string ns = dump->siteInfo->siteInfo.Namespaces.at(page.Namespace).second; + if (!ns.empty()) + title = ns + ':' + title; + + output.WriteElement("title", escapeElementText(title)); output.WriteElement("ns", page.Namespace); output.WriteElement("id", page.PageId); @@ -84,6 +89,9 @@ { auto revision = DumpRevision(dump, revisionId, true).revision; + if (revision.RevisionId != revisionId) + throw DumpException(); + output.BeginElement("revision"); output.WriteElement("id", revision.RevisionId); -- To view, visit https://gerrit.wikimedia.org/r/83785 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I794c19db9e36d11d6d05e3bbd7f6f3fe26fb8b0a Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps/incremental Gerrit-Branch: gsoc Gerrit-Owner: Petr Onderka <gsv...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits