[MediaWiki-commits] [Gerrit] Write page after its revisions - change (operations...incremental)
Petr Onderka has submitted this change and it was merged. Change subject: Write page after its revisions .. Write page after its revisions This is necessary when creating big dumps from XML. Without this, all revisions of a page have to be kept in memory, including their texts. A downside is that this will increase seeking when reading. Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01 --- M DumpObjects/DumpPage.cpp M DumpObjects/DumpPage.h M DumpWriters/DumpWriter.cpp M DumpWriters/DumpWriter.h 4 files changed, 36 insertions(+), 50 deletions(-) Approvals: Petr Onderka: Verified; Looks good to me, approved diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp index 1d074be..b5e695b 100644 --- a/DumpObjects/DumpPage.cpp +++ b/DumpObjects/DumpPage.cpp @@ -40,35 +40,28 @@ void DumpPage::Write() { if (wasLoaded && originalPage == page) -{ -if (diffWriter != nullptr) -diffWriter->StartExistingPage(page.PageId); - return; -} DumpObject::Write(); } -void DumpPage::Write(DiffWriter *diffWriter) +void DumpPage::WriteDiff(DiffWriter &diffWriter) { -this->diffWriter = diffWriter; -Write(); -this->diffWriter = nullptr; +if (wasLoaded) +{ +if (originalPage == page) +diffWriter.StartExistingPage(page.PageId); +else +diffWriter.StartExistingPage(originalPage, page); +} +else +diffWriter.StartNewPage(page); } void DumpPage::WriteInternal() { WriteValue((uint8_t)DumpObjectKind::Page); WriteCore(*stream, page, true); - -if (diffWriter != nullptr) -{ -if (wasLoaded) -diffWriter->StartExistingPage(originalPage, page); -else -diffWriter->StartNewPage(page); -} } void DumpPage::UpdateIndex(Offset offset, bool overwrite) @@ -87,13 +80,13 @@ } DumpPage::DumpPage(weak_ptr dump, uint32_t pageId) -: DumpObject(dump), page(), diffWriter() +: DumpObject(dump), page() { Load(pageId); } DumpPage::DumpPage(weak_ptr dump, Offset offset) -: DumpObject(dump), page(), diffWriter() +: DumpObject(dump), page() { auto dumpRef = dump.lock(); diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h index 4c230bf..3d1e16d 100644 --- a/DumpObjects/DumpPage.h +++ b/DumpObjects/DumpPage.h @@ -11,8 +11,6 @@ Page originalPage; bool wasLoaded; -DiffWriter *diffWriter; - void Load(std::uint32_t pageId); static Page Read(std::shared_ptr dump, Offset offset); protected: @@ -25,7 +23,7 @@ DumpPage(std::weak_ptr dump, Offset offset); virtual void Write() override; -void Write(DiffWriter *diffWriter); +void WriteDiff(DiffWriter& diffWriter); virtual std::uint32_t NewLength() override; static Page ReadCore(std::istream &stream, bool includeRevisionIds); diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp index 96cfa67..d1423fe 100644 --- a/DumpWriters/DumpWriter.cpp +++ b/DumpWriters/DumpWriter.cpp @@ -31,39 +31,38 @@ oldPage = this->page->page; this->page->page = *page; unset(unvisitedPageIds, pageId); + +if (diffWriter != nullptr) +this->page->WriteDiff(*diffWriter); } void DumpWriter::AddRevision(const std::shared_ptr revision) { page->page.RevisionIds.insert(revision->RevisionId); -revisions.push_back(revision); + +DumpRevision dumpRevision(dump, revision->RevisionId, false); +dumpRevision.revision = *revision; + +if (diffWriter != nullptr) +{ +bool isNew; +std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew); + +if (isNew) +diffWriter->NewModelFormat(modelFormatId, dumpRevision.revision.Model, dumpRevision.revision.Format); +} + +bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId); + +if (newRevision) +newRevisionIds.insert(revision->RevisionId); + +dumpRevision.Write(diffWriter.get(), newRevision); } void DumpWriter::EndPage() { -page->Write(diffWriter.get()); - -for (auto revision : revisions) -{ -DumpRevision dumpRevision(dump, revision->RevisionId, false); -dumpRevision.revision = *revision; - -if (diffWriter != nullptr) -{ -bool isNew; -std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew); - -if (isNew) -diffWriter->NewModelFormat(modelFormatId, dumpRevision.revision.Model, dumpRevision.revision.Format); -} - -bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId); - -if (newRevision) -newRevisionIds.insert(revision->RevisionId); - -dumpRevision.Write(diffWriter.get(), newRevision); -} +page->Write(); auto deletedRevisionIds = except(oldPage.RevisionIds, page->page.RevisionIds); @@ -78,7 +77,6 @@ if (
[MediaWiki-commits] [Gerrit] Write page after its revisions - change (operations...incremental)
Petr Onderka has uploaded a new change for review. https://gerrit.wikimedia.org/r/83784 Change subject: Write page after its revisions .. Write page after its revisions This is necessary when creating big dumps from XML. Without this, all revisions of a page have to be kept in memory, including their texts. A downside is that this will increase seeking when reading. Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01 --- M DumpObjects/DumpPage.cpp M DumpObjects/DumpPage.h M DumpWriters/DumpWriter.cpp M DumpWriters/DumpWriter.h 4 files changed, 36 insertions(+), 50 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/84/83784/1 diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp index 1d074be..b5e695b 100644 --- a/DumpObjects/DumpPage.cpp +++ b/DumpObjects/DumpPage.cpp @@ -40,35 +40,28 @@ void DumpPage::Write() { if (wasLoaded && originalPage == page) -{ -if (diffWriter != nullptr) -diffWriter->StartExistingPage(page.PageId); - return; -} DumpObject::Write(); } -void DumpPage::Write(DiffWriter *diffWriter) +void DumpPage::WriteDiff(DiffWriter &diffWriter) { -this->diffWriter = diffWriter; -Write(); -this->diffWriter = nullptr; +if (wasLoaded) +{ +if (originalPage == page) +diffWriter.StartExistingPage(page.PageId); +else +diffWriter.StartExistingPage(originalPage, page); +} +else +diffWriter.StartNewPage(page); } void DumpPage::WriteInternal() { WriteValue((uint8_t)DumpObjectKind::Page); WriteCore(*stream, page, true); - -if (diffWriter != nullptr) -{ -if (wasLoaded) -diffWriter->StartExistingPage(originalPage, page); -else -diffWriter->StartNewPage(page); -} } void DumpPage::UpdateIndex(Offset offset, bool overwrite) @@ -87,13 +80,13 @@ } DumpPage::DumpPage(weak_ptr dump, uint32_t pageId) -: DumpObject(dump), page(), diffWriter() +: DumpObject(dump), page() { Load(pageId); } DumpPage::DumpPage(weak_ptr dump, Offset offset) -: DumpObject(dump), page(), diffWriter() +: DumpObject(dump), page() { auto dumpRef = dump.lock(); diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h index 4c230bf..3d1e16d 100644 --- a/DumpObjects/DumpPage.h +++ b/DumpObjects/DumpPage.h @@ -11,8 +11,6 @@ Page originalPage; bool wasLoaded; -DiffWriter *diffWriter; - void Load(std::uint32_t pageId); static Page Read(std::shared_ptr dump, Offset offset); protected: @@ -25,7 +23,7 @@ DumpPage(std::weak_ptr dump, Offset offset); virtual void Write() override; -void Write(DiffWriter *diffWriter); +void WriteDiff(DiffWriter& diffWriter); virtual std::uint32_t NewLength() override; static Page ReadCore(std::istream &stream, bool includeRevisionIds); diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp index 96cfa67..d1423fe 100644 --- a/DumpWriters/DumpWriter.cpp +++ b/DumpWriters/DumpWriter.cpp @@ -31,39 +31,38 @@ oldPage = this->page->page; this->page->page = *page; unset(unvisitedPageIds, pageId); + +if (diffWriter != nullptr) +this->page->WriteDiff(*diffWriter); } void DumpWriter::AddRevision(const std::shared_ptr revision) { page->page.RevisionIds.insert(revision->RevisionId); -revisions.push_back(revision); + +DumpRevision dumpRevision(dump, revision->RevisionId, false); +dumpRevision.revision = *revision; + +if (diffWriter != nullptr) +{ +bool isNew; +std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew); + +if (isNew) +diffWriter->NewModelFormat(modelFormatId, dumpRevision.revision.Model, dumpRevision.revision.Format); +} + +bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId); + +if (newRevision) +newRevisionIds.insert(revision->RevisionId); + +dumpRevision.Write(diffWriter.get(), newRevision); } void DumpWriter::EndPage() { -page->Write(diffWriter.get()); - -for (auto revision : revisions) -{ -DumpRevision dumpRevision(dump, revision->RevisionId, false); -dumpRevision.revision = *revision; - -if (diffWriter != nullptr) -{ -bool isNew; -std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew); - -if (isNew) -diffWriter->NewModelFormat(modelFormatId, dumpRevision.revision.Model, dumpRevision.revision.Format); -} - -bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId); - -if (newRevision) -newRevisionIds.insert(revision->RevisionId); - -dumpRevision.Write(diffWriter.get(), newRevision); -} +page->Write(); auto deletedRevisionIds = except(oldPage.Re