[MediaWiki-commits] [Gerrit] Write page after its revisions - change (operations...incremental)

2013-09-11 Thread Petr Onderka (Code Review)
Petr Onderka has submitted this change and it was merged.

Change subject: Write page after its revisions
..


Write page after its revisions

This is necessary when creating big dumps from XML.
Without this, all revisions of a page have to be kept in memory,
including their texts.
A downside is that this will increase seeking when reading.

Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01
---
M DumpObjects/DumpPage.cpp
M DumpObjects/DumpPage.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
4 files changed, 36 insertions(+), 50 deletions(-)

Approvals:
  Petr Onderka: Verified; Looks good to me, approved



diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
index 1d074be..b5e695b 100644
--- a/DumpObjects/DumpPage.cpp
+++ b/DumpObjects/DumpPage.cpp
@@ -40,35 +40,28 @@
 void DumpPage::Write()
 {
 if (wasLoaded && originalPage == page)
-{
-if (diffWriter != nullptr)
-diffWriter->StartExistingPage(page.PageId);
-
 return;
-}
 
 DumpObject::Write();
 }
 
-void DumpPage::Write(DiffWriter *diffWriter)
+void DumpPage::WriteDiff(DiffWriter &diffWriter)
 {
-this->diffWriter = diffWriter;
-Write();
-this->diffWriter = nullptr;
+if (wasLoaded)
+{
+if (originalPage == page)
+diffWriter.StartExistingPage(page.PageId);
+else
+diffWriter.StartExistingPage(originalPage, page);
+}
+else
+diffWriter.StartNewPage(page);
 }
 
 void DumpPage::WriteInternal()
 {
 WriteValue((uint8_t)DumpObjectKind::Page);
 WriteCore(*stream, page, true);
-
-if (diffWriter != nullptr)
-{
-if (wasLoaded)
-diffWriter->StartExistingPage(originalPage, page);
-else
-diffWriter->StartNewPage(page);
-}
 }
 
 void DumpPage::UpdateIndex(Offset offset, bool overwrite)
@@ -87,13 +80,13 @@
 }
 
 DumpPage::DumpPage(weak_ptr dump, uint32_t pageId)
-: DumpObject(dump), page(), diffWriter()
+: DumpObject(dump), page()
 {
 Load(pageId);
 }
 
 DumpPage::DumpPage(weak_ptr dump, Offset offset)
-: DumpObject(dump), page(), diffWriter()
+: DumpObject(dump), page()
 {
 auto dumpRef = dump.lock();
 
diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h
index 4c230bf..3d1e16d 100644
--- a/DumpObjects/DumpPage.h
+++ b/DumpObjects/DumpPage.h
@@ -11,8 +11,6 @@
 Page originalPage;
 bool wasLoaded;
 
-DiffWriter *diffWriter;
-
 void Load(std::uint32_t pageId);
 static Page Read(std::shared_ptr dump, Offset offset);
 protected:
@@ -25,7 +23,7 @@
 DumpPage(std::weak_ptr dump, Offset offset);
 
 virtual void Write() override;
-void Write(DiffWriter *diffWriter);
+void WriteDiff(DiffWriter& diffWriter);
 virtual std::uint32_t NewLength() override;
 
 static Page ReadCore(std::istream &stream, bool includeRevisionIds);
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 96cfa67..d1423fe 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -31,39 +31,38 @@
 oldPage = this->page->page;
 this->page->page = *page;
 unset(unvisitedPageIds, pageId);
+
+if (diffWriter != nullptr)
+this->page->WriteDiff(*diffWriter);
 }
 
 void DumpWriter::AddRevision(const std::shared_ptr revision)
 {
 page->page.RevisionIds.insert(revision->RevisionId);
-revisions.push_back(revision);
+
+DumpRevision dumpRevision(dump, revision->RevisionId, false);
+dumpRevision.revision = *revision;
+
+if (diffWriter != nullptr)
+{
+bool isNew;
+std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
+
+if (isNew)
+diffWriter->NewModelFormat(modelFormatId, 
dumpRevision.revision.Model, dumpRevision.revision.Format);
+}
+
+bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId);
+
+if (newRevision)
+newRevisionIds.insert(revision->RevisionId);
+
+dumpRevision.Write(diffWriter.get(), newRevision);
 }
 
 void DumpWriter::EndPage()
 {
-page->Write(diffWriter.get());
-
-for (auto revision : revisions)
-{
-DumpRevision dumpRevision(dump, revision->RevisionId, false);
-dumpRevision.revision = *revision;
-
-if (diffWriter != nullptr)
-{
-bool isNew;
-std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
-
-if (isNew)
-diffWriter->NewModelFormat(modelFormatId, 
dumpRevision.revision.Model, dumpRevision.revision.Format);
-}
-
-bool newRevision = !contains(oldPage.RevisionIds, 
revision->RevisionId);
-
-if (newRevision)
-newRevisionIds.insert(revision->RevisionId);
-
-dumpRevision.Write(diffWriter.get(), newRevision);
-}
+page->Write();
 
 auto deletedRevisionIds = except(oldPage.RevisionIds, 
page->page.RevisionIds);
 
@@ -78,7 +77,6 @@
 if (

[MediaWiki-commits] [Gerrit] Write page after its revisions - change (operations...incremental)

2013-09-11 Thread Petr Onderka (Code Review)
Petr Onderka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/83784


Change subject: Write page after its revisions
..

Write page after its revisions

This is necessary when creating big dumps from XML.
Without this, all revisions of a page have to be kept in memory,
including their texts.
A downside is that this will increase seeking when reading.

Change-Id: Ib6ea9fe67671df7a5686f6a4e441ab95ff6dba01
---
M DumpObjects/DumpPage.cpp
M DumpObjects/DumpPage.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
4 files changed, 36 insertions(+), 50 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental 
refs/changes/84/83784/1

diff --git a/DumpObjects/DumpPage.cpp b/DumpObjects/DumpPage.cpp
index 1d074be..b5e695b 100644
--- a/DumpObjects/DumpPage.cpp
+++ b/DumpObjects/DumpPage.cpp
@@ -40,35 +40,28 @@
 void DumpPage::Write()
 {
 if (wasLoaded && originalPage == page)
-{
-if (diffWriter != nullptr)
-diffWriter->StartExistingPage(page.PageId);
-
 return;
-}
 
 DumpObject::Write();
 }
 
-void DumpPage::Write(DiffWriter *diffWriter)
+void DumpPage::WriteDiff(DiffWriter &diffWriter)
 {
-this->diffWriter = diffWriter;
-Write();
-this->diffWriter = nullptr;
+if (wasLoaded)
+{
+if (originalPage == page)
+diffWriter.StartExistingPage(page.PageId);
+else
+diffWriter.StartExistingPage(originalPage, page);
+}
+else
+diffWriter.StartNewPage(page);
 }
 
 void DumpPage::WriteInternal()
 {
 WriteValue((uint8_t)DumpObjectKind::Page);
 WriteCore(*stream, page, true);
-
-if (diffWriter != nullptr)
-{
-if (wasLoaded)
-diffWriter->StartExistingPage(originalPage, page);
-else
-diffWriter->StartNewPage(page);
-}
 }
 
 void DumpPage::UpdateIndex(Offset offset, bool overwrite)
@@ -87,13 +80,13 @@
 }
 
 DumpPage::DumpPage(weak_ptr dump, uint32_t pageId)
-: DumpObject(dump), page(), diffWriter()
+: DumpObject(dump), page()
 {
 Load(pageId);
 }
 
 DumpPage::DumpPage(weak_ptr dump, Offset offset)
-: DumpObject(dump), page(), diffWriter()
+: DumpObject(dump), page()
 {
 auto dumpRef = dump.lock();
 
diff --git a/DumpObjects/DumpPage.h b/DumpObjects/DumpPage.h
index 4c230bf..3d1e16d 100644
--- a/DumpObjects/DumpPage.h
+++ b/DumpObjects/DumpPage.h
@@ -11,8 +11,6 @@
 Page originalPage;
 bool wasLoaded;
 
-DiffWriter *diffWriter;
-
 void Load(std::uint32_t pageId);
 static Page Read(std::shared_ptr dump, Offset offset);
 protected:
@@ -25,7 +23,7 @@
 DumpPage(std::weak_ptr dump, Offset offset);
 
 virtual void Write() override;
-void Write(DiffWriter *diffWriter);
+void WriteDiff(DiffWriter& diffWriter);
 virtual std::uint32_t NewLength() override;
 
 static Page ReadCore(std::istream &stream, bool includeRevisionIds);
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index 96cfa67..d1423fe 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -31,39 +31,38 @@
 oldPage = this->page->page;
 this->page->page = *page;
 unset(unvisitedPageIds, pageId);
+
+if (diffWriter != nullptr)
+this->page->WriteDiff(*diffWriter);
 }
 
 void DumpWriter::AddRevision(const std::shared_ptr revision)
 {
 page->page.RevisionIds.insert(revision->RevisionId);
-revisions.push_back(revision);
+
+DumpRevision dumpRevision(dump, revision->RevisionId, false);
+dumpRevision.revision = *revision;
+
+if (diffWriter != nullptr)
+{
+bool isNew;
+std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
+
+if (isNew)
+diffWriter->NewModelFormat(modelFormatId, 
dumpRevision.revision.Model, dumpRevision.revision.Format);
+}
+
+bool newRevision = !contains(oldPage.RevisionIds, revision->RevisionId);
+
+if (newRevision)
+newRevisionIds.insert(revision->RevisionId);
+
+dumpRevision.Write(diffWriter.get(), newRevision);
 }
 
 void DumpWriter::EndPage()
 {
-page->Write(diffWriter.get());
-
-for (auto revision : revisions)
-{
-DumpRevision dumpRevision(dump, revision->RevisionId, false);
-dumpRevision.revision = *revision;
-
-if (diffWriter != nullptr)
-{
-bool isNew;
-std::uint8_t modelFormatId = dumpRevision.GetModelFormatId(isNew);
-
-if (isNew)
-diffWriter->NewModelFormat(modelFormatId, 
dumpRevision.revision.Model, dumpRevision.revision.Format);
-}
-
-bool newRevision = !contains(oldPage.RevisionIds, 
revision->RevisionId);
-
-if (newRevision)
-newRevisionIds.insert(revision->RevisionId);
-
-dumpRevision.Write(diffWriter.get(), newRevision);
-}
+page->Write();
 
 auto deletedRevisionIds = except(oldPage.Re