Petr Onderka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/83785


Change subject: Don't save namespace as part of title
......................................................................

Don't save namespace as part of title

Change-Id: I794c19db9e36d11d6d05e3bbd7f6f3fe26fb8b0a
---
M DumpWriters/ArticlesWriterWrapper.cpp
M DumpWriters/ArticlesWriterWrapper.h
M DumpWriters/CompositeWriter.cpp
M DumpWriters/CompositeWriter.h
M DumpWriters/DumpWriter.cpp
M DumpWriters/DumpWriter.h
M DumpWriters/IDumpWriter.h
M DumpWriters/WriterWrapper.cpp
M DumpWriters/WriterWrapper.h
M XmlInput/XmlPageProcessor.cpp
M XmlWriter.cpp
11 files changed, 43 insertions(+), 14 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/85/83785/1

diff --git a/DumpWriters/ArticlesWriterWrapper.cpp b/DumpWriters/ArticlesWriterWrapper.cpp
index a17c385..4554784 100644
--- a/DumpWriters/ArticlesWriterWrapper.cpp
+++ b/DumpWriters/ArticlesWriterWrapper.cpp
@@ -3,12 +3,12 @@
 
 const std::int16_t UserNamespace = 2;
 
-void ArticlesWriterWrapper::StartPage(const std::shared_ptr<const Page> page)
+void ArticlesWriterWrapper::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace)
 {
     pageInlcuded = page->Namespace % 2 == 0 && page->Namespace != UserNamespace;
 
     if (pageInlcuded)
-        wrapped->StartPage(page);
+        wrapped->StartPage(page, titleWithNamespace);
 }
 
 void ArticlesWriterWrapper::AddRevision(const std::shared_ptr<const Revision> revision)
diff --git a/DumpWriters/ArticlesWriterWrapper.h b/DumpWriters/ArticlesWriterWrapper.h
index 366f11e..fd7c99b 100644
--- a/DumpWriters/ArticlesWriterWrapper.h
+++ b/DumpWriters/ArticlesWriterWrapper.h
@@ -11,7 +11,7 @@
         : WriterWrapper(std::move(wrapped)), pageInlcuded(false)
     {}
 
-    virtual void StartPage(const std::shared_ptr<const Page> page) override;
+    virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override;
     virtual void AddRevision(const std::shared_ptr<const Revision> revision) override;
     virtual void EndPage() override;
     virtual void SetDumpKind(DumpKind dumpKind) override;
diff --git a/DumpWriters/CompositeWriter.cpp b/DumpWriters/CompositeWriter.cpp
index c93dde4..ad75049 100644
--- a/DumpWriters/CompositeWriter.cpp
+++ b/DumpWriters/CompositeWriter.cpp
@@ -1,10 +1,10 @@
 #include "CompositeWriter.h"
 #include "../DumpException.h"
 
-void CompositeWriter::StartPage(const std::shared_ptr<const Page> page)
+void CompositeWriter::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace)
 {
     for (auto &writer : writers)
-        writer->StartPage(page);
+        writer->StartPage(page, titleWithNamespace);
 }
 
 void CompositeWriter::AddRevision(const std::shared_ptr<const Revision> revision)
diff --git a/DumpWriters/CompositeWriter.h b/DumpWriters/CompositeWriter.h
index 5352e80..8966f52 100644
--- a/DumpWriters/CompositeWriter.h
+++ b/DumpWriters/CompositeWriter.h
@@ -15,7 +15,7 @@
         : writers(std::move(writers)), getTextFunction(getTextFunction)
     {}
 
-    virtual void StartPage(const std::shared_ptr<const Page> page) override;
+    virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override;
     virtual void AddRevision(const std::shared_ptr<const Revision> revision) override;
     virtual void EndPage() override;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
diff --git a/DumpWriters/DumpWriter.cpp b/DumpWriters/DumpWriter.cpp
index d1423fe..6183c4c 100644
--- a/DumpWriters/DumpWriter.cpp
+++ b/DumpWriters/DumpWriter.cpp
@@ -5,6 +5,21 @@
 #include "../Indexes/Index.h"
 #include "../format.h"
 
+void DumpWriter::RemoveNamespace(Page& page)
+{
+    std::string namespapceName = dump->siteInfo->siteInfo.Namespaces.at(page.Namespace).second;
+
+    if (namespapceName.empty())
+        return;
+
+    namespapceName.append(":");
+
+    if (page.Title.substr(0, namespapceName.length()) != namespapceName)
+        throw DumpException();
+
+    page.Title.erase(0, namespapceName.length());
+}
+
 DumpWriter::DumpWriter(std::shared_ptr<WritableDump> dump, bool withText, std::unique_ptr<DiffWriter> diffWriter)
     : dump(dump), withText(withText), diffWriter(std::move(diffWriter))
 {
@@ -24,12 +39,17 @@
         diffWriter->SetSiteInfo(*siteInfo, dump->fileHeader.Kind);
 }
 
-void DumpWriter::StartPage(const std::shared_ptr<const Page> page)
+void DumpWriter::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace)
 {
     std::uint32_t pageId = page->PageId;
+
     this->page = std::unique_ptr<DumpPage>(new DumpPage(dump, pageId));
     oldPage = this->page->page;
     this->page->page = *page;
+
+    if (titleWithNamespace)
+        RemoveNamespace(this->page->page);
+
     unset(unvisitedPageIds, pageId);
 
     if (diffWriter != nullptr)
diff --git a/DumpWriters/DumpWriter.h b/DumpWriters/DumpWriter.h
index 233f9ea..a1fee36 100644
--- a/DumpWriters/DumpWriter.h
+++ b/DumpWriters/DumpWriter.h
@@ -19,10 +19,11 @@
     std::unordered_set<std::uint32_t> newRevisionIds;
     bool withText;
 
+    void RemoveNamespace(Page& page);
 public:
     DumpWriter(std::shared_ptr<WritableDump> dump, bool withText, std::unique_ptr<DiffWriter> diffWriter = nullptr);
 
-    virtual void StartPage(const std::shared_ptr<const Page> page) override;
+    virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override;
     virtual void AddRevision(const std::shared_ptr<const Revision> revision) override;
     virtual void EndPage() override;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
diff --git a/DumpWriters/IDumpWriter.h b/DumpWriters/IDumpWriter.h
index 22c717f..42cde90 100644
--- a/DumpWriters/IDumpWriter.h
+++ b/DumpWriters/IDumpWriter.h
@@ -9,7 +9,7 @@
 class IDumpWriter
 {
 public:
-    virtual void StartPage(const std::shared_ptr<const Page> page) = 0;
+    virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) = 0;
     virtual void AddRevision(const std::shared_ptr<const Revision> revision) = 0;
     virtual void EndPage() = 0;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) = 0;
diff --git a/DumpWriters/WriterWrapper.cpp b/DumpWriters/WriterWrapper.cpp
index 1cc81b4..697f89f 100644
--- a/DumpWriters/WriterWrapper.cpp
+++ b/DumpWriters/WriterWrapper.cpp
@@ -1,8 +1,8 @@
 #include "WriterWrapper.h"
 
-void WriterWrapper::StartPage(const std::shared_ptr<const Page> page)
+void WriterWrapper::StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace)
 {
-    wrapped->StartPage(page);
+    wrapped->StartPage(page, titleWithNamespace);
 }
 
 void WriterWrapper::AddRevision(const std::shared_ptr<const Revision> revision)
diff --git a/DumpWriters/WriterWrapper.h b/DumpWriters/WriterWrapper.h
index af648e2..7718133 100644
--- a/DumpWriters/WriterWrapper.h
+++ b/DumpWriters/WriterWrapper.h
@@ -11,7 +11,7 @@
         : wrapped(std::move(wrapped))
     {}
 
-    virtual void StartPage(const std::shared_ptr<const Page> page) override;
+    virtual void StartPage(const std::shared_ptr<const Page> page, bool titleWithNamespace) override;
     virtual void AddRevision(const std::shared_ptr<const Revision> revision) override;
     virtual void EndPage() override;
     virtual void SetSiteInfo(const std::shared_ptr<const SiteInfo> siteInfo) override;
diff --git a/XmlInput/XmlPageProcessor.cpp b/XmlInput/XmlPageProcessor.cpp
index 75c1907..36c6855 100644
--- a/XmlInput/XmlPageProcessor.cpp
+++ b/XmlInput/XmlPageProcessor.cpp
@@ -9,7 +9,7 @@
 {
     if (!pageWritten)
     {
-        dumpWriter->StartPage(page);
+        dumpWriter->StartPage(page, true);
         pageWritten = true;
     }
 }
diff --git a/XmlWriter.cpp b/XmlWriter.cpp
index a7e5cf1..f89027a 100644
--- a/XmlWriter.cpp
+++ b/XmlWriter.cpp
@@ -69,7 +69,12 @@
 
         output.BeginElement("page");
 
-        output.WriteElement("title", escapeElementText(page.Title));
+        std::string title = page.Title;
+        std::string ns = dump->siteInfo->siteInfo.Namespaces.at(page.Namespace).second;
+        if (!ns.empty())
+            title = ns + ':' + title;
+
+        output.WriteElement("title", escapeElementText(title));
         output.WriteElement("ns", page.Namespace);
         output.WriteElement("id", page.PageId);
 
@@ -84,6 +89,9 @@
         {
             auto revision = DumpRevision(dump, revisionId, true).revision;
 
+            if (revision.RevisionId != revisionId)
+                throw DumpException();
+
             output.BeginElement("revision");
 
             output.WriteElement("id", revision.RevisionId);

-- 
To view, visit https://gerrit.wikimedia.org/r/83785
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I794c19db9e36d11d6d05e3bbd7f6f3fe26fb8b0a
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <gsv...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to