Petr Onderka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/71995


Change subject: starting with dump format: file header
......................................................................

starting with dump format: file header

Change-Id: I29350dbe9de280efa1248a6c87d174fa6494d32e
---
A Dump.cpp
A Dump.h
A DumpObject.cpp
A DumpObject.h
A FileHeader.cpp
A FileHeader.h
M Incremental dumps.vcxproj
A Offset.cpp
A Offset.h
M main.cpp
10 files changed, 246 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/incremental refs/changes/95/71995/1

diff --git a/Dump.cpp b/Dump.cpp
new file mode 100644
index 0000000..5a0d77f
--- /dev/null
+++ b/Dump.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <fstream>
+#include "Dump.h"
+
+using std::unique_ptr;
+using std::move;
+using std::string;
+using std::fstream;
+using std::ios;
+
+// Wraps an already opened stream; the dump takes ownership of it.
+ReadableDump::ReadableDump(unique_ptr<iostream> stream)
+    : stream(std::move(stream))
+{
+}
+
+// Opens the file called fileName as a binary input stream.
+// No check is made here that the file was actually opened.
+ReadableDump::ReadableDump(string fileName)
+    : stream(new fstream(fileName, ios::binary | ios::in))
+{
+}
+
+// Opens fileName as a binary stream for both reading and writing.
+//
+// The file is first opened without truncation, so that existing content is
+// kept. If that does not succeed, it is opened again with ios::trunc, which
+// creates the file. The returned stream throws on failbit and badbit.
+//
+// The stream is held by a unique_ptr the whole time, so neither the first
+// (unopened) fstream nor the final one can leak, even when exceptions()
+// throws because the stream is already in a failed state.
+unique_ptr<iostream> WritableDump::openStream(string fileName)
+{
+    unique_ptr<fstream> stream(new fstream(fileName, ios::in | ios::out | ios::binary));
+
+    if (!stream->is_open())
+    {
+        // this feels dangerous, isn't there a better way?
+        // reset() destroys the unopened stream before the new one takes its place
+        stream.reset(new fstream(fileName, ios::in | ios::out | ios::binary | ios::trunc));
+    }
+
+    stream->exceptions(ios::failbit | ios::badbit);
+
+    return unique_ptr<iostream>(move(stream));
+}
+
+// Opens the dump file fileName for reading and writing.
+// When the stream already has content, the file header is read from it;
+// otherwise a default header is written at offset 0.
+WritableDump::WritableDump(string fileName)
+    : ReadableDump(openStream(fileName))
+{
+    const bool hasContent = stream->peek() != EOF;
+
+    if (hasContent)
+    {
+        fileHeader = FileHeader::Read(stream);
+        return;
+    }
+
+    // presumably needed because peek() at the end of the stream left state
+    // flags set; reset them before writing
+    stream->clear();
+    fileHeader = FileHeader();
+    fileHeader.Write(stream, 0);
+}
\ No newline at end of file
diff --git a/Dump.h b/Dump.h
new file mode 100644
index 0000000..e995d1b
--- /dev/null
+++ b/Dump.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <iostream>
+#include "FileHeader.h"
+
+using std::int64_t;
+using std::unique_ptr;
+using std::string;
+using std::iostream;
+
+// Exception thrown when dump data is invalid or cannot be read.
+// Inherits publicly from std::exception (the default for 'class' would be
+// private), so that handlers catching std::exception also catch it.
+class DumpException : public std::exception
+{
+};
+
+// A dump backed by a stream that can be read from.
+class ReadableDump
+{
+public:
+    // Opens the file called fileName for binary reading.
+    ReadableDump(string fileName);
+
+protected:
+    // Takes ownership of an already opened stream; for use by derived classes.
+    ReadableDump(unique_ptr<iostream> stream);
+
+    // Stream over the dump data, owned by this object.
+    unique_ptr<iostream> stream;
+};
+
+// A dump that can be written to as well as read from.
+class WritableDump : public ReadableDump
+{
+public:
+    // Opens the dump file called fileName, creating it if it cannot be opened.
+    WritableDump(string fileName);
+
+private:
+    // Opens fileName as a binary read/write stream.
+    static unique_ptr<iostream> openStream(string fileName);
+
+    // Header of the dump file: read from the stream, or newly written to it.
+    FileHeader fileHeader;
+};
\ No newline at end of file
diff --git a/DumpObject.cpp b/DumpObject.cpp
new file mode 100644
index 0000000..ce6715f
--- /dev/null
+++ b/DumpObject.cpp
@@ -0,0 +1,7 @@
+#include "DumpObject.h"
+
+// Writes this object to stream, starting at the given offset.
+// The put position is moved to offset first, then the actual writing is
+// delegated to WriteInternal.
+void DumpObject::Write(unique_ptr<iostream> const &stream, int64_t offset)
+{
+    iostream &output = *stream;
+    output.seekp(offset);
+
+    WriteInternal(stream);
+}
\ No newline at end of file
diff --git a/DumpObject.h b/DumpObject.h
new file mode 100644
index 0000000..f7b5c2f
--- /dev/null
+++ b/DumpObject.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <iostream>
+
+using std::int64_t;
+using std::unique_ptr;
+using std::iostream;
+
+// Base class for objects that can be written to a dump stream.
+class DumpObject
+{
+protected:
+    // Writes the object at the current put position of stream.
+    virtual void WriteInternal(unique_ptr<iostream> const &stream) = 0;
+public:
+    // Virtual, so that destroying a derived object through a DumpObject
+    // pointer is well defined.
+    virtual ~DumpObject() {}
+
+    // Moves the put position of stream to offset and writes the object there.
+    void Write(unique_ptr<iostream> const &stream, int64_t offset);
+};
\ No newline at end of file
diff --git a/FileHeader.cpp b/FileHeader.cpp
new file mode 100644
index 0000000..ddee2dc
--- /dev/null
+++ b/FileHeader.cpp
@@ -0,0 +1,37 @@
+#include "FileHeader.h"
+#include "Dump.h"
+
+// Creates a header holding the given offsets.
+FileHeader::FileHeader(Offset fileEnd, Offset pageIdIndexRoot, Offset freeSpaceIndexRoot)
+    : FileEnd(fileEnd),
+      PageIdIndexRoot(pageIdIndexRoot),
+      FreeSpaceIndexRoot(freeSpaceIndexRoot)
+{
+}
+
+// Writes the header at the current put position of stream:
+// the 4 bytes "WMID", one byte each for FileFormatVersion and
+// FileDataVersion, followed by the three offsets.
+void FileHeader::WriteInternal(unique_ptr<iostream> const &stream)
+{
+    // The version constants are copied into a local buffer instead of being
+    // written through their addresses: they are only declared in the class,
+    // so taking their address would require an out-of-class definition.
+    const char preamble[6] = { 'W', 'M', 'I', 'D', FileFormatVersion, FileDataVersion };
+    stream->write(preamble, sizeof(preamble));
+
+    FileEnd.Write(stream);
+    PageIdIndexRoot.Write(stream);
+    FreeSpaceIndexRoot.Write(stream);
+}
+
+// Reads a file header from the current get position of stream.
+//
+// Throws DumpException (by value, like the Offset constructor does) when the
+// first 6 bytes cannot be read, do not start with "WMID", or contain versions
+// other than FileFormatVersion and FileDataVersion.
+FileHeader FileHeader::Read(unique_ptr<iostream> const &stream)
+{
+    char bytes[6];
+    stream->read(bytes, 6);
+
+    const bool valid = !stream->fail()
+        && strncmp(bytes, "WMID", 4) == 0
+        && bytes[4] == FileFormatVersion
+        && bytes[5] == FileDataVersion;
+
+    if (!valid)
+        throw DumpException();
+
+    // the three offsets follow in the same order in which they are written
+    Offset fileEnd = Offset::Read(stream);
+    Offset pageIdIndexRoot = Offset::Read(stream);
+    Offset freeSpaceIndexRoot = Offset::Read(stream);
+
+    return FileHeader(fileEnd, pageIdIndexRoot, freeSpaceIndexRoot);
+}
+
+// Creates a header with all three offsets set to 0.
+FileHeader::FileHeader()
+    : FileEnd(0),
+      PageIdIndexRoot(0),
+      FreeSpaceIndexRoot(0)
+{}
\ No newline at end of file
diff --git a/FileHeader.h b/FileHeader.h
new file mode 100644
index 0000000..20e1bb1
--- /dev/null
+++ b/FileHeader.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <iostream>
+#include "DumpObject.h"
+#include "Offset.h"
+
+using std::istream;
+
+// Header of a dump file: a "WMID" marker, two version bytes and three offsets.
+class FileHeader : public DumpObject
+{
+public:
+    // Version bytes stored after the "WMID" marker; Read rejects other values.
+    static const char FileFormatVersion = 1;
+    static const char FileDataVersion = 1;
+
+    // Creates a header with all offsets set to 0.
+    FileHeader();
+
+    // Reads a header from the current get position of stream.
+    static FileHeader Read(unique_ptr<iostream> const &stream);
+
+    // Offsets stored in the header, in the order in which they are written.
+    Offset FileEnd;
+    Offset PageIdIndexRoot;
+    Offset FreeSpaceIndexRoot;
+
+protected:
+    // Writes the marker, the versions and the offsets to stream.
+    virtual void WriteInternal(unique_ptr<iostream> const &stream);
+
+private:
+    // Creates a header from explicit offsets; used by Read.
+    FileHeader(Offset fileEnd, Offset pageIdIndexRoot, Offset freeSpaceIndexRoot);
+};
\ No newline at end of file
diff --git a/Incremental dumps.vcxproj b/Incremental dumps.vcxproj
index c22e91a..63131ae 100644
--- a/Incremental dumps.vcxproj
+++ b/Incremental dumps.vcxproj
@@ -78,7 +78,11 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClCompile Include="Dump.cpp" />
+    <ClCompile Include="DumpObject.cpp" />
+    <ClCompile Include="FileHeader.cpp" />
     <ClCompile Include="main.cpp" />
+    <ClCompile Include="Offset.cpp" />
     <ClCompile Include="TestDumpWriter.cpp" />
     <ClCompile Include="XmlPageProcessor.cpp" />
     <ClCompile Include="XmlRevisionProcessor.cpp" />
@@ -90,7 +94,11 @@
     <ClCompile Include="XML\xmloutput.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="Dump.h" />
+    <ClInclude Include="DumpObject.h" />
     <ClInclude Include="DumpWriter.h" />
+    <ClInclude Include="FileHeader.h" />
+    <ClInclude Include="Offset.h" />
     <ClInclude Include="Page.h" />
     <ClInclude Include="Revision.h" />
     <ClInclude Include="TestDumpWriter.h" />
diff --git a/Offset.cpp b/Offset.cpp
new file mode 100644
index 0000000..d6097e0
--- /dev/null
+++ b/Offset.cpp
@@ -0,0 +1,43 @@
+#include "Offset.h"
+#include "Dump.h"
+
+// Creates an offset with the given value.
+// Throws DumpException when value is negative or does not fit into 6 bytes.
+Offset::Offset(int64_t value)
+    : value(value)
+{
+    const int64_t largest = 0xFFFFFFFFFFFF; // 6 bytes
+    const bool inRange = value >= 0 && value <= largest;
+
+    if (!inRange)
+        throw DumpException();
+}
+
+// Writes the offset to stream as 6 bytes, least significant byte first.
+void Offset::Write(unique_ptr<iostream> const &stream) const
+{
+    const int byteCount = 6;
+    char bytes[byteCount];
+
+    for (int i = 0; i < byteCount; i++)
+    {
+        // byte i holds bits 8*i to 8*i+7 of the value
+        bytes[i] = (value >> (8 * i)) & 0xFF;
+    }
+
+    stream->write(bytes, byteCount);
+}
+
+// Reads a 6-byte offset, least significant byte first, from stream.
+//
+// Throws DumpException (by value, like the Offset constructor does) when the
+// 6 bytes cannot be read.
+Offset Offset::Read(unique_ptr<iostream> const &stream)
+{
+    const int byteCount = 6;
+    char bytes[byteCount];
+
+    stream->read(bytes, byteCount);
+
+    if (stream->fail())
+        throw DumpException();
+
+    int64_t offset = 0;
+    for (int i = 0; i < byteCount; i++)
+    {
+        // char may be signed: go through unsigned char, so that a byte of
+        // 0x80 or more is not sign-extended into the higher bits of the result
+        const unsigned char byte = static_cast<unsigned char>(bytes[i]);
+        offset |= static_cast<int64_t>(byte) << (8 * i);
+    }
+
+    return Offset(offset);
+}
\ No newline at end of file
diff --git a/Offset.h b/Offset.h
new file mode 100644
index 0000000..f0f451a
--- /dev/null
+++ b/Offset.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <iostream>
+
+using std::int64_t;
+using std::unique_ptr;
+using std::iostream;
+
+// Position in a dump file, stored in the file as 6 bytes.
+class Offset
+{
+public:
+    // Throws DumpException when value is negative or does not fit into 6 bytes.
+    Offset(int64_t value);
+
+    // Reads an offset from the current get position of stream.
+    static Offset Read(unique_ptr<iostream> const &stream);
+
+    // Writes the offset at the current put position of stream.
+    void Write(unique_ptr<iostream> const &stream) const;
+
+    // The offset itself.
+    int64_t value;
+};
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 0bb615a..229b7b2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -4,6 +4,7 @@
 #include "XML/xmlfile.h"
 #include "TestDumpWriter.h"
 #include "XmlPageProcessor.h"
+#include "Dump.h"
 
 using std::string;
 using std::cin;
@@ -31,7 +32,7 @@
 int main(int argc, const char* argv[])
 {
     //StandardInputStream stream;
-    XML::FileInputStream stream = 
XML::FileInputStream("C:\\Users\\Svick\\Downloads\\tenwiki-20130622-pages-meta-history.xml");
+    /*XML::FileInputStream stream = 
XML::FileInputStream("C:\\Users\\Svick\\Downloads\\tenwiki-20130622-pages-meta-history.xml");
 
     XML::Input input(stream);
 
@@ -42,5 +43,7 @@
 
     TestDumpWriter writer;
 
-    input.Process(handlers, &writer);
+    input.Process(handlers, &writer);*/
+
+    WritableDump dump("test.id");
 }
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/71995
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I29350dbe9de280efa1248a6c87d174fa6494d32e
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/incremental
Gerrit-Branch: gsoc
Gerrit-Owner: Petr Onderka <gsv...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to