On Thu, May 22, 2008 at 1:13 AM, Martin Langhoff
<[EMAIL PROTECTED]> wrote:
> On Thu, May 22, 2008 at 6:14 AM, Tomeu Vizoso <[EMAIL PROTECTED]> wrote:
>> the patch attached maintains a copy of the metadata of each object
>> outside the xapian index. How it works:
>
> Fantastic. Except that... erm... arhm... you forgot the patch ;-)

Ouch.

>> - at every create and update, a json file is created next to the object's 
>> file,
>>
>> - it's also deleted along the object,
>>
>> - at startup, if the file <datastore_path>/.metadata.exported doesn't
>> exist, check how many objects need to get their metadata exported
>> (0.8s for 3000 entries)
>
> That's pretty good.
>
>> - in an idle callback, process each of those objects one per iteration
>> (3ms per entry with simplejson, 2ms with cjson).
>
> Exporting a few 100 per iteration probably is more efficient ;-)

Yes, but does it need to be? We are balancing the speed at which
metadata gets exported _the first time after the update to 8.2_
against the usability of Sugar during that limited period of time.

Anyway, feel free to play with the number of entries exported at each
go; the code makes that quite easy to change.
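
Just to illustrate, a rough sketch of what a batched variant could look
like (this is not part of the attached patch; BATCH_SIZE and
_export_metadata_batch are made-up names, but the indexmanager and
_store_metadata helpers are the same ones the patch uses):

    BATCH_SIZE = 100  # hypothetical tunable, not a value from the patch

    def _export_metadata_batch(self, uids_to_export):
        # export up to BATCH_SIZE entries per idle iteration
        for uid in uids_to_export[-BATCH_SIZE:]:
            props = self.indexmanager.get(uid).properties
            self._store_metadata(uid, props)
        del uids_to_export[-BATCH_SIZE:]
        # returning True keeps the idle callback scheduled until the list is empty
        return len(uids_to_export) > 0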

>> In my tests this has worked quite well, but I have one concern: can
>> something bad happen if we have 20k files in the same dir (for a
>> journal with 10k entries)?
>
> Ok, we can split it into a subdir (which will only have 10K files then).

Yes, this is the easiest option. I would like to hear from Dave if
there could be any problem with 10k files in the same dir.

> If there's a cost to large dirs in jfffs2 then we can use hashed dirs,
> and that change will be needed for both the main datastore storage
> _and_ the metadata files.

Yes, that's the approach I used in my DS rewrite, but I would prefer
to leave this change for a future release if possible.
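
For reference, hashed dirs could look something like this (not in the
attached patch, and the two-character bucket scheme is only an
assumption; _hashed_path is a made-up helper name):

    import os

    def _hashed_path(base, uid, suffix=''):
        # bucket files by the first two characters of the uid so no
        # single directory grows into the tens of thousands of entries
        bucket = os.path.join(base, uid[:2])
        if not os.path.isdir(bucket):
            os.makedirs(bucket)
        return os.path.join(bucket, uid + suffix)

    # e.g. _hashed_path(self.base, uid, '.metadata') for the metadata copy
    # and _hashed_path(self.base, uid) for the data file itself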

>> One side effect of this is that when (if) we agree on a new on-disk
>> data structure for the DS, it will be easier to convert than if we had
>> to extract all the metadata from the index.
>
> Yes. And as you said earlier, easy recovery if xapian goes to la-la land.

Yeah, I'm wondering if metadata should be retrieved from the json file
instead of from the index; that could give us a performance improvement
as well as increased robustness.
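
Something along these lines, maybe (again just a sketch, not in the
patch; _load_metadata is a made-up name):

    import os
    import cjson

    def _load_metadata(self, uid):
        # prefer the exported copy, fall back to xapian if the file
        # is missing or can't be decoded
        path = os.path.join(self.base, uid + '.metadata')
        try:
            f = open(path)
            try:
                return cjson.decode(f.read())
            finally:
                f.close()
        except (IOError, cjson.DecodeError):
            return self.indexmanager.get(uid).properties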

Thanks,

Tomeu
From 7c1a25140c5c2132444abe5f7d372e19d7fdda8b Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <[EMAIL PROTECTED]>
Date: Wed, 21 May 2008 20:04:42 +0200
Subject: [PATCH] Maintain a metadata copy outside the index.

---
 src/olpc/datastore/backingstore.py |   61 +++++++++++++++++++++++++++++++++--
 src/olpc/datastore/datastore.py    |   22 +------------
 2 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/src/olpc/datastore/backingstore.py b/src/olpc/datastore/backingstore.py
index fc3c05f..c46a0e9 100644
--- a/src/olpc/datastore/backingstore.py
+++ b/src/olpc/datastore/backingstore.py
@@ -28,6 +28,7 @@ import sys
 import dbus
 import xapian
 import gobject
+import cjson
 
 from olpc.datastore.xapianindex import IndexManager
 from olpc.datastore import bin_copy
@@ -215,7 +216,11 @@ class FileBackingStore(BackingStore):
         instead of a method parameter because this is less invasive for Update 1.
         """
         self.current_user_id = None
-        
+
+        # source id of an idle callback that exports metadata from the
+        # index to the file system
+        self._export_metadata_source = None
+
     # Informational
     def descriptor(self):
         """return a dict with atleast the following keys
@@ -327,7 +332,29 @@ class FileBackingStore(BackingStore):
             im.connect(index_name)
 
             self.indexmanager = im
-            
+
+        # Check that all entries have their metadata in the file system.
+        if not os.path.exists(os.path.join(self.base, '.metadata.exported')):
+            uids_to_export = []
+            uids = self.indexmanager.get_all_ids()
+
+            t = time.time()
+            for uid in uids:
+                if not os.path.exists(os.path.join(self.base, uid + '.metadata')):
+                    uids_to_export.append(uid)
+
+            if uids_to_export:
+                self._export_metadata_source = gobject.idle_add(
+                        self._export_metadata, uids_to_export)
+            else:
+                open(os.path.join(self.base, '.metadata.exported'), 'w').close()
+
+    def _export_metadata(self, uids_to_export):
+        uid = uids_to_export.pop()
+        props = self.indexmanager.get(uid).properties
+        self._store_metadata(uid, props)
+        return len(uids_to_export) > 0
+
     def bind_to(self, datastore):
         ## signal from datastore that we are being bound to it
         self.datastore = datastore
@@ -502,6 +529,23 @@ class FileBackingStore(BackingStore):
         return c.hexdigest()
         
     # File Management API
+    def _store_metadata(self, uid, props):
+        t = time.time()
+        path = os.path.join(self.base, uid + '.metadata')
+        props = props.copy()
+        for property_name in model.defaultModel.get_external_properties():
+            if property_name in props:
+                del props[property_name]
+        f = open(path, 'w')
+        f.write(cjson.encode(props))
+        f.close()
+        logging.debug('exported metadata: %r s.' % (time.time() - t))
+
+    def _delete_metadata(self, uid):
+        path = os.path.join(self.base, uid + '.metadata')
+        if os.path.exists(path):
+            os.unlink(path)
+
     def _create_completion(self, uid, props, completion, exc=None, path=None):
         if exc:
             completion(exc)
@@ -517,6 +561,7 @@ class FileBackingStore(BackingStore):
         if completion is None:
             raise RuntimeError("Completion must be valid for async create")
         uid = self.indexmanager.index(props)
+        self._store_metadata(uid, props)
         props['uid'] = uid
         if filelike:
             if isinstance(filelike, basestring):
@@ -531,6 +576,7 @@ class FileBackingStore(BackingStore):
     def create(self, props, filelike, can_move=False):
         if filelike:
             uid = self.indexmanager.index(props)
+            self._store_metadata(uid, props)
             props['uid'] = uid
             if isinstance(filelike, basestring):
                 # lets treat it as a filename
@@ -540,7 +586,9 @@ class FileBackingStore(BackingStore):
             self.indexmanager.index(props, path)
             return uid
         else:
-            return self.indexmanager.index(props)
+            uid = self.indexmanager.index(props)
+            self._store_metadata(uid, props)
+            return uid
     
     def get(self, uid, env=None, allowMissing=False, includeFile=False):
         content = self.indexmanager.get(uid)
@@ -575,6 +623,7 @@ class FileBackingStore(BackingStore):
             raise RuntimeError("Completion must be valid for async update")
 
         props['uid'] = uid
+        self._store_metadata(uid, props)
         if filelike:
             uid = self.indexmanager.index(props, filelike)
             props['uid'] = uid
@@ -590,6 +639,7 @@ class FileBackingStore(BackingStore):
 
     def update(self, uid, props, filelike=None, can_move=False):
         props['uid'] = uid
+        self._store_metadata(uid, props)
         if filelike:
             if isinstance(filelike, basestring):
                 # lets treat it as a filename
@@ -610,6 +660,7 @@ class FileBackingStore(BackingStore):
 
     def delete(self, uid, allowMissing=True):
         self._delete_external_properties(uid)
+        self._delete_metadata(uid)
 
         self.indexmanager.delete(uid)
         path = self._translatePath(uid)
@@ -617,7 +668,7 @@ class FileBackingStore(BackingStore):
             os.unlink(path)
         else:
             if not allowMissing:
-                raise KeyError("object for uid:%s missing" % uid)            
+                raise KeyError("object for uid:%s missing" % uid)
         
     def get_uniquevaluesfor(self, propertyname):
         return self.indexmanager.get_uniquevaluesfor(propertyname)
@@ -651,6 +702,8 @@ class FileBackingStore(BackingStore):
         return self.indexmanager.get_all_ids()
     
     def stop(self):
+        if self._export_metadata_source is not None:
+            gobject.source_remove(self._export_metadata_source)
         self.indexmanager.stop()
 
     def complete_indexing(self):
diff --git a/src/olpc/datastore/datastore.py b/src/olpc/datastore/datastore.py
index 67ddca9..a15d5cf 100644
--- a/src/olpc/datastore/datastore.py
+++ b/src/olpc/datastore/datastore.py
@@ -128,28 +128,10 @@ class DataStore(dbus.service.Object):
 
     ### Backup support
     def pause(self, mountpoints=None):
-        """pause the datastore, during this time it will not process
-    requests. this allows the underlying stores to be backup up via
-    traditional mechanisms
-    """
-        if mountpoints:
-            mps = [self.mountpoints[mp] for mp in mountpoints]
-        else:
-            mps = self.mountpoints.values()
-
-        for mp in mps:
-            mp.stop()
+        """ Deprecated. """
 
     def unpause(self, mountpoints=None):
-        """resume the operation of a set of paused mountpoints"""
-        if mountpoints:
-            mps = [self.mountpoints[mp] for mp in mountpoints]
-        else:
-            mps = self.mountpoints.values()
-
-        for mp in mps:
-            mp.initialize_and_load()
-        
+        """ Deprecated. """
     ### End Backups
             
     def connect_backingstore(self, uri, **kwargs):
-- 
1.5.2.5

_______________________________________________
Devel mailing list
Devel@lists.laptop.org
http://lists.laptop.org/listinfo/devel
