Hi Anthony
I did some profiling with line_profiler.
Here are the results for a table with an index:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1469 def readWhere(
self, condition, condvars=None, field=None,
1470
start=None, stop=None, step=None ):
1471 """
1480 """
1481 1 8 8.0 0.0
self._checkFieldIfNumeric(field)
1482
1483 1 2 2.0 0.0 coords = [
p.nrow for p in
1484 6951894 25942594 3.7 92.4
self._where(condition, condvars, start, stop, step) ]
1485 1 5 5.0 0.0
self._whereCondition = None # reset the conditions
1486 1 6 6.0 0.0 if len(coords) > 1:
1487 1 13 13.0 0.0 cstart,
cstop = coords[0], coords[-1]+1
1488 1 5 5.0 0.0 if cstop
- cstart == len(coords):
1489 #
Chances for monotonically increasing row values. Refine.
1490 1 4 4.0 0.0
inc_seq = numpy.alltrue(
1491 1 1124713 1124713.0 4.0
numpy.arange(cstart, cstop) == numpy.array(coords))
1492 1 5 5.0 0.0 if inc_seq:
1493 1 1000172 1000172.0 3.6
return self.read(cstart, cstop, field=field)
1494 return
self.readCoordinates(coords, field)
File: profile.py
Function: function_to_test at line 11
Total time: 39.1439 s
the profile for the _where() function looks like this:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1423 def _where( self,
condition, condvars,
1424
start=None, stop=None, step=None ):
1425 """Low-level
counterpart of `self.where()`."""
1426 1 2 2.0 0.0 if profile:
tref = time()
1427 1 1 1.0 0.0 if profile:
show_stats("Entering table._where", tref)
1428 # Adjust the
slice to be used.
1429 1 20 20.0 0.0 (start, stop,
step) = self._processRangeRead(start, stop, step)
1430 1 2 2.0 0.0 if start >=
stop: # empty range, reset conditions
1431
self._useIndex = False
1432
self._whereCondition = None
1433 return iter([])
1434
1435 # Compile the
condition and extract usable index conditions.
1436 1 153 153.0 0.0 condvars =
self._requiredExprVars(condition, condvars, depth=3)
1437 1 1299 1299.0 0.3 compiled =
self._compileCondition(condition, condvars)
1438
1439 # Can we use indexes?
1440 1 2 2.0 0.0 if
compiled.index_expressions:
1441 1 1 1.0 0.0 chunkmap
= _table__whereIndexed(
1442 1 439602 439602.0 99.6 self,
compiled, condition, condvars, start, stop, step)
1443 1 5 5.0 0.0 if
type(chunkmap) != numpy.ndarray:
1444 # If
it is not a NumPy array it should be an iterator
1445 #
Reset conditions
1446
self._useIndex = False
1447
self._whereCondition = None
1448 #
...and return the iterator
1449 return chunkmap
1450 else:
1451 chunkmap
= None # default to an in-kernel query
1452
1453 2 7 3.5 0.0 args =
[condvars[param] for param in compiled.parameters]
1454 1 5 5.0 0.0
self._whereCondition = (compiled.function, args)
1455 1 141 141.0 0.0 row =
tableExtension.Row(self)
1456 1 2 2.0 0.0 if profile:
show_stats("Exiting table._where", tref)
1457 1 27 27.0 0.0 return
row._iter(start, stop, step, chunkmap=chunkmap)
File: profile.py
Function: function_to_test at line 11
Total time: 22.4212 s
The profile for a non-indexed table looks like this:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1469 def readWhere(
self, condition, condvars=None, field=None,
1470
start=None, stop=None, step=None ):
1480 """
1481 1 8 8.0 0.0
self._checkFieldIfNumeric(field)
1482
1483 1 1 1.0 0.0 coords = [
p.nrow for p in
1484 6951894 14060474 2.0 87.2
self._where(condition, condvars, start, stop, step) ]
1485 1 4 4.0 0.0
self._whereCondition = None # reset the conditions
1486 1 4 4.0 0.0 if len(coords) > 1:
1487 1 10 10.0 0.0 cstart,
cstop = coords[0], coords[-1]+1
1488 1 5 5.0 0.0 if cstop
- cstart == len(coords):
1489 #
Chances for monotonically increasing row values. Refine.
1490 1 3 3.0 0.0
inc_seq = numpy.alltrue(
1491 1 1104094 1104094.0 6.8
numpy.arange(cstart, cstop) == numpy.array(coords))
1492 1 4 4.0 0.0 if inc_seq:
1493 1 959634 959634.0 6.0
return self.read(cstart, cstop, field=field)
1494 return
self.readCoordinates(coords, field)
File: profile.py
Function: function_to_test at line 11
Total time: 27.5148 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1423 def _where( self,
condition, condvars,
1424
start=None, stop=None, step=None ):
1425 """Low-level
counterpart of `self.where()`."""
1426 1 2 2.0 0.1 if profile:
tref = time()
1427 1 2 2.0 0.1 if profile:
show_stats("Entering table._where", tref)
1428 # Adjust the
slice to be used.
1429 1 29 29.0 2.1 (start, stop,
step) = self._processRangeRead(start, stop, step)
1430 1 2 2.0 0.1 if start >=
stop: # empty range, reset conditions
1431
self._useIndex = False
1432
self._whereCondition = None
1433 return iter([])
1434
1435 # Compile the
condition and extract usable index conditions.
1436 1 154 154.0 10.9 condvars =
self._requiredExprVars(condition, condvars, depth=3)
1437 1 1057 1057.0 75.0 compiled =
self._compileCondition(condition, condvars)
1438
1439 # Can we use indexes?
1440 1 2 2.0 0.1 if
compiled.index_expressions:
1441 chunkmap
= _table__whereIndexed(
1442 self,
compiled, condition, condvars, start, stop, step)
1443 if
type(chunkmap) != numpy.ndarray:
1444 # If
it is not a NumPy array it should be an iterator
1445 #
Reset conditions
1446
self._useIndex = False
1447
self._whereCondition = None
1448 #
...and return the iterator
1449 return chunkmap
1450 else:
1451 1 2 2.0 0.1 chunkmap
= None # default to an in-kernel query
1452
1453 2 4 2.0 0.3 args =
[condvars[param] for param in compiled.parameters]
1454 1 2 2.0 0.1
self._whereCondition = (compiled.function, args)
1455 1 128 128.0 9.1 row =
tableExtension.Row(self)
1456 1 2 2.0 0.1 if profile:
show_stats("Exiting table._where", tref)
1457 1 23 23.0 1.6 return
row._iter(start, stop, step, chunkmap=chunkmap)
File: profile.py
Function: function_to_test at line 11
Total time: 10.7759 s
Well I have the hdf5 file which reproduces the problem. I will try to
write a script which creates an hdf5 file that shows the same
behavior.
On Sun, Jan 22, 2012 at 11:20 PM, Anthony Scopatz <[email protected]> wrote:
> Hello Ümit,
>
> Yes this is some seriously messed up behavior. I would suggest profiling
> (kern_prof, line_profiler) to try to figure out what is going on here. This
> is counterproductive at the very least....
>
> My suspicion is that when indexes are used that BOTH the index and the table
> values themselves are being compared rather than ONLY the index being
> compared. This is just my suspicion, but it would seem to give this
> behavior. Profiling would at least tell us which function or method is the
> trouble maker.
>
> Do you have a script that reproduces this as a whole?
>
> Be Well
> Anthony
>
> On Sat, Jan 21, 2012 at 7:23 AM, Ümit Seren <[email protected]> wrote:
>>
>> I recently used ptrepack to compact my hdf5 file and forgot to activate
>> the option to propagate indexes.
>> Just out of curiosity I decided to compare performance between the two
>> tables (one with index and one without) for some queries.
>>
>> The table structure looks like this:
>>
>> "gene_mid_pos": UInt32Col(shape=(), dflt=0, pos=3),
>> "gene_start": UInt32Col(shape=(), dflt=0, pos=4),
>> "mac": Int32Col(shape=(), dflt=0, pos=5),
>> "maf": Float32Col(shape=(), dflt=0.0, pos=6),
>> "perc_var_expl": Float32Col(shape=(), dflt=0.0, pos=7),
>> "score": Float32Col(shape=(), dflt=0.0, pos=8),
>> "snp_chr": UInt8Col(shape=(), dflt=0, pos=9),
>> "snp_pos": UInt32Col(shape=(), dflt=0, pos=10)}
>> byteorder := 'little'
>> chunkshape := (4854,)
>> autoIndex := True
>> colindexes := {
>> "gene": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "gene_start": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "snp_pos": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "gene_mid_pos": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "perc_var_expl": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "mac": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "score": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "snp_chr": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "maf": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "gene_end": Index(6, medium, shuffle, zlib(1)).is_CSI=False,
>> "gene_chr": Index(6, medium, shuffle, zlib(1)).is_CSI=False}
>>
>>
>> and contains around 7 million rows (sorted after score in descending
>> order)
>>
>> To verify the results I also created a second hdf5 file with ptrepack
>> where I propagate the indexes just to make sure that compacting isn't
>> the root of the performance issue.
>> Here are my results:
>>
>> orig_table ... original table with indexes
>> com_table_no_index = compacted table without indexes
>> com_table_with_index = compacted table with indexes
>>
>> table.readWhere('score > 5')
>>
>> orig_table: 19.4s
>> com_table_with_index: 18.7s
>> com_table_no_index: 7.9s
>>
>>
>> table.readWhere('gene_chr >2') (really low selectivity because snp_chr
>> is between 1 and 5)
>>
>> orig_table: 46.3s
>> com_table_with_index: 44.8s
>> com_table_no_index: 37.8s
>>
>> table.readWhere('(maf >0.2 and maf < 0.9 )')
>>
>> orig_table: 19.4 s
>> com_table_with_index: 19.2 s
>> com_table_no_index: 9.31 s
>>
>> Both queries have a hit rate of 90%
>>
>> table.readWhere('(maf >0.21 and maf < 0.22 )')
>> orig_table: 35.8 s
>> com_table_with_index: 35.7 s
>> com_table_no_index: 29.3 s
>>
>> I tried also a query with a smaller hit-rate (10%)
>>
>>
>> table.readWhere('(gene_start >2000) & (gene_end<900000)')
>>
>> orig_table: 5.67s
>> com_table_with_index: 5.52s
>> com_table_no_index: 4.22s
>>
>>
>> table.readWhere('(snp_chr ==2 )') (really low selectivity because
>> snp_chr is between 1 and 5)
>>
>> orig_table: 12 s
>> com_table_with_index: 12.3 s
>> com_table_no_index: 9.16 s
>>
>>
>>
>> I checked with willQueryUseIndexing if the indexes are used and in all
>> cases they were.
>>
>> Here are some observations:
>>
>> - Querying for a column with high selectivity (score, maf) runs more
>> than 2x faster on a table without an index compared to one with an index.
>> - Querying for a column with low selectivity (chr) or with small
>> hit-rate still shows a difference but isn't as high as in the first
>> case.
>>
>>
>> Anyone knows why querying an indexed table is slower than an un-indexed
>> one?
>>
>> Thanks in advance
>>
>> I am using pytables 2.3
>>
>> cheers
>> Ümit
>>
>>
>> ------------------------------------------------------------------------------
>> Try before you buy = See our experts in action!
>> The most comprehensive online learning library for Microsoft developers
>> is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
>> Metro Style Apps, more. Free future releases when you subscribe now!
>> http://p.sf.net/sfu/learndevnow-dev2
>> _______________________________________________
>> Pytables-users mailing list
>> [email protected]
>> https://lists.sourceforge.net/lists/listinfo/pytables-users
>
>
>
> ------------------------------------------------------------------------------
> Try before you buy = See our experts in action!
> The most comprehensive online learning library for Microsoft developers
> is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
> Metro Style Apps, more. Free future releases when you subscribe now!
> http://p.sf.net/sfu/learndevnow-dev2
> _______________________________________________
> Pytables-users mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/pytables-users
>
------------------------------------------------------------------------------
Try before you buy = See our experts in action!
The most comprehensive online learning library for Microsoft developers
is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
Metro Style Apps, more. Free future releases when you subscribe now!
http://p.sf.net/sfu/learndevnow-dev2
_______________________________________________
Pytables-users mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/pytables-users