Physikerwelt has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/127469

Change subject: Pubmed similarity experiments
......................................................................

Pubmed similarity experiments

Improve the distance calculation for medium-sized wikis

Change-Id: Icb0b6ea3f4a387a21efc307ab68b3cc8ab62809d
---
A db/mathpagesimilarity.sql
A db/snippets/materializeMathPage9.sql
M maintenance/CalculateDistances.php
3 files changed, 20 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/MathSearch 
refs/changes/69/127469/1

diff --git a/db/mathpagesimilarity.sql b/db/mathpagesimilarity.sql
new file mode 100644
index 0000000..e881462
--- /dev/null
+++ b/db/mathpagesimilarity.sql
@@ -0,0 +1,6 @@
+CREATE TABLE /*_*/mathpagesimilarity (
+  `pagesimilarity_A` int(11) NOT NULL,
+  `pagesimilarity_B` int(11) NOT NULL,
+  `pagesimilarity_Value` double NOT NULL DEFAULT '0',
+  PRIMARY KEY (`pagesimilarity_B`,`pagesimilarity_A`)
+) /*$wgDBTableOptions*/;
diff --git a/db/snippets/materializeMathPage9.sql 
b/db/snippets/materializeMathPage9.sql
new file mode 100644
index 0000000..0ec566c
--- /dev/null
+++ b/db/snippets/materializeMathPage9.sql
@@ -0,0 +1,5 @@
+INSERT INTO mathpage9 (page_id)
+(SELECT `mathindex_page_id` from (
+SELECT `mathindex_page_id` ,count(`mathindex_anchor`)  as cnt FROM `mathindex` 
group by mathindex_page_id ) t
+WHERE
+t.cnt > 9)
\ No newline at end of file
diff --git a/maintenance/CalculateDistances.php 
b/maintenance/CalculateDistances.php
index 5e1eede..783123a 100644
--- a/maintenance/CalculateDistances.php
+++ b/maintenance/CalculateDistances.php
@@ -21,9 +21,8 @@
 
 require_once( dirname( __FILE__ ) . '/../../../maintenance/Maintenance.php' );
 
-class UpdateMath extends Maintenance {
+class CalculateDistances extends Maintenance {
        const RTI_CHUNK_SIZE = 1;
-       var $purge = false;
        var $dbw = null;
 
        /**
@@ -36,7 +35,7 @@
        public function __construct() {
                parent::__construct();
                $this->mDescription = 'Outputs page text to stdout';
-               $this->addOption( 'purge', "If set all formulae are rendered 
again from strech. (Very time consuming!)", false, false, "f" );
+               $this->addOption( 'limit', 'Only the pages with the most 
features are used. Default 2000', false, true, "l" );
                $this->addArg( 'min', "If set processing is started at the page 
with rank(pageID)>min", false );
                $this->addArg( 'max', "If set processing is stopped at the page 
with rank(pageID)<=max", false );
        }
@@ -62,7 +61,7 @@
                        $res = $this->db->selectField( 'mathpagestat', 
'pagestat_pageid', "pagestat_pageid=$n" );
                        if ( $res ) {
                                $this->dbw->begin();
-                               $fcount += self::doUpdate( $res, $this->dbw );
+                               $fcount += self::doUpdate( $res, $this->dbw , 
$this->getOption( 'limit', 2000 ) );
                        $start = microtime( true );
                        $this->dbw->commit();
                        echo " committed in " . ( microtime( true ) -$start ) . 
"s\n\n";
@@ -77,12 +76,15 @@
         * @param string $purge
         * @return number
         */
-       private static function doUpdate( $pid  , $dbw ) {
+       private static function doUpdate( $pid  , $dbw, $limit ) {
                // TODO: fix link id problem
                $sql = "INSERT IGNORE INTO 
mathpagesimilarity(pagesimilarity_A,pagesimilarity_B,pagesimilarity_Value)\n"
                                . "SELECT DISTINCT 
'.$pid.',`pagestat_pageid`,\n"
                                . "CosProd('.$pid.',`pagestat_pageid`)\n"
-                                               . "FROM `mathpagestat` WHERE 
pagestat_pageid<" . $pid;
+                               . "FROM `mathpagestat` m JOIN "
+                               . "(SELECT `pagestat_pageid` as pageid FROM 
`mathpagestat` GROUP BY `pagestat_pageid` "
+                               . "ORDER BY sum(`pagestat_featurecount`) DESC 
LIMIT $limit ) as r WHERE m.pagestat_pageid < $pid"
+                               . " AND m.pagestat_pageid=r.pageid";
                echo "writing entries for page $pid...";
                $start = microtime( true );
                $dbw->query( $sql );
@@ -94,12 +96,11 @@
         */
        public function execute() {
                $this->dbw = wfGetDB( DB_MASTER );
-               $this->purge = $this->getOption( "purge", false );
                $this->db = wfGetDB( DB_MASTER );
                $this->output( "Done.\n" );
                $this->populateSearchIndex( $this->getArg( 0, 0 ), 
$this->getArg( 1, -1 ) );
        }
 }
 
-$maintClass = "UpdateMath";
+$maintClass = "CalculateDistances";
 require_once( RUN_MAINTENANCE_IF_MAIN );

-- 
To view, visit https://gerrit.wikimedia.org/r/127469
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icb0b6ea3f4a387a21efc307ab68b3cc8ab62809d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/MathSearch
Gerrit-Branch: master
Gerrit-Owner: Physikerwelt <w...@physikerwelt.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to