Physikerwelt has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/127634

Change subject: Improvements to CalculateDistances
......................................................................

Improvements to CalculateDistances

new option -9 to ignore all pages with 9
or fewer formulae

Change-Id: Ic7bd3dcd2e19f269e281d5c11a2eb83c4f8772ae
---
M MathObject.php
M MathSearch.php
M db/snippets/materializeMathPage9.sql
M maintenance/CalculateDistances.php
4 files changed, 87 insertions(+), 56 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/MathSearch 
refs/changes/34/127634/1

diff --git a/MathObject.php b/MathObject.php
index 5daad86..7c52e6f 100644
--- a/MathObject.php
+++ b/MathObject.php
@@ -6,6 +6,7 @@
        protected $pageID = 0;
        protected $index_timestamp = null;
        protected $dbLoadTime= 0;
+       protected $mathTableName = null;
 
        private static function DebugPrint( $s ) {
                // $s= Sanitizer::safeEncodeAttribute($s);
@@ -130,9 +131,15 @@
                        }
                }
        }
+
+       /**
+        * @param $identifier
+        * @return bool|ResultWrapper
+        */
        public function getNouns($identifier){
                $dbr = wfGetDB( DB_SLAVE );
                $article = Article::newFromId( $this->pageID );
+               if( ! $article ) return false;
                $pagename = (string)$article->getTitle();;
                $identifiers = $dbr->select('mathidentifier',
                        array( 'noun', 'evidence' ),
@@ -275,8 +282,22 @@
                }
                return '<'.$arg[1]." 
title=\"$title\"".$attribs.'>'.$arg[4].'</'.$arg[1].'>';
        }
+       protected function getMathTableName() {
+               global $wgMathAnalysisTableName;
+               if ( is_null( $this->mathTableName ) ){
+                       return $wgMathAnalysisTableName;
+               } else {
+                       return $this->mathTableName;
+               }
+       }
 
        /**
+        * @param string $tableName mathoid or mathlatexml
+        */
+       public function setMathTableName( $tableName ) {
+               $this->mathTableName = $tableName;
+       }
+       /**
         * @param $wikiText
         * @return mixed
         */
diff --git a/MathSearch.php b/MathSearch.php
index aac812a..f981fd4 100644
--- a/MathSearch.php
+++ b/MathSearch.php
@@ -32,6 +32,9 @@
 $wgMathSearchMWSUrl = 'http://localhost:9090/';
 /** @var boolean if true the observation is updated everytime the SpecialPage 
formulainfo is shown. */
 $wgMathUpdateObservations = false;
+/** @var string $wgMathAnalysisTableName mathoid or mathlatexml */
+$wgMathAnalysisTableName = 'mathlatexml';
+
 $dir = dirname( __FILE__ ) . '/';
 
 $wgAutoloadClasses['MathSearchHooks'] = $dir . 'MathSearch.hooks.php';
diff --git a/db/snippets/materializeMathPage9.sql 
b/db/snippets/materializeMathPage9.sql
index 0ec566c..cee680b 100644
--- a/db/snippets/materializeMathPage9.sql
+++ b/db/snippets/materializeMathPage9.sql
@@ -1,3 +1,6 @@
+CREATE TABLE /*_*/mathpage9 (
+  `page_id` int(11) NOT NULL
+) /*$wgDBTableOptions*/;
 INSERT INTO mathpage9 (page_id)
 (SELECT `mathindex_page_id` from (
 SELECT `mathindex_page_id` ,count(`mathindex_anchor`)  as cnt FROM `mathindex` 
group by mathindex_page_id ) t
diff --git a/maintenance/CalculateDistances.php 
b/maintenance/CalculateDistances.php
index 783123a..0cb1ea2 100644
--- a/maintenance/CalculateDistances.php
+++ b/maintenance/CalculateDistances.php
@@ -22,83 +22,87 @@
 require_once( dirname( __FILE__ ) . '/../../../maintenance/Maintenance.php' );
 
 class CalculateDistances extends Maintenance {
-       const RTI_CHUNK_SIZE = 1;
+       const RTI_CHUNK_SIZE = 100;
+       /**@var DatabaseBase $dbw */
        var $dbw = null;
 
        /**
         * @var DatabaseBase
         */
        private $db;
+       private $pagelist = array();
+
        /**
         *
         */
        public function __construct() {
                parent::__construct();
                $this->mDescription = 'Outputs page text to stdout';
-               $this->addOption( 'limit', 'Only the pages with the most 
features are used. Default 2000', false, true, "l" );
-               $this->addArg( 'min', "If set processing is started at the page 
with rank(pageID)>min", false );
-               $this->addArg( 'max', "If set processing is stopped at the page 
with rank(pageID)<=max", false );
+               $this->addOption( 'page9', 'Ignore pages with only 9 equations 
or less.', false, false, "9" );
+               $this->addArg( 'min', "If set processing is started at the page 
with curid>min", false );
+               $this->addArg( 'max', "If set processing is stopped at the page 
with curid<=max", false );
        }
-       /**
-        * Populates the search index with content from all pages
-        */
-       protected function populateSearchIndex( $n = 0, $cmax = -1 ) {
-               $res = $this->db->select( 'page', 'MAX(page_id) AS count' );
-               $s = $this->db->fetchObject( $res );
-               $count = $s->count;
-               if ( $cmax > 0 && $count > $cmax ) {
-                       $count = $cmax;
-               }
-               $this->output( "Rebuilding index fields for {$count} pages with 
option {$this->purge}...\n" );
-               $fcount = 0;
 
-               while ( $n < $count ) {
-                       if ( $n ) {
-                               $this->output( $n . " of $count \n" );
-                       }
-                       $end = $n + self::RTI_CHUNK_SIZE - 1;
-
-                       $res = $this->db->selectField( 'mathpagestat', 
'pagestat_pageid', "pagestat_pageid=$n" );
-                       if ( $res ) {
-                               $this->dbw->begin();
-                               $fcount += self::doUpdate( $res, $this->dbw , 
$this->getOption( 'limit', 2000 ) );
-                       $start = microtime( true );
-                       $this->dbw->commit();
-                       echo " committed in " . ( microtime( true ) -$start ) . 
"s\n\n";
-                       }
-                       $n += self::RTI_CHUNK_SIZE;
-               }
-       }
-       /**
-        * @param unknown $pId
-        * @param unknown $pText
-        * @param string $pTitle
-        * @param string $purge
-        * @return number
-        */
-       private static function doUpdate( $pid  , $dbw, $limit ) {
-               // TODO: fix link id problem
-               $sql = "INSERT IGNORE INTO 
mathpagesimilarity(pagesimilarity_A,pagesimilarity_B,pagesimilarity_Value)\n"
-                               . "SELECT DISTINCT 
'.$pid.',`pagestat_pageid`,\n"
-                               . "CosProd('.$pid.',`pagestat_pageid`)\n"
-                               . "FROM `mathpagestat` m JOIN "
-                               . "(SELECT `pagestat_pageid` as pageid FROM 
`mathpagestat` GROUP BY `pagestat_pageid` "
-                               . "ORDER BY sum(`pagestat_featurecount`) DESC 
LIMIT $limit ) as r WHERE m.pagestat_pageid < $pid"
-                               . " AND m.pagestat_pageid=r.pageid";
-               echo "writing entries for page $pid...";
-               $start = microtime( true );
-               $dbw->query( $sql );
-               echo 'done in ' . ( microtime( true ) -$start ) . "\n";
-               return 1;
-       }
        /**
         *
         */
        public function execute() {
                $this->dbw = wfGetDB( DB_MASTER );
                $this->db = wfGetDB( DB_MASTER );
+               $this->pagelist = array();
+               $min = $this->getArg( 0, 0 );
+               $max = $this->getArg( 1, PHP_INT_MAX );
+               $conds = "pagestat_pageid >= $min";
+               if ( $max < PHP_INT_MAX ) {
+                       $conds .= " AND pagestat_pageid <= $max";
+               }
+               if ( $this->getOption( 'page9', false ) ) {
+                       $res = $this->db->select( array( 'mathpage9' , 
'mathpagestat'), array( 'page_id' ,'pagestat_pageid') ,
+                               $conds . ' AND pagestat_pageid = page_id',  
__METHOD__, array( 'DISTINCT' ) );
+               } else {
+                       $res = $this->db->select( 'mathpagestat', 
'pagestat_pageid', $conds, __METHOD__, array( 'DISTINCT' ) );
+               }
+               foreach ( $res as $row ) {
+                       array_push( $this->pagelist, $row->pagestat_pageid );
+               }
+               $this->populateSearchIndex();
                $this->output( "Done.\n" );
-               $this->populateSearchIndex( $this->getArg( 0, 0 ), 
$this->getArg( 1, -1 ) );
+       }
+
+       /**
+        * Populates the search index with content from all pages
+        */
+       protected function populateSearchIndex( ) {
+               $n = 0;
+               $count = sizeof($this->pagelist);
+               $this->output( "Rebuilding index fields for $count pages...\n" 
);
+               while ( $n < $count ) {
+                       if ( $n ) {
+                               $this->output( $n . " of $count \n" );
+                       }
+                               $this->dbw->begin();
+                       for($j=0;$j<self::RTI_CHUNK_SIZE;$j++){
+                               //TODO: USE PREPARED STATEMENTS
+                               $pid = $this->pagelist[$n];
+                               $sql = "INSERT IGNORE INTO 
mathpagesimilarity(pagesimilarity_A,pagesimilarity_B,pagesimilarity_Value)\n"
+                                       . "SELECT DISTINCT 
$pid,`pagestat_pageid`,\n"
+                                       . "CosProd( $pid,`pagestat_pageid`) 
FROM `mathpagestat` m ";
+                               if ( $this->getOption( 'page9', false ) ){
+                                       $sql .= " JOIN (SELECT page_id from 
mathpage9) as r WHERE m.pagestat_pageid=r.page_id AND ";
+                               } else {
+                                       $sql .= " WHERE ";
+                               }
+                               $sql .= "m.pagestat_pageid < $pid ";
+                               echo "writing entries for page $pid...";
+                               $start = microtime( true );
+                               $this->dbw->query( $sql );
+                               echo 'done in ' . ( microtime( true ) - $start 
) . "\n";
+                               $n++;
+                       }
+                       $start = microtime( true );
+                       $this->dbw->commit();
+                       echo " committed in " . ( microtime( true ) - $start ) 
. "s\n\n";
+               }
        }
 }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/127634
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic7bd3dcd2e19f269e281d5c11a2eb83c4f8772ae
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/MathSearch
Gerrit-Branch: master
Gerrit-Owner: Physikerwelt <w...@physikerwelt.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to