Physikerwelt has submitted this change and it was merged. Change subject: Improvements to CalculateDistances ......................................................................
Improvements to CalculateDistances new option -9 to ignore all pages with 9 or fewer formulae Change-Id: Ic7bd3dcd2e19f269e281d5c11a2eb83c4f8772ae --- M MathObject.php M MathSearch.php M db/snippets/materializeMathPage9.sql M maintenance/CalculateDistances.php 4 files changed, 87 insertions(+), 56 deletions(-) Approvals: Physikerwelt: Looks good to me, approved jenkins-bot: Verified diff --git a/MathObject.php b/MathObject.php index 5daad86..7c52e6f 100644 --- a/MathObject.php +++ b/MathObject.php @@ -6,6 +6,7 @@ protected $pageID = 0; protected $index_timestamp = null; protected $dbLoadTime= 0; + protected $mathTableName = null; private static function DebugPrint( $s ) { // $s= Sanitizer::safeEncodeAttribute($s); @@ -130,9 +131,15 @@ } } } + + /** + * @param $identifier + * @return bool|ResultWrapper + */ public function getNouns($identifier){ $dbr = wfGetDB( DB_SLAVE ); $article = Article::newFromId( $this->pageID ); + if( ! $article ) return false; $pagename = (string)$article->getTitle();; $identifiers = $dbr->select('mathidentifier', array( 'noun', 'evidence' ), @@ -275,8 +282,22 @@ } return '<'.$arg[1]." title=\"$title\"".$attribs.'>'.$arg[4].'</'.$arg[1].'>'; } + protected function getMathTableName() { + global $wgMathAnalysisTableName; + if ( is_null( $this->mathTableName ) ){ + return $wgMathAnalysisTableName; + } else { + return $this->mathTableName; + } + } /** + * @param string $tableName mathoid or mathlatexml + */ + public function setMathTableName( $tableName ) { + $this->mathTableName = $tableName; + } + /** * @param $wikiText * @return mixed */ diff --git a/MathSearch.php b/MathSearch.php index aac812a..f981fd4 100644 --- a/MathSearch.php +++ b/MathSearch.php @@ -32,6 +32,9 @@ $wgMathSearchMWSUrl = 'http://localhost:9090/'; /** @var boolean if true the observation is updated everytime the SpecialPage formulainfo is shown. 
*/ $wgMathUpdateObservations = false; +/** @var string $wgMathAnalysisTableName mathoid or mathlatexml */ +$wgMathAnalysisTableName = 'mathlatexml'; + $dir = dirname( __FILE__ ) . '/'; $wgAutoloadClasses['MathSearchHooks'] = $dir . 'MathSearch.hooks.php'; diff --git a/db/snippets/materializeMathPage9.sql b/db/snippets/materializeMathPage9.sql index 0ec566c..cee680b 100644 --- a/db/snippets/materializeMathPage9.sql +++ b/db/snippets/materializeMathPage9.sql @@ -1,3 +1,6 @@ +CREATE TABLE /*_*/mathpage9 ( + `page_id` int(11) NOT NULL +) /*$wgDBTableOptions*/; INSERT INTO mathpage9 (page_id) (SELECT `mathindex_page_id` from ( SELECT `mathindex_page_id` ,count(`mathindex_anchor`) as cnt FROM `mathindex` group by mathindex_page_id ) t diff --git a/maintenance/CalculateDistances.php b/maintenance/CalculateDistances.php index 783123a..0cb1ea2 100644 --- a/maintenance/CalculateDistances.php +++ b/maintenance/CalculateDistances.php @@ -22,83 +22,87 @@ require_once( dirname( __FILE__ ) . '/../../../maintenance/Maintenance.php' ); class CalculateDistances extends Maintenance { - const RTI_CHUNK_SIZE = 1; + const RTI_CHUNK_SIZE = 100; + /**@var DatabaseBase $dbw */ var $dbw = null; /** * @var DatabaseBase */ private $db; + private $pagelist = array(); + /** * */ public function __construct() { parent::__construct(); $this->mDescription = 'Outputs page text to stdout'; - $this->addOption( 'limit', 'Only the pages with the most features are used. 
Default 2000', false, true, "l" ); - $this->addArg( 'min', "If set processing is started at the page with rank(pageID)>min", false ); - $this->addArg( 'max', "If set processing is stopped at the page with rank(pageID)<=max", false ); + $this->addOption( 'page9', 'Ignore pages with only 9 equations or less.', false, false, "9" ); + $this->addArg( 'min', "If set processing is started at the page with curid>min", false ); + $this->addArg( 'max', "If set processing is stopped at the page with curid<=max", false ); } - /** - * Populates the search index with content from all pages - */ - protected function populateSearchIndex( $n = 0, $cmax = -1 ) { - $res = $this->db->select( 'page', 'MAX(page_id) AS count' ); - $s = $this->db->fetchObject( $res ); - $count = $s->count; - if ( $cmax > 0 && $count > $cmax ) { - $count = $cmax; - } - $this->output( "Rebuilding index fields for {$count} pages with option {$this->purge}...\n" ); - $fcount = 0; - while ( $n < $count ) { - if ( $n ) { - $this->output( $n . " of $count \n" ); - } - $end = $n + self::RTI_CHUNK_SIZE - 1; - - $res = $this->db->selectField( 'mathpagestat', 'pagestat_pageid', "pagestat_pageid=$n" ); - if ( $res ) { - $this->dbw->begin(); - $fcount += self::doUpdate( $res, $this->dbw , $this->getOption( 'limit', 2000 ) ); - $start = microtime( true ); - $this->dbw->commit(); - echo " committed in " . ( microtime( true ) -$start ) . "s\n\n"; - } - $n += self::RTI_CHUNK_SIZE; - } - } - /** - * @param unknown $pId - * @param unknown $pText - * @param string $pTitle - * @param string $purge - * @return number - */ - private static function doUpdate( $pid , $dbw, $limit ) { - // TODO: fix link id problem - $sql = "INSERT IGNORE INTO mathpagesimilarity(pagesimilarity_A,pagesimilarity_B,pagesimilarity_Value)\n" - . "SELECT DISTINCT '.$pid.',`pagestat_pageid`,\n" - . "CosProd('.$pid.',`pagestat_pageid`)\n" - . "FROM `mathpagestat` m JOIN " - . 
"(SELECT `pagestat_pageid` as pageid FROM `mathpagestat` GROUP BY `pagestat_pageid` " - . "ORDER BY sum(`pagestat_featurecount`) DESC LIMIT $limit ) as r WHERE m.pagestat_pageid < $pid" - . " AND m.pagestat_pageid=r.pageid"; - echo "writing entries for page $pid..."; - $start = microtime( true ); - $dbw->query( $sql ); - echo 'done in ' . ( microtime( true ) -$start ) . "\n"; - return 1; - } /** * */ public function execute() { $this->dbw = wfGetDB( DB_MASTER ); $this->db = wfGetDB( DB_MASTER ); + $this->pagelist = array(); + $min = $this->getArg( 0, 0 ); + $max = $this->getArg( 1, PHP_INT_MAX ); + $conds = "pagestat_pageid >= $min"; + if ( $max < PHP_INT_MAX ) { + $conds .= " AND pagestat_pageid <= $max"; + } + if ( $this->getOption( 'page9', false ) ) { + $res = $this->db->select( array( 'mathpage9' , 'mathpagestat'), array( 'page_id' ,'pagestat_pageid') , + $conds . ' AND pagestat_pageid = page_id', __METHOD__, array( 'DISTINCT' ) ); + } else { + $res = $this->db->select( 'mathpagestat', 'pagestat_pageid', $conds, __METHOD__, array( 'DISTINCT' ) ); + } + foreach ( $res as $row ) { + array_push( $this->pagelist, $row->pagestat_pageid ); + } + $this->populateSearchIndex(); $this->output( "Done.\n" ); - $this->populateSearchIndex( $this->getArg( 0, 0 ), $this->getArg( 1, -1 ) ); + } + + /** + * Populates the search index with content from all pages + */ + protected function populateSearchIndex( ) { + $n = 0; + $count = sizeof($this->pagelist); + $this->output( "Rebuilding index fields for $count pages...\n" ); + while ( $n < $count ) { + if ( $n ) { + $this->output( $n . " of $count \n" ); + } + $this->dbw->begin(); + for($j=0;$j<self::RTI_CHUNK_SIZE;$j++){ + //TODO: USE PREPARED STATEMENTS + $pid = $this->pagelist[$n]; + $sql = "INSERT IGNORE INTO mathpagesimilarity(pagesimilarity_A,pagesimilarity_B,pagesimilarity_Value)\n" + . "SELECT DISTINCT $pid,`pagestat_pageid`,\n" + . 
"CosProd( $pid,`pagestat_pageid`) FROM `mathpagestat` m "; + if ( $this->getOption( 'page9', false ) ){ + $sql .= " JOIN (SELECT page_id from mathpage9) as r WHERE m.pagestat_pageid=r.page_id AND "; + } else { + $sql .= " WHERE "; + } + $sql .= "m.pagestat_pageid < $pid "; + echo "writing entries for page $pid..."; + $start = microtime( true ); + $this->dbw->query( $sql ); + echo 'done in ' . ( microtime( true ) - $start ) . "\n"; + $n++; + } + $start = microtime( true ); + $this->dbw->commit(); + echo " committed in " . ( microtime( true ) - $start ) . "s\n\n"; + } } } -- To view, visit https://gerrit.wikimedia.org/r/127634 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ic7bd3dcd2e19f269e281d5c11a2eb83c4f8772ae Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/MathSearch Gerrit-Branch: master Gerrit-Owner: Physikerwelt <w...@physikerwelt.de> Gerrit-Reviewer: Physikerwelt <w...@physikerwelt.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits