Physikerwelt has submitted this change and it was merged. Change subject: Pubmed similarity experiments ......................................................................
Pubmed similarity experiments: improve the distance calculation for medium-size wikis. Change-Id: Icb0b6ea3f4a387a21efc307ab68b3cc8ab62809d --- A db/mathpagesimilarity.sql A db/snippets/materializeMathPage9.sql M maintenance/CalculateDistances.php 3 files changed, 20 insertions(+), 8 deletions(-) Approvals: Physikerwelt: Looks good to me, approved jenkins-bot: Verified diff --git a/db/mathpagesimilarity.sql b/db/mathpagesimilarity.sql new file mode 100644 index 0000000..e881462 --- /dev/null +++ b/db/mathpagesimilarity.sql @@ -0,0 +1,6 @@ +CREATE TABLE /*_*/mathpagesimilarity ( + `pagesimilarity_A` int(11) NOT NULL, + `pagesimilarity_B` int(11) NOT NULL, + `pagesimilarity_Value` double NOT NULL DEFAULT '0', + PRIMARY KEY (`pagesimilarity_B`,`pagesimilarity_A`) +) /*$wgDBTableOptions*/; diff --git a/db/snippets/materializeMathPage9.sql b/db/snippets/materializeMathPage9.sql new file mode 100644 index 0000000..0ec566c --- /dev/null +++ b/db/snippets/materializeMathPage9.sql @@ -0,0 +1,5 @@ +INSERT INTO mathpage9 (page_id) +(SELECT `mathindex_page_id` from ( +SELECT `mathindex_page_id` ,count(`mathindex_anchor`) as cnt FROM `mathindex` group by mathindex_page_id ) t +WHERE +t.cnt > 9) \ No newline at end of file diff --git a/maintenance/CalculateDistances.php b/maintenance/CalculateDistances.php index 5e1eede..783123a 100644 --- a/maintenance/CalculateDistances.php +++ b/maintenance/CalculateDistances.php @@ -21,9 +21,8 @@ require_once( dirname( __FILE__ ) . '/../../../maintenance/Maintenance.php' ); -class UpdateMath extends Maintenance { +class CalculateDistances extends Maintenance { const RTI_CHUNK_SIZE = 1; - var $purge = false; var $dbw = null; /** @@ -36,7 +35,7 @@ public function __construct() { parent::__construct(); $this->mDescription = 'Outputs page text to stdout'; - $this->addOption( 'purge', "If set all formulae are rendered again from strech. 
(Very time consuming!)", false, false, "f" ); + $this->addOption( 'limit', 'Only the pages with the most features are used. Default 2000', false, true, "l" ); $this->addArg( 'min', "If set processing is started at the page with rank(pageID)>min", false ); $this->addArg( 'max', "If set processing is stopped at the page with rank(pageID)<=max", false ); } @@ -62,7 +61,7 @@ $res = $this->db->selectField( 'mathpagestat', 'pagestat_pageid', "pagestat_pageid=$n" ); if ( $res ) { $this->dbw->begin(); - $fcount += self::doUpdate( $res, $this->dbw ); + $fcount += self::doUpdate( $res, $this->dbw , $this->getOption( 'limit', 2000 ) ); $start = microtime( true ); $this->dbw->commit(); echo " committed in " . ( microtime( true ) -$start ) . "s\n\n"; @@ -77,12 +76,15 @@ * @param string $purge * @return number */ - private static function doUpdate( $pid , $dbw ) { + private static function doUpdate( $pid , $dbw, $limit ) { // TODO: fix link id problem $sql = "INSERT IGNORE INTO mathpagesimilarity(pagesimilarity_A,pagesimilarity_B,pagesimilarity_Value)\n" . "SELECT DISTINCT '.$pid.',`pagestat_pageid`,\n" . "CosProd('.$pid.',`pagestat_pageid`)\n" - . "FROM `mathpagestat` WHERE pagestat_pageid<" . $pid; + . "FROM `mathpagestat` m JOIN " + . "(SELECT `pagestat_pageid` as pageid FROM `mathpagestat` GROUP BY `pagestat_pageid` " + . "ORDER BY sum(`pagestat_featurecount`) DESC LIMIT $limit ) as r WHERE m.pagestat_pageid < $pid" + . 
" AND m.pagestat_pageid=r.pageid"; echo "writing entries for page $pid..."; $start = microtime( true ); $dbw->query( $sql ); @@ -94,12 +96,11 @@ */ public function execute() { $this->dbw = wfGetDB( DB_MASTER ); - $this->purge = $this->getOption( "purge", false ); $this->db = wfGetDB( DB_MASTER ); $this->output( "Done.\n" ); $this->populateSearchIndex( $this->getArg( 0, 0 ), $this->getArg( 1, -1 ) ); } } -$maintClass = "UpdateMath"; +$maintClass = "CalculateDistances"; require_once( RUN_MAINTENANCE_IF_MAIN ); -- To view, visit https://gerrit.wikimedia.org/r/127469 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Icb0b6ea3f4a387a21efc307ab68b3cc8ab62809d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/MathSearch Gerrit-Branch: master Gerrit-Owner: Physikerwelt <w...@physikerwelt.de> Gerrit-Reviewer: Physikerwelt <w...@physikerwelt.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits