Tobias Gritschacher has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/321011 )

Change subject: Script to recalculate normalized hashes
......................................................................


Script to recalculate normalized hashes

Change-Id: I83181ffb3a78bd98fb34bfcbb8613a7618816f23
---
A maintenance/recalculateCognateNormalizedHashes.php
1 file changed, 151 insertions(+), 0 deletions(-)

Approvals:
  Daniel Kinzler: Looks good to me, approved
  Addshore: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/maintenance/recalculateCognateNormalizedHashes.php 
b/maintenance/recalculateCognateNormalizedHashes.php
new file mode 100644
index 0000000..cacb312
--- /dev/null
+++ b/maintenance/recalculateCognateNormalizedHashes.php
@@ -0,0 +1,151 @@
+<?php
+
+namespace Cognate;
+
+use Database;
+use Maintenance;
+use MediaWiki\MediaWikiServices;
+use Wikimedia\Rdbms\ConnectionManager;
+
+// Pull in the MediaWiki maintenance framework. The MW_INSTALL_PATH environment
+// variable wins when it is set; otherwise the path is resolved relative to
+// this file.
+require_once ( getenv( 'MW_INSTALL_PATH' ) !== false
+       ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php'
+       : __DIR__ . '/../../../maintenance/Maintenance.php' );
+
+/**
+ * Maintenance script for recalculating the normalized Cognate hashes
+ *
+ * @license GPL-2.0+
+ * @author Addshore
+ */
+class RecalculateCognateNormalizedHashes extends Maintenance {
+
+       /**
+        * Read connection obtained from the CognateConnectionManager service.
+        *
+        * @var Database
+        */
+       private $dbr;
+
+       /**
+        * Write connection obtained from the CognateConnectionManager service.
+        *
+        * @var Database
+        */
+       private $dbw;
+
+       /**
+        * @var StringHasher
+        */
+       private $stringHasher;
+
+       /**
+        * @var StringNormalizer
+        */
+       private $stringNormalizer;
+
+       /**
+        * Registers the script description, the 'dry-run' option and a
+        * default batch size of 100.
+        */
+       public function __construct() {
+               parent::__construct();
+
+               $this->addDescription( 'Recalculate the normalized Cognate hashes' );
+               $this->addOption( 'dry-run', 'Perform a dry run' );
+               $this->setBatchSize( 100 );
+       }
+
+       /**
+        * Fetches the database connections from the service container and
+        * instantiates the hasher and normalizer used by this script.
+        */
+       private function setupServices() {
+               $services = MediaWikiServices::getInstance();
+               /** @var ConnectionManager $connectionManager */
+               $connectionManager = $services->getService( 'CognateConnectionManager' );
+               $this->dbr = $connectionManager->getReadConnection();
+               $this->dbw = $connectionManager->getWriteConnection();
+               $this->stringHasher = new StringHasher();
+               $this->stringNormalizer = new StringNormalizer();
+       }
+
+       /**
+        * Walks the titles table in batches ordered by cgti_raw_key, recomputes
+        * the normalized hash of every cgti_raw value and, unless 'dry-run' was
+        * given, writes back the rows whose stored hash differs.
+        *
+        * @return bool Always true
+        */
+       public function execute() {
+               $this->output( "Started processing...\n" );
+               $dryrun = $this->hasOption( 'dry-run' );
+               $this->setupServices();
+               $batchStart = $this->getLowestRawKey();
+
+               // Compare strictly: a cursor of 0 (minimum key of 1) is valid and
+               // must not be mistaken for "no rows". An empty table is expected to
+               // give false or null here.
+               if ( $batchStart === false || $batchStart === null ) {
+                       $this->output( "Nothing to do.\n" );
+                       return true;
+               }
+
+               $totalUpdates = 0;
+
+               while ( $batchStart !== false ) {
+                       $this->output( "Getting batch starting from $batchStart\n" );
+                       // NOTE(review): rows are read through the write connection, not
+                       // $this->dbr - presumably to avoid replication lag; confirm.
+                       // LIMIT and ORDER BY must be passed as keyed options; as plain
+                       // list values they are not applied to the query.
+                       $rows = $this->dbw->select(
+                               CognateStore::TITLES_TABLE_NAME,
+                               [ 'cgti_raw', 'cgti_raw_key', 'cgti_normalized_key' ],
+                               [ 'cgti_raw_key > ' . $batchStart ],
+                               __METHOD__,
+                               [
+                                       'LIMIT' => $this->mBatchSize,
+                                       'ORDER BY' => 'cgti_raw_key ASC',
+                               ]
+                       );
+
+                       $this->output( "Calculating new hashes..\n" );
+                       // Stays false when the batch is empty, which ends the loop.
+                       $batchStart = false;
+                       $rowsToUpdate = [];
+                       foreach ( $rows as $row ) {
+                               // The last key seen becomes the cursor for the next batch.
+                               $batchStart = $row->cgti_raw_key;
+
+                               $newNormalizedHash = $this->normalizeAndHash( $row->cgti_raw );
+                               // Loose comparison kept on purpose: presumably the stored key
+                               // is returned as a string while the hasher may return an
+                               // integer - TODO confirm before making this strict.
+                               if ( $newNormalizedHash != $row->cgti_normalized_key ) {
+                                       $newRow = (array)$row;
+                                       $newRow['cgti_normalized_key'] = $newNormalizedHash;
+                                       $rowsToUpdate[] = $newRow;
+                               }
+                       }
+
+                       $numberOfUpdates = count( $rowsToUpdate );
+                       $totalUpdates += $numberOfUpdates;
+
+                       if ( !$dryrun ) {
+                               $this->output( "Performing $numberOfUpdates updates\n" );
+                               // Skip the write entirely when nothing changed in this batch.
+                               if ( $numberOfUpdates > 0 ) {
+                                       $this->dbw->upsert(
+                                               CognateStore::TITLES_TABLE_NAME,
+                                               $rowsToUpdate,
+                                               [ 'cgti_raw_key' ],
+                                               [
+                                                       // VALUES() is MySQL/MariaDB syntax.
+                                                       'cgti_normalized_key=VALUES(cgti_normalized_key)',
+                                               ],
+                                               __METHOD__
+                                       );
+                               }
+                       }
+               }
+
+               $this->output( "$totalUpdates hashes recalculated\n" );
+               $this->output( "Done!\n" );
+
+               return true;
+       }
+
+       /**
+        * Select 1 less than the minimum so that > can be used in selects in this script.
+        *
+        * @return int|string|null|false The value of MIN(cgti_raw_key)-1 as returned
+        *  by the database layer (presumably a numeric string); false or null is
+        *  expected when the table has no rows - callers must compare strictly.
+        * @throws \DBUnexpectedError
+        */
+       private function getLowestRawKey() {
+               return $this->dbr->selectField(
+                       CognateStore::TITLES_TABLE_NAME,
+                       'MIN(cgti_raw_key)-1',
+                       false,
+                       __METHOD__
+               );
+       }
+
+       /**
+        * Normalizes the given string and returns the hash of the normalized form.
+        *
+        * @param string $string
+        *
+        * @return string
+        */
+       private function normalizeAndHash( $string ) {
+               return $this->stringHasher->hash(
+                       $this->stringNormalizer->normalize( $string )
+               );
+       }
+
+}
+
+// Name the class the maintenance runner should instantiate.
+$maintClass = RecalculateCognateNormalizedHashes::class;
+// RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably it comes from
+// the Maintenance.php required above and starts the script when this file is
+// the entry point - confirm against the MediaWiki maintenance framework.
+require_once RUN_MAINTENANCE_IF_MAIN;

-- 
To view, visit https://gerrit.wikimedia.org/r/321011
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I83181ffb3a78bd98fb34bfcbb8613a7618816f23
Gerrit-PatchSet: 9
Gerrit-Project: mediawiki/extensions/Cognate
Gerrit-Branch: master
Gerrit-Owner: Addshore <addshorew...@gmail.com>
Gerrit-Reviewer: Addshore <addshorew...@gmail.com>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: Tobias Gritschacher <tobias.gritschac...@wikimedia.de>
Gerrit-Reviewer: WMDE-Fisch <christoph.jau...@wikimedia.de>
Gerrit-Reviewer: WMDE-leszek <leszek.mani...@wikimedia.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to