MaxSem has uploaded a new change for review. https://gerrit.wikimedia.org/r/134851
Change subject: Kill Solr support ...................................................................... Kill Solr support Change-Id: I29fc406ba085795db68911487ee074b3eae6e8a2 --- M GeoData.body.php M GeoData.php M GeoDataHooks.php D api/ApiQueryGeoSearchSolr.php D solr/SolrGeoData.php D solr/SolrUpdateJob.php D solr/SolrUpdateWork.php D solr/schema.xml D solrupdate.php A sql/drop-updates-killlist.sql M sql/externally-backed.sql D sql/wmfFixTables.sql 12 files changed, 5 insertions(+), 854 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/GeoData refs/changes/51/134851/1 diff --git a/GeoData.body.php b/GeoData.body.php index 5c088c7..71f3856 100644 --- a/GeoData.body.php +++ b/GeoData.body.php @@ -180,15 +180,4 @@ public static function pickRandom( $weights ) { return ArrayUtils::pickRandom( $weights ); } - - /** - * Adds an update job if needed - */ - public static function maybeUpdate() { - global $wgGeoDataBackend, $wgGeoDataUpdatesViaJob; - - if ( $wgGeoDataBackend == 'solr' && $wgGeoDataUpdatesViaJob ) { - JobQueueGroup::singleton()->push( new SolrUpdateJob( null ) ); - } - } } diff --git a/GeoData.php b/GeoData.php index 919c11f..913f7b7 100644 --- a/GeoData.php +++ b/GeoData.php @@ -17,7 +17,6 @@ $wgAutoloadClasses['ApiQueryGeoSearch'] = "$dir/api/ApiQueryGeoSearch.php"; $wgAutoloadClasses['ApiQueryGeoSearchDb'] = "$dir/api/ApiQueryGeoSearchDb.php"; $wgAutoloadClasses['ApiQueryGeoSearchElastic'] = "$dir/api/ApiQueryGeoSearchElastic.php"; -$wgAutoloadClasses['ApiQueryGeoSearchSolr'] = "$dir/api/ApiQueryGeoSearchSolr.php"; $wgAutoloadClasses['ApiQueryAllPages_GeoData'] = "$dir/api/ApiQueryAllPages_GeoData.php"; $wgAutoloadClasses['ApiQueryCategoryMembers_GeoData'] = "$dir/api/ApiQueryCategoryMembers_GeoData.php"; $wgAutoloadClasses['GeoDataQueryExtender'] = "$dir/api/GeoDataQueryExtender.php"; @@ -28,11 +27,6 @@ $wgAutoloadClasses['GeoData'] = "$dir/GeoData.body.php"; $wgAutoloadClasses['GeoDataHooks'] = "$dir/GeoDataHooks.php"; 
$wgAutoloadClasses['GeoDataMath'] = "$dir/GeoDataMath.php"; -$wgAutoloadClasses['SolrUpdate'] = "$dir/solrupdate.php"; -$wgAutoloadClasses['SolrUpdateJob'] = "$dir/solr/SolrUpdateJob.php"; -$wgAutoloadClasses['SolrUpdateWork'] = "$dir/solr/SolrUpdateWork.php"; - -$wgAutoloadClasses['SolrGeoData'] = "$dir/solr/SolrGeoData.php"; $wgMessagesDirs['GeoData'] = __DIR__ . '/i18n'; $wgExtensionMessagesFiles['GeoData'] = "$dir/GeoData.i18n.php"; @@ -62,8 +56,6 @@ $wgAPIListModules['geosearch'] = 'ApiQueryGeoSearch' . ucfirst( $wgGeoDataBackend ); } } - -$wgJobClasses['solrUpdate'] = 'SolrUpdateJob'; // Tracking categories for Special:TrackingCategories $wgTrackingCategories[] = 'geodata-broken-tags-category'; @@ -190,48 +182,9 @@ $wgGeoDataIndexGranularity = 10; /** - * Which backend should be used by spatial searhces: 'db', 'solr' or 'elastic' + * Which backend should be used by spatial searches: 'db' or 'elastic' */ $wgGeoDataBackend = 'db'; - - -// Solr-specific settings - -/** - * Generic Solr connection options, see Solarium docs. - * Note: host must be set in $wgGeoDataSolrHosts for load-balancicng. - */ -$wgGeoDataSolrOptions = array( - 'adapteroptions' => array( - //'host' => '127.0.0.1', - 'port' => 8983, - 'path' => '/solr/', - ), -); - -/** - * @var string|array: Solr host, string "hostname" or array( 'host1' => weight1, 'host2' => weight2 ... ) - */ -$wgGeoDataSolrHosts = 'localhost'; - -/** - * @var string: Solr master used for updates - */ -$wgGeoDataSolrMaster = 'localhost'; - -/** - * @var int|string: Commit policy - * Possible values: - * - 'never': Never commit explicitly, let Solr decide on its own. - * - 'immediate': Commit after every change. - * - (some number): Commit within this number of milliseconds. - */ -$wgGeoDataSolrCommitPolicy = 'immediate'; - -/** - * Whether search index should be updated via jobs. Supported only for Solr. 
- */ -$wgGeoDataUpdatesViaJob = false; /** * Specifies which information about page's primary coordinate is added to global JS variable wgCoordinates. diff --git a/GeoDataHooks.php b/GeoDataHooks.php index 3f3181f..a98902e 100644 --- a/GeoDataHooks.php +++ b/GeoDataHooks.php @@ -20,6 +20,7 @@ case 'mysql': if ( $wgGeoDataBackend != 'db' ) { $updater->addExtensionTable( 'geo_tags', dirname( __FILE__ ) . '/sql/externally-backed.sql' ); + $updater->dropExtensionTable( 'geo_killlist', dirname( __FILE__ ) . '/sql/drop-updates-killlist.sql' ); } else { $updater->addExtensionTable( 'geo_tags', dirname( __FILE__ ) . '/sql/db-backed.sql' ); } @@ -77,22 +78,10 @@ * @return bool */ public static function onArticleDeleteComplete( &$article, User &$user, $reason, $id ) { - global $wgGeoDataBackend; wfProfileIn( __METHOD__ ); $dbw = wfGetDB( DB_MASTER ); - if ( $wgGeoDataBackend == 'solr' ) { - $res = $dbw->select( 'geo_tags', 'gt_id', array( 'gt_page_id' => $id ), __METHOD__ ); - $killlist = array(); - foreach ( $res as $row ) { - $killlist[] = array( 'gk_killed_id' => $row->gt_id ); - } - if ( $killlist ) { - $dbw->insert( 'geo_killlist', $killlist, __METHOD__ ); - } - } $dbw->delete( 'geo_tags', array( 'gt_page_id' => $id ), __METHOD__ ); - GeoData::maybeUpdate(); wfProfileOut( __METHOD__ ); return true; @@ -128,7 +117,6 @@ } else { self::doSmartUpdate( $data, $linksUpdate->mId ); } - GeoData::maybeUpdate(); wfProfileOut( __METHOD__ ); return true; @@ -218,12 +206,6 @@ if ( count( $delete ) ) { $deleteIds = array_keys( $delete ); $dbw->delete( 'geo_tags', array( 'gt_id' => $deleteIds ), __METHOD__ ); - if ( $wgGeoDataBackend != 'db' ) { - $rows = array_map( function( $id ) { - return array( 'gk_killed_id' => $id ); - }, $deleteIds ); - $dbw->insert( 'geo_killlist', $rows, __METHOD__ ); - } } if ( count( $add ) ) { $dbw->insert( 'geo_tags', $add, __METHOD__ ); diff --git a/api/ApiQueryGeoSearchSolr.php b/api/ApiQueryGeoSearchSolr.php deleted file mode 100644 index 
8fc1820..0000000 --- a/api/ApiQueryGeoSearchSolr.php +++ /dev/null @@ -1,114 +0,0 @@ -<?php - -class ApiQueryGeoSearchSolr extends ApiQueryGeoSearch { - public function __construct( $query, $moduleName ) { - parent::__construct( $query, $moduleName ); - } - - /** - * @param ApiPageSet $resultPageSet - */ - protected function run( $resultPageSet = null ) { - global $wgDefaultGlobe; - - wfProfileIn( __METHOD__ ); - parent::run( $resultPageSet ); - - try { - $params = $this->extractRequestParams(); - - $solr = SolrGeoData::newClient(); - $query = $solr->createSelect(); - $helper = $query->getHelper(); - - // @todo: props - $query->setQueryDefaultOperator( 'AND' ); - $query->createFilterQuery( 'wiki' )->setQuery( 'wiki:' . wfWikiID() ); // Only Earth is supported - $query->createFilterQuery( 'globe' )->setQuery( 'globe:earth' ); // Only Earth is supported - if ( isset( $params['maxdim'] ) ) { - $query->addFilterQuery( "dim:[* TO {$params['maxdim']}]" ); - } - $primary = $params['primary']; - if ( $primary !== 'all' ) { - $query->createFilterQuery( 'primary' )->setQuery( 'primary:' . intval( $primary === 'primary' ) ); - } - $query->createFilterQuery( 'coord' )->setQuery( $helper->geofilt( $this->lat, $this->lon, 'coord', $this->radius / 1000 ) ); - $query->addSort( $helper->geodist( $this->lat, $this->lon, 'coord' ), Solarium_Query_Select::SORT_ASC ); - - $limit = $params['limit']; - $query->setRows( $limit + ( $this->idToExclude ? 1 : 0 ) ); // +1 in case we need to exclude a page - - wfProfileIn( __METHOD__ . '-solr' ); - $docs = $solr->select( $query ); - wfProfileOut( __METHOD__ . 
'-solr' ); - $mapping = array(); - foreach ( $docs as $doc ) { - $id = $doc->page_id; - if ( !isset( $mapping[$id] ) && $id != $this->idToExclude ) { - $mapping[$id] = $doc; - } - } - - if ( !count( $mapping ) ) { - wfProfileOut( __METHOD__ ); - return; // No results, no point in doing anything else - } - $this->addWhere( array( 'page_id' => array_keys( $mapping ) ) ); - - wfProfileIn( __METHOD__ . '-sql' ); - $res = $this->select( __METHOD__ ); - wfProfileOut( __METHOD__ . '-sql' ); - - $result = $this->getResult(); - $rows = array(); - foreach ( $res as $row ) { - $rows[$row->page_id] = $row; - } - - foreach ( $mapping as $id => $doc ) { - if ( !$limit-- ) { - break; - } - if ( !isset( $rows[$id] ) ) { - continue; - } - $row = $rows[$id]; - if ( is_null( $resultPageSet ) ) { - $title = Title::newFromRow( $row ); - list( $lat, $lon ) = explode( ',', $doc->coord ); - $vals = array( - 'pageid' => intval( $row->page_id ), - 'ns' => intval( $title->getNamespace() ), - 'title' => $title->getPrefixedText(), - 'lat' => floatval( $lat ), - 'lon' => floatval( $lon ), - 'dist' => round( GeoDataMath::distance( $lat, $lon, $this->lat, $this->lon ), 1 ), - ); - - if ( $doc->primary ) { - $vals['primary'] = ''; - } - foreach( $params['prop'] as $prop ) { - // Don't output default globe - if ( !( $prop === 'globe' && $doc->$prop === $wgDefaultGlobe ) ) { - $vals[$prop] = $doc->$prop; - } - } - $fit = $result->addValue( array( 'query', $this->getModuleName() ), null, $vals ); - if ( !$fit ) { - break; - } - } else { - $resultPageSet->processDbRow( $row ); - } - } - } catch ( Solarium_Exception $e ) { - throw new MWException( get_class( $e ) . " at {$e->getFile()}, line {$e->getLine()}: {$e->getMessage()}", 0, $e ); - } - wfProfileOut( __METHOD__ ); - } - - public function getVersion() { - return __CLASS__ . 
': $Id$'; - } -} diff --git a/solr/SolrGeoData.php b/solr/SolrGeoData.php deleted file mode 100644 index 3e71975..0000000 --- a/solr/SolrGeoData.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php - -class SolrGeoData { - /** - * @param bool $master - * - * @return Solarium_Client - */ - public static function newClient( $master = false ) { - global $wgGeoDataSolrOptions, $wgGeoDataSolrHosts, $wgGeoDataSolrMaster; - - $options = $wgGeoDataSolrOptions; - if ( $master ) { - $options['adapteroptions']['host'] = $wgGeoDataSolrMaster; - } else { - $options['adapteroptions']['host'] = GeoData::pickRandom( $wgGeoDataSolrHosts ); - } - - return new Solarium_Client( $options ); - } -} diff --git a/solr/SolrUpdateJob.php b/solr/SolrUpdateJob.php deleted file mode 100644 index 8a857a2..0000000 --- a/solr/SolrUpdateJob.php +++ /dev/null @@ -1,25 +0,0 @@ -<?php - -class SolrUpdateJob extends Job { - - public function __construct( $title, $params = array(), $id = 0 ) { - parent::__construct( 'solrUpdate', Title::newMainPage(), $params, $id ); - $this->removeDuplicates = true; - } - - /** - * Run the job - * @return boolean success - */ - public function run() { - global $wgGeoDataUpdatesViaJob; - - // Allow disabling jobs on the fly - if ( $wgGeoDataUpdatesViaJob ) { - $maint = new SolrUpdate(); - $maint->enableJobMode(); - $maint->execute(); - } - return true; - } -} diff --git a/solr/SolrUpdateWork.php b/solr/SolrUpdateWork.php deleted file mode 100644 index 83de0ba..0000000 --- a/solr/SolrUpdateWork.php +++ /dev/null @@ -1,15 +0,0 @@ -<?php - -class SolrUpdateWork extends PoolCounterWork { - private $maint; - - public function __construct( SolrUpdate $maint ) { - parent::__construct( 'solrUpdate', '*' ); - $this->maint = $maint; - } - - function doWork() { - $this->maint->safeExecute(); - return true; - } -} diff --git a/solr/schema.xml b/solr/schema.xml deleted file mode 100644 index 3d6e56d..0000000 --- a/solr/schema.xml +++ /dev/null @@ -1,330 +0,0 @@ -<?xml version="1.0" 
encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<!-- - This is the Solr schema file. This file should be named "schema.xml" and - should be in the conf directory under the solr home - (i.e. ./solr/conf/schema.xml by default) - or located where the classloader for the Solr webapp can find it. - - This example schema is the recommended starting point for users. - It should be kept correct and concise, usable out-of-the-box. - - For more information, on how to customize this file, please see - http://wiki.apache.org/solr/SchemaXml - - PERFORMANCE NOTE: this schema includes many optional features and should not - be used for benchmarking. To improve performance one could - - set stored="false" for all fields possible (esp large fields) when you - only need to search on the field but don't need to return the original - value. - - set indexed="false" if you don't need to search on the field, but only - return the field as a result of searching on other indexed fields. - - remove all unneeded copyField statements - - for best index size and searching performance, set "index" to false - for all general text fields, use copyField to copy them to the - catchall "text" field, and use that for searching. 
- - For maximum indexing performance, use the StreamingUpdateSolrServer - java client. - - Remember to run the JVM in server mode, and use a higher logging level - that avoids logging every request ---> - -<schema name="geodata" version="1.5"> - - <fields> - <!-- Valid attributes for fields: - name: mandatory - the name for the field - type: mandatory - the name of a field type from the - <types> fieldType section - indexed: true if this field should be indexed (searchable or sortable) - stored: true if this field should be retrievable - multiValued: true if this field may contain multiple values per document - omitNorms: (expert) set to true to omit the norms associated with - this field (this disables length normalization and index-time - boosting for the field, and saves some memory). Only full-text - fields or fields that need an index-time boost need norms. - Norms are omitted for primitive (non-analyzed) types by default. - termVectors: [false] set to true to store the term vector for a - given field. - When using MoreLikeThis, fields used for similarity should be - stored for best performance. - termPositions: Store position information with the term vector. - This will increase storage costs. - termOffsets: Store offset information with the term vector. This - will increase storage costs. - required: The field is required. It will throw an error if the - value does not exist - default: a value that should be used if no value is specified - when adding a document. - --> - - <!-- field names should consist of alphanumeric or underscore characters only and - not start with a digit. This is not currently strictly enforced, - but other field names will not have first class support from all components - and back compatibility is not guaranteed. Names with both leading and - trailing underscores (e.g. _version_) are reserved. 
- --> - - <field name="id" type="string" indexed="true" stored="true" required="true" /> - <field name="wiki" type="string" indexed="true" stored="false" required="true" omitNorms="true"/> - <field name="coord" type="location" indexed="true" stored="true" required="true"/> - <field name="page_id" type="tlong" indexed="true" stored="true" required="true"/> - <field name="globe" type="string" indexed="true" stored="true" omitNorms="true" required="true"/> - <field name="primary" type="boolean" indexed="true" stored="true" required="true"/> - <field name="dim" type="float" indexed="true" stored="true" required="false"/> - <field name="type" type="string" indexed="false" stored="true" omitNorms="true"/> - <field name="name" type="string" indexed="false" stored="true" omitNorms="true"/> - <field name="country" type="string" indexed="true" stored="true" omitNorms="true"/> - <field name="region" type="string" indexed="true" stored="true" omitNorms="true"/> - - <!-- Dynamic field definitions allow using convention over configuration - for fields via the specification of patterns to match field names. - EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i) - RESTRICTION: the glob-like pattern in the name attribute must have - a "*" only at the start or the end. --> - - <!-- Type used to index the lat and lon components for the "location" FieldType --> - <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" /> - - <!-- Uncommenting the following will create a "timestamp" field using - a default value of "NOW" to indicate when each document was indexed. - --> - <!-- - <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> - --> - - <!-- uncomment the following to ignore any fields that don't already match an existing - field name or dynamic field, rather than reporting them as an error. - alternately, change the type="ignored" to some other type e.g. 
"text" if you want - unknown fields indexed and/or stored by default --> - <!--dynamicField name="*" type="ignored" multiValued="true" /--> - - </fields> - - <!-- Field to use to determine and enforce document uniqueness. - Unless this field is marked with required="false", it will be a required field - --> - <uniqueKey>id</uniqueKey> - - <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers - when parsing a query string to determine if a clause of the query should be marked as - required or optional, assuming the clause isn't already marked by some operator. - The default is OR, which is generally assumed so it is not a good idea to change it - globally here. The "q.op" request parameter takes precedence over this. - <solrQueryParser defaultOperator="OR"/> --> - - - <types> - <!-- field type definitions. The "name" attribute is - just a label to be used by field definitions. The "class" - attribute and any other attributes determine the real - behavior of the fieldType. - Class names starting with "solr" refer to java classes in a - standard package such as org.apache.solr.analysis - --> - - <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> - <fieldType name="string" class="solr.StrField" sortMissingLast="true" /> - - <!-- boolean type: "true" or "false" --> - <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> - - <!-- sortMissingLast and sortMissingFirst attributes are optional attributes are - currently supported on types that are sorted internally as strings - and on numeric types. - This includes "string","boolean", and, as of 3.5 (and 4.x), - int, float, long, date, double, including the "Trie" variants. - - If sortMissingLast="true", then a sort on this field will cause documents - without the field to come after documents with the field, - regardless of the requested sort order (asc or desc). 
- - If sortMissingFirst="true", then a sort on this field will cause documents - without the field to come before documents with the field, - regardless of the requested sort order. - - If sortMissingLast="false" and sortMissingFirst="false" (the default), - then default lucene sorting will be used which places docs without the - field first in an ascending sort and last in a descending sort. - --> - - <!-- - Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. - --> - <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/> - <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/> - <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> - <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/> - - <!-- - Numeric field types that index each value at various levels of precision - to accelerate range queries when the number of values between the range - endpoints is large. See the javadoc for NumericRangeQuery for internal - implementation details. - - Smaller precisionStep values (specified in bits) will lead to more tokens - indexed per value, slightly larger index size, and faster range queries. - A precisionStep of 0 disables indexing at different precision levels. 
- --> - <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/> - <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/> - <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/> - <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/> - - <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and - is a more restricted form of the canonical representation of dateTime - http://www.w3.org/TR/xmlschema-2/#dateTime - The trailing "Z" designates UTC time and is mandatory. - Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z - All other components are mandatory. - - Expressions can also be used to denote calculations that should be - performed relative to "NOW" to determine the value, ie... - - NOW/HOUR - ... Round to the start of the current hour - NOW-1DAY - ... Exactly 1 day prior to now - NOW/DAY+6MONTHS+3DAYS - ... 6 months and 3 days in the future from the start of - the current day - - Consult the DateField javadocs for more information. - - Note: For faster range queries, consider the tdate type - --> - <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/> - - <!-- A Trie based date field for faster date range queries and date faceting. --> - <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/> - - - <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings --> - <fieldtype name="binary" class="solr.BinaryField"/> - - <!-- - Note: - These should only be used for compatibility with existing indexes (created with lucene or older Solr versions). - Use Trie based fields instead. 
As of Solr 3.5 and 4.x, Trie based fields support sortMissingFirst/Last - - Plain numeric field types that store and index the text - value verbatim (and hence don't correctly support range queries, since the - lexicographic ordering isn't equal to the numeric ordering) - --> - <fieldType name="pint" class="solr.IntField"/> - <fieldType name="plong" class="solr.LongField"/> - <fieldType name="pfloat" class="solr.FloatField"/> - <fieldType name="pdouble" class="solr.DoubleField"/> - <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/> - - <!-- The "RandomSortField" is not used to store or search any - data. You can declare fields of this type it in your schema - to generate pseudo-random orderings of your docs for sorting - or function purposes. The ordering is generated based on the field - name and the version of the index. As long as the index version - remains unchanged, and the same field name is reused, - the ordering of the docs will be consistent. - If you want different psuedo-random orderings of documents, - for the same version of the index, use a dynamicField and - change the field name in the request. - --> - <fieldType name="random" class="solr.RandomSortField" indexed="true" /> - - <!-- solr.TextField allows the specification of custom text analyzers - specified as a tokenizer and a list of token filters. Different - analyzers may be specified for indexing and querying. - - The optional positionIncrementGap puts space between multiple fields of - this type on the same document, with the purpose of preventing false phrase - matching across fields. - - For more info on customizing your analyzer chain, please see - http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters - --> - - <!-- One can also specify an existing Analyzer class that has a - default constructor via the class attribute on the analyzer element. 
- Example: - <fieldType name="text_greek" class="solr.TextField"> - <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> - </fieldType> - --> - - <!-- A text field that only splits on whitespace for exact matching of words --> - <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - </analyzer> - </fieldType> - - - - <!-- charFilter + WhitespaceTokenizer --> - <!-- - <fieldType name="text_char_norm" class="solr.TextField" positionIncrementGap="100" > - <analyzer> - <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - </analyzer> - </fieldType> - --> - - <!-- lowercases the entire field value, keeping it as a single token. --> - <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <filter class="solr.LowerCaseFilterFactory" /> - </analyzer> - </fieldType> - - <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.PathHierarchyTokenizerFactory"/> - </analyzer> - </fieldType> - - - <!-- since fields of this type are by default not stored or indexed, - any data added to them will be ignored outright. --> - <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> - - <!-- This point type indexes the coordinates as separate fields (subFields) - If subFieldType is defined, it references a type, and a dynamic field - definition is created matching *___<typename>. Alternately, if - subFieldSuffix is defined, that is used to create the subFields. - Example: if subFieldType="double", then the coordinates would be - indexed in fields myloc_0___double,myloc_1___double. 
- Example: if subFieldSuffix="_d" then the coordinates would be indexed - in fields myloc_0_d,myloc_1_d - The subFields are an implementation detail of the fieldType, and end - users normally should not need to know about them. - --> - <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> - - <!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. --> - <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> - - <!-- - A Geohash is a compact representation of a latitude longitude pair in a single field. - See http://wiki.apache.org/solr/SpatialSearch - --> - <fieldtype name="geohash" class="solr.GeoHashField"/> - - </types> - - - -</schema> diff --git a/solrupdate.php b/solrupdate.php deleted file mode 100644 index 8bf4716..0000000 --- a/solrupdate.php +++ /dev/null @@ -1,243 +0,0 @@ -<?php - -$IP = getenv( 'MW_INSTALL_PATH' ); -if ( $IP === false ) { - $IP = dirname( __FILE__ ) . '/../..'; -} -require_once( "$IP/maintenance/Maintenance.php" ); - -class SolrUpdate extends Maintenance { - const WRITE_BATCH_SIZE = 500; - const READ_BATCH_SIZE = 1000; - const READ_DELAY = 0; // In microseconds - - private $jobMode = false; - - public function __construct() { - $this->mDescription = 'Performs updates and other operations with Solr index'; - $this->addOption( 'reset', 'Reset last update timestamp (next feed will return whole database)' ); - $this->addOption( 'clear-killlist', 'Purge killlist entries older than this value (in days)', false, true ); - $this->addOption( 'noindex', 'Don\'t update index' ); - } - - public function enableJobMode() { - $this->mQuiet = true; - $this->jobMode = true; - } - - public function execute() { - // Make sure that the index is being updated only once - $work = new SolrUpdateWork( $this ); - if ( !$work->execute() ) { - $this->error( __METHOD__ . 
'(): PoolCounter error!', true ); - } - } - - /** - * Called internally - */ - public function safeExecute() { - global $wgGeoDataBackend, $wgGeoDataSolrCommitPolicy; - if ( $wgGeoDataBackend != 'solr' ) { - $this->error( "This script is only for wikis with Solr GeoData backend", true ); - } - - $dbr = $this->getDB( DB_SLAVE ); - $dbw = $this->getDB( DB_MASTER ); - - $wikiId = wfWikiID(); - - if ( $this->hasOption( 'reset' ) ) { - $this->output( "Resetting update tracking...\n" ); - $dbw->delete( 'geo_updates', array( 'gu_wiki' => $wikiId ), __METHOD__ ); - $this->output( "Truncating killlist...\n" ); - $table = $dbw->tableName( 'geo_killlist' ); - $dbw->query( "TRUNCATE TABLE $table", __METHOD__ ); - $cutoffKilllist = false; - } else { - $cutoffKilllist = $dbr->selectField( 'geo_killlist', 'MAX( gk_killed_id )', '', __METHOD__ ); - } - $cutoffTags = $dbr->selectField( 'geo_tags', 'MAX( gt_id )', '', __METHOD__ ); - - if ( $this->hasOption( 'clear-killlist' ) ) { - $days = intval( $this->getOption( 'clear-killlist' ) ); - if ( $days <= 0 ) { - $this->error( '--clear-killlist: please specify a positive integer number of days', true ); - } - $this->output( "Deleting killlist entries older than $days days...\n" ); - $timestamp = $dbw->addQuotes( wfTimestamp( TS_DB, strtotime( "$days days ago" ) ) ); - $table = $dbr->tableName( 'geo_killlist' ); - $count = 0; - do { - $sql = "DELETE FROM $table WHERE gk_touched < $timestamp LIMIT " - . 
self::WRITE_BATCH_SIZE; - $dbw->query( $sql, __METHOD__ ); - $deleted = $dbw->affectedRows(); - $count += $deleted; - if ( $deleted ) { - wfWaitForSlaves(); - $this->output( " $count\n" ); - } - } while ( $deleted > 0 ); - } - - if ( $this->hasOption( 'noindex' ) ) { - return; - } - $res = $dbr->select( 'geo_updates', - array( 'gu_last_tag', 'gu_last_kill' ), - array( 'gu_wiki' => $wikiId ), - __METHOD__ - ); - if ( !$res || !( $row = $res->fetchObject() ) ) { - $lastTag = $lastKill = 0; - } else { - $lastTag = $row->gu_last_tag; - $lastKill = $row->gu_last_kill; - } - - $solr = SolrGeoData::newClient( 'master' ); - - $fields = Coord::getFieldMapping(); - $fields['page_id'] = 'gt_page_id'; - - if ( $cutoffTags ) { - $this->output( "Indexing new documents...\n" ); - $count = 0; - do { - $conds = array( - "gt_id <= $cutoffTags", - 'gt_globe' => 'earth', - ); - if ( $lastTag ) { - $conds[] = "gt_id > $lastTag"; - } - $res = $dbr->select( 'geo_tags', - array_values( $fields ), - $conds, - __METHOD__, - array( 'LIMIT' => self::READ_BATCH_SIZE, 'ORDER BY' => 'gt_id' ) - ); - $docs = array(); - $update = $solr->createUpdate(); - foreach ( $res as $row ) { - $lastTag = $row->gt_id; - $doc = $update->createDocument(); - $row->gt_id = $wikiId . '-' . 
$row->gt_id; - foreach( $fields as $solrField => $dbField ) { - if ( $solrField != 'lat' && $solrField != 'lon' ) { - $doc->addField( $solrField, $row->$dbField ); - } - } - $doc->addField( 'wiki', $wikiId ); - $doc->addField( 'coord', "{$row->gt_lat},{$row->gt_lon}" ); - $docs[] = $doc; - } - if ( $docs ) { - $update->addDocuments( $docs, null, $this->commitWithin() ); - $this->addCommit( $update ); - $solr->update( $update ); - - $count += count( $docs ); - $this->output( " $count\n" ); - usleep( self::READ_DELAY ); - } - } while ( $res->numRows() > 0 ); - } - - if ( $cutoffKilllist ) { - $this->output( "Deleting old documents...\n" ); - $count = 0; - do { - $conds = array( - "gk_killed_id <= $cutoffKilllist", - ); - if ( $lastKill ) { - $conds[] = "gk_killed_id > $lastKill"; - } - $res = $dbr->select( 'geo_killlist', - array( 'gk_killed_id' ), - $conds, - __METHOD__, - array( 'LIMIT' => self::READ_BATCH_SIZE, 'ORDER BY' => 'gk_killed_id' ) - ); - $killedIds = array(); - $update = $solr->createUpdate(); - foreach ( $res as $row ) { - $lastKill = $row->gk_killed_id; - $killedIds[] = $wikiId . '-' . $row->gk_killed_id; - } - if ( $killedIds ) { - $update->addDeleteByIds( $killedIds ); - if ( $wgGeoDataSolrCommitPolicy === 'immediate' ) { - $update->addCommit(); - } - $solrResult = $solr->update( $update ); - wfDebugLog( 'geodata', "Deleting " . count( $killedIds ) . 
" docs, response: {$solrResult->getResponse()->getBody()}" ); - - $count += count( $killedIds ); - $this->output( " $count\n" ); - usleep( self::READ_DELAY ); - } - } while ( $res->numRows() > 0 ); - // delete queries don't support commitWithin, so if we're in commitWithin mode, - // just commit after we're done deleting - if ( $count && is_int( $wgGeoDataSolrCommitPolicy ) ) { - $update = $solr->createUpdate(); - $update->addCommit(); - $solr->update( $update ); - } - } - - $dbw->replace( 'geo_updates', - array( 'gu_wiki' ), - array( 'gu_wiki' => $wikiId, 'gu_last_tag' => $lastTag, 'gu_last_kill' => $lastKill ), - __METHOD__ - ); - } - - /** - * @param Solarium_Query_Update $update - */ - private function addCommit( $update ) { - global $wgGeoDataSolrCommitPolicy; - - if ( $wgGeoDataSolrCommitPolicy === 'immediate' ) { - $update->addCommit(); - } elseif ( !( is_int( $wgGeoDataSolrCommitPolicy ) && $wgGeoDataSolrCommitPolicy > 0 ) - && $wgGeoDataSolrCommitPolicy !== 'never' ) { - throw new MWException( "'$wgGeoDataSolrCommitPolicy' is not a valid \$wgGeoDataSolrCommitPolicy value" ); - } - } - - /** - * @return int|null: Number of milliseconds to commit within or null if not applicable - */ - private function commitWithin() { - global $wgGeoDataSolrCommitPolicy; - - if ( is_int( $wgGeoDataSolrCommitPolicy ) ) { - return $wgGeoDataSolrCommitPolicy; - } - return null; - } - - /** - * Overrides Maintenace::error() to throw exceptions instead of writing to stderr when called from a job - * @param String $err - * @param int $die - */ - protected function error( $err, $die = 0 ) { - if ( $this->jobMode ) { - if ( $die ) { - throw new MWException( $err ); - } else { - wfDebug( "$err\n" ); - } - } - parent::error( $err, $die ); - } -} - -$maintClass = 'SolrUpdate'; -require_once( DO_MAINTENANCE ); diff --git a/sql/drop-updates-killlist.sql b/sql/drop-updates-killlist.sql new file mode 100644 index 0000000..b2ac148 --- /dev/null +++ b/sql/drop-updates-killlist.sql @@ -0,0 
+1,2 @@ +DROP TABLE /*_*/geo_killlist; +DROP TABLE /*_*/geo_updates; \ No newline at end of file diff --git a/sql/externally-backed.sql b/sql/externally-backed.sql index a279930..2993a65 100644 --- a/sql/externally-backed.sql +++ b/sql/externally-backed.sql @@ -1,4 +1,4 @@ --- SQL schema for GeoData extension, Solr-aware +-- SQL schema for GeoData extension, Elasticsearch backend -- Stores information about geographical coordinates in articles CREATE TABLE /*_*/geo_tags ( @@ -29,22 +29,3 @@ CREATE INDEX /*i*/gt_page_primary ON /*_*/geo_tags ( gt_page_id, gt_primary ); CREATE INDEX /*i*/gt_page_id_id ON /*_*/geo_tags ( gt_page_id, gt_id ); - --- Stores kill-list (ids of records deleted from geo_tags that need to be deleted from the Solr index) -CREATE TABLE /*_*/geo_killlist ( - -- Row ID - gk_id int unsigned NOT NULL PRIMARY KEY AUTO_INCREMENT, - -- gt_id of a row deleted from geo_tags - gk_killed_id int unsigned NOT NULL, - -- Last change timestamp - gk_touched timestamp NOT NULL default CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -)/*$wgDBTableOptions*/; - -CREATE INDEX /*i*/gk_touched ON /*_*/geo_killlist ( gk_touched ); - --- Stores information about the last index update time -CREATE TABLE /*_*/geo_updates ( - gu_wiki varchar(64) NOT NULL PRIMARY KEY, - gu_last_tag int NOT NULL, - gu_last_kill int NOT NULL -)/*$wgDBTableOptions*/; diff --git a/sql/wmfFixTables.sql b/sql/wmfFixTables.sql deleted file mode 100644 index 72f9177..0000000 --- a/sql/wmfFixTables.sql +++ /dev/null @@ -1,8 +0,0 @@ --- Some tables were created on WMF using MyISAM --- This script fixes them - -TRUNCATE TABLE /*_*/geo_killlist; - -ALTER TABLE /*_*/geo_killlist ENGINE=InnoDB; - -ALTER TABLE /*_*/geo_updates ENGINE=InnoDB; -- To view, visit https://gerrit.wikimedia.org/r/134851 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I29fc406ba085795db68911487ee074b3eae6e8a2 Gerrit-PatchSet: 1 Gerrit-Project: 
mediawiki/extensions/GeoData Gerrit-Branch: master Gerrit-Owner: MaxSem <maxsem.w...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits