jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399537 )
Change subject: Add collation for Abkhaz (ab) ...................................................................... Add collation for Abkhaz (ab) * Adding new class AbkhazUppercaseCollation, mapped to 'uppercase-ab'. * Extended CustomUppercaseCollation with support for sorting digraphs and for alphabets larger than 64 letters (up to 4096). Bug: T183430 Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4 --- M autoload.php A includes/collation/AbkhazUppercaseCollation.php M includes/collation/Collation.php M includes/collation/CustomUppercaseCollation.php M tests/phpunit/includes/collation/CustomUppercaseCollationTest.php 5 files changed, 132 insertions(+), 17 deletions(-) Approvals: Brian Wolff: Looks good to me, approved jenkins-bot: Verified diff --git a/autoload.php b/autoload.php index 6b8387b..47c04b9 100644 --- a/autoload.php +++ b/autoload.php @@ -6,6 +6,7 @@ $wgAutoloadLocalClasses = [ 'APCBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCBagOStuff.php', 'APCUBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCUBagOStuff.php', + 'AbkhazUppercaseCollation' => __DIR__ . '/includes/collation/AbkhazUppercaseCollation.php', 'AbstractContent' => __DIR__ . '/includes/content/AbstractContent.php', 'Action' => __DIR__ . '/includes/actions/Action.php', 'ActiveUsersPager' => __DIR__ . '/includes/specials/pagers/ActiveUsersPager.php', diff --git a/includes/collation/AbkhazUppercaseCollation.php b/includes/collation/AbkhazUppercaseCollation.php new file mode 100644 index 0000000..e0ea237 --- /dev/null +++ b/includes/collation/AbkhazUppercaseCollation.php @@ -0,0 +1,93 @@ +<?php +/** + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @since 1.31 + * + * @file + */ + +class AbkhazUppercaseCollation extends CustomUppercaseCollation { + + public function __construct() { + parent::__construct( [ + 'А', + 'Б', + 'В', + 'Г', + 'Гь', + 'Гә', + 'Ҕ', + 'Ҕь', + 'Ҕә', + 'Д', + 'Дә', + 'Е', + 'Ж', + 'Жь', + 'Жә', + 'З', + 'Ӡ', + 'Ӡә', + 'И', + 'К', + 'Кь', + 'Кә', + 'Қ', + 'Қь', + 'Қә', + 'Ҟ', + 'Ҟь', + 'Ҟә', + 'Л', + 'М', + 'Н', + 'О', + 'П', + 'Ҧ', + 'Р', + 'С', + 'Т', + 'Тә', + 'Ҭ', + 'Ҭә', + 'У', + 'Ф', + 'Х', + 'Хь', + 'Хә', + 'Ҳ', + 'Ҳә', + 'Ц', + 'Цә', + 'Ҵ', + 'Ҵә', + 'Ч', + 'Ҷ', + 'Ҽ', + 'Ҿ', + 'Ш', + 'Шь', + 'Шә', + 'Ы', + 'Ҩ', + 'Џ', + 'Џь', + 'ь', + 'ә', + ], Language::factory( 'ab' ) ); + } +} diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php index 7171a21..30cae5a 100644 --- a/includes/collation/Collation.php +++ b/includes/collation/Collation.php @@ -65,6 +65,8 @@ return new CollationEt; case 'xx-uca-fa': return new CollationFa; + case 'uppercase-ab': + return new AbkhazUppercaseCollation; case 'uppercase-ba': return new BashkirUppercaseCollation; case 'uppercase-se': diff --git a/includes/collation/CustomUppercaseCollation.php b/includes/collation/CustomUppercaseCollation.php index 301972d..170d5c2 100644 --- a/includes/collation/CustomUppercaseCollation.php +++ b/includes/collation/CustomUppercaseCollation.php @@ -32,6 +32,7 @@ * conflicts with other people using private use area) * * This does not support fancy things like secondary differences, etc. + * (It supports digraphs, trigraphs etc. though.) * * It is expected most people will subclass this and just override the * constructor to hard-code an alphabet. @@ -45,25 +46,30 @@ private $puaSubset; /** - * @note This assumes $alphabet does not contain U+F3000-U+F303F + * @note This assumes $alphabet does not contain U+F3000-U+F3FFF * * @param array $alphabet Sorted array of uppercase characters. * @param Language $lang What language for number sorting. */ public function __construct( array $alphabet, Language $lang ) { - // It'd be trivial to extend this past 64, you'd just - // need a bit of bit-fiddling. Doesn't seem necessary right - // now. - if ( count( $alphabet ) < 1 || count( $alphabet ) >= 64 ) { - throw new UnexpectedValueException( "Alphabet must be < 64 items" ); + if ( count( $alphabet ) < 1 || count( $alphabet ) >= 4096 ) { + throw new UnexpectedValueException( "Alphabet must be < 4096 items" ); } - $this->alphabet = $alphabet; + $this->firstLetters = $alphabet; + // For digraphs, only the first letter is capitalized in input + $this->alphabet = array_map( [ $lang, 'uc' ], $alphabet ); $this->puaSubset = []; $len = count( $alphabet ); for ( $i = 0; $i < $len; $i++ ) { - $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 ); + $this->puaSubset[] = "\xF3\xB3" . chr( floor( $i / 64 ) + 128 ) . chr( ( $i % 64 ) + 128 ); } + + // Sort these arrays so that any trigraphs, digraphs etc. are first + // (and they get replaced first in convertToPua()). + $lengths = array_map( 'mb_strlen', $this->alphabet ); + array_multisort( $lengths, SORT_DESC, $this->firstLetters, $this->alphabet, $this->puaSubset ); + parent::__construct( $lang ); } @@ -76,12 +82,17 @@ } public function getFirstLetter( $string ) { - // In case a title has a PUA code in it, make it sort - // under the header for the character it would replace - // to avoid inconsistent behaviour. This class mostly - // assumes that people will not use PUA codes. - return parent::getFirstLetter( - str_replace( $this->puaSubset, $this->alphabet, $string ) - ); + $sortkey = $this->getSortKey( $string ); + + // In case a title begins with a character from our alphabet, return the corresponding + // first-letter. (This also happens if the title has a corresponding PUA code in it, to avoid + // inconsistent behaviour. This class mostly assumes that people will not use PUA codes.) + $index = array_search( substr( $sortkey, 0, 4 ), $this->puaSubset ); + if ( $index !== false ) { + return $this->firstLetters[ $index ]; + } + + // String begins with a character outside of our alphabet, fall back + return parent::getFirstLetter( $string ); } } diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php index 5d5317b..90c097d 100644 --- a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php +++ b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php @@ -6,6 +6,7 @@ $this->collation = new CustomUppercaseCollation( [ 'D', 'C', + 'Cs', 'B' ], Language::factory( 'en' ) ); @@ -31,6 +32,7 @@ [ '💩 ', 'C', 'Test relocated to end' ], [ 'c', 'b', 'lowercase' ], [ 'x', 'z', 'lowercase original' ], + [ 'Cz', 'Cs', 'digraphs' ], [ 'C50D', 'C100', 'Numbers' ] ]; } @@ -50,8 +52,14 @@ [ 'afdsa', 'A' ], [ "\xF3\xB3\x80\x80Foo", 'D' ], [ "\xF3\xB3\x80\x81Foo", 'C' ], - [ "\xF3\xB3\x80\x82Foo", 'B' ], - [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ], + [ "\xF3\xB3\x80\x82Foo", 'Cs' ], + [ "\xF3\xB3\x80\x83Foo", 'B' ], + [ "\xF3\xB3\x80\x84Foo", "\xF3\xB3\x80\x84" ], + [ 'C', 'C' ], + [ 'Cz', 'C' ], + [ 'Cs', 'Cs' ], + [ 'CS', 'Cs' ], + [ 'cs', 'Cs' ], ]; } } -- To view, visit https://gerrit.wikimedia.org/r/399537 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4 Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Bartosz Dziewoński <matma....@gmail.com> Gerrit-Reviewer: Bartosz Dziewoński <matma....@gmail.com> Gerrit-Reviewer: Brian Wolff <bawolff...@gmail.com> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits