jenkins-bot has submitted this change and it was merged.
( https://gerrit.wikimedia.org/r/399537 )

Change subject: Add collation for Abkhaz (ab)
......................................................................


Add collation for Abkhaz (ab)

* Add new class AbkhazUppercaseCollation, mapped to 'uppercase-ab'.
* Extend CustomUppercaseCollation with support for sorting digraphs
  and for alphabets larger than 64 letters (up to 4095).

Bug: T183430
Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4
---
M autoload.php
A includes/collation/AbkhazUppercaseCollation.php
M includes/collation/Collation.php
M includes/collation/CustomUppercaseCollation.php
M tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
5 files changed, 132 insertions(+), 17 deletions(-)

Approvals:
  Brian Wolff: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/autoload.php b/autoload.php
index 6b8387b..47c04b9 100644
--- a/autoload.php
+++ b/autoload.php
@@ -6,6 +6,7 @@
 $wgAutoloadLocalClasses = [
        'APCBagOStuff' => __DIR__ . 
'/includes/libs/objectcache/APCBagOStuff.php',
        'APCUBagOStuff' => __DIR__ . 
'/includes/libs/objectcache/APCUBagOStuff.php',
+       'AbkhazUppercaseCollation' => __DIR__ . 
'/includes/collation/AbkhazUppercaseCollation.php',
        'AbstractContent' => __DIR__ . '/includes/content/AbstractContent.php',
        'Action' => __DIR__ . '/includes/actions/Action.php',
        'ActiveUsersPager' => __DIR__ . 
'/includes/specials/pagers/ActiveUsersPager.php',
diff --git a/includes/collation/AbkhazUppercaseCollation.php 
b/includes/collation/AbkhazUppercaseCollation.php
new file mode 100644
index 0000000..e0ea237
--- /dev/null
+++ b/includes/collation/AbkhazUppercaseCollation.php
@@ -0,0 +1,93 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.31
+ *
+ * @file
+ */
+
+class AbkhazUppercaseCollation extends CustomUppercaseCollation {
+
+       public function __construct() {
+               parent::__construct( [
+                       'А',
+                       'Б',
+                       'В',
+                       'Г',
+                       'Гь',
+                       'Гә',
+                       'Ҕ',
+                       'Ҕь',
+                       'Ҕә',
+                       'Д',
+                       'Дә',
+                       'Е',
+                       'Ж',
+                       'Жь',
+                       'Жә',
+                       'З',
+                       'Ӡ',
+                       'Ӡә',
+                       'И',
+                       'К',
+                       'Кь',
+                       'Кә',
+                       'Қ',
+                       'Қь',
+                       'Қә',
+                       'Ҟ',
+                       'Ҟь',
+                       'Ҟә',
+                       'Л',
+                       'М',
+                       'Н',
+                       'О',
+                       'П',
+                       'Ҧ',
+                       'Р',
+                       'С',
+                       'Т',
+                       'Тә',
+                       'Ҭ',
+                       'Ҭә',
+                       'У',
+                       'Ф',
+                       'Х',
+                       'Хь',
+                       'Хә',
+                       'Ҳ',
+                       'Ҳә',
+                       'Ц',
+                       'Цә',
+                       'Ҵ',
+                       'Ҵә',
+                       'Ч',
+                       'Ҷ',
+                       'Ҽ',
+                       'Ҿ',
+                       'Ш',
+                       'Шь',
+                       'Шә',
+                       'Ы',
+                       'Ҩ',
+                       'Џ',
+                       'Џь',
+                       'ь',
+                       'ә',
+               ], Language::factory( 'ab' ) );
+       }
+}
diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php
index 7171a21..30cae5a 100644
--- a/includes/collation/Collation.php
+++ b/includes/collation/Collation.php
@@ -65,6 +65,8 @@
                                return new CollationEt;
                        case 'xx-uca-fa':
                                return new CollationFa;
+                       case 'uppercase-ab':
+                               return new AbkhazUppercaseCollation;
                        case 'uppercase-ba':
                                return new BashkirUppercaseCollation;
                        case 'uppercase-se':
diff --git a/includes/collation/CustomUppercaseCollation.php 
b/includes/collation/CustomUppercaseCollation.php
index 301972d..170d5c2 100644
--- a/includes/collation/CustomUppercaseCollation.php
+++ b/includes/collation/CustomUppercaseCollation.php
@@ -32,6 +32,7 @@
  * conflicts with other people using private use area)
  *
  * This does not support fancy things like secondary differences, etc.
+ * (It supports digraphs, trigraphs etc. though.)
  *
  * It is expected most people will subclass this and just override the
  * constructor to hard-code an alphabet.
@@ -45,25 +46,30 @@
        private $puaSubset;
 
        /**
-        * @note This assumes $alphabet does not contain U+F3000-U+F303F
+        * @note This assumes $alphabet does not contain U+F3000-U+F3FFF
         *
         * @param array $alphabet Sorted array of uppercase characters.
         * @param Language $lang What language for number sorting.
         */
        public function __construct( array $alphabet, Language $lang ) {
-               // It'd be trivial to extend this past 64, you'd just
-               // need a bit of bit-fiddling. Doesn't seem necessary right
-               // now.
-               if ( count( $alphabet ) < 1 || count( $alphabet ) >= 64 ) {
-                       throw new UnexpectedValueException( "Alphabet must be < 
64 items" );
+               if ( count( $alphabet ) < 1 || count( $alphabet ) >= 4096 ) {
+                       throw new UnexpectedValueException( "Alphabet must be < 
4096 items" );
                }
-               $this->alphabet = $alphabet;
+               $this->firstLetters = $alphabet;
+               // For digraphs, only the first letter is capitalized in input
+               $this->alphabet = array_map( [ $lang, 'uc' ], $alphabet );
 
                $this->puaSubset = [];
                $len = count( $alphabet );
                for ( $i = 0; $i < $len; $i++ ) {
-                       $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 );
+                       $this->puaSubset[] = "\xF3\xB3" . chr( floor( $i / 64 ) 
+ 128 ) . chr( ( $i % 64 ) + 128 );
                }
+
+               // Sort these arrays so that any trigraphs, digraphs etc. are 
first
+               // (and they get replaced first in convertToPua()).
+               $lengths = array_map( 'mb_strlen', $this->alphabet );
+               array_multisort( $lengths, SORT_DESC, $this->firstLetters, 
$this->alphabet, $this->puaSubset );
+
                parent::__construct( $lang );
        }
 
@@ -76,12 +82,17 @@
        }
 
        public function getFirstLetter( $string ) {
-               // In case a title has a PUA code in it, make it sort
-               // under the header for the character it would replace
-               // to avoid inconsistent behaviour. This class mostly
-               // assumes that people will not use PUA codes.
-               return parent::getFirstLetter(
-                       str_replace( $this->puaSubset, $this->alphabet, $string 
)
-               );
+               $sortkey = $this->getSortKey( $string );
+
+               // In case a title begins with a character from our alphabet, 
return the corresponding
+               // first-letter. (This also happens if the title has a 
corresponding PUA code in it, to avoid
+               // inconsistent behaviour. This class mostly assumes that 
people will not use PUA codes.)
+               $index = array_search( substr( $sortkey, 0, 4 ), 
$this->puaSubset );
+               if ( $index !== false ) {
+                       return $this->firstLetters[ $index ];
+               }
+
+               // String begins with a character outside of our alphabet, fall 
back
+               return parent::getFirstLetter( $string );
        }
 }
diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php 
b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
index 5d5317b..90c097d 100644
--- a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
+++ b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
@@ -6,6 +6,7 @@
                $this->collation = new CustomUppercaseCollation( [
                        'D',
                        'C',
+                       'Cs',
                        'B'
                ], Language::factory( 'en' ) );
 
@@ -31,6 +32,7 @@
                        [ '💩 ', 'C', 'Test relocated to end' ],
                        [ 'c', 'b', 'lowercase' ],
                        [ 'x', 'z', 'lowercase original' ],
+                       [ 'Cz', 'Cs', 'digraphs' ],
                        [ 'C50D', 'C100', 'Numbers' ]
                ];
        }
@@ -50,8 +52,14 @@
                        [ 'afdsa', 'A' ],
                        [ "\xF3\xB3\x80\x80Foo", 'D' ],
                        [ "\xF3\xB3\x80\x81Foo", 'C' ],
-                       [ "\xF3\xB3\x80\x82Foo", 'B' ],
-                       [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ],
+                       [ "\xF3\xB3\x80\x82Foo", 'Cs' ],
+                       [ "\xF3\xB3\x80\x83Foo", 'B' ],
+                       [ "\xF3\xB3\x80\x84Foo", "\xF3\xB3\x80\x84" ],
+                       [ 'C', 'C' ],
+                       [ 'Cz', 'C' ],
+                       [ 'Cs', 'Cs' ],
+                       [ 'CS', 'Cs' ],
+                       [ 'cs', 'Cs' ],
                ];
        }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/399537
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Bartosz Dziewoński <matma....@gmail.com>
Gerrit-Reviewer: Bartosz Dziewoński <matma....@gmail.com>
Gerrit-Reviewer: Brian Wolff <bawolff...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to