Jdlrobson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/363083 )

Change subject: WIP: more test cases
......................................................................

WIP: more test cases

Change-Id: I928299d5f008c37d3bc7d28ce226bf4f6be02bcc
---
M includes/ExtractFormatter.php
M tests/phpunit/ExtractFormatterTest.php
2 files changed, 8 insertions(+), 25 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/TextExtracts refs/changes/83/363083/1

diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php
index 48a4e88..dd714bb 100644
--- a/includes/ExtractFormatter.php
+++ b/includes/ExtractFormatter.php
@@ -212,31 +212,8 @@
         * @return string
         */
        protected static function getFirstSentencesPlain( $text, 
$requestedSentenceCount ) {
-               // Based on code from OpenSearchXml by Brion Vibber
-               $endchars = [
-                       '[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // 
regular ASCII
-                       '。', // full-width ideographic full-stop
-                       '.', '!', '?', // double-width roman forms
-                       '。', // half-width ideographic full stop
-                       ];
-
-               $endgroup = implode( '|', $endchars );
-               $regexp = "/($endgroup)+/u";
-
-               $matches = [];
-               $res = preg_match_all( $regexp, $text, $matches, 
PREG_OFFSET_CAPTURE );
-
-               if ( $res ) {
-                       $index = min( $requestedSentenceCount, $res ) - 1;
-                       list( $tail, $length ) = $matches[0][ $index ];
-                       // PCRE returns raw offsets, so using substr() instead 
of mb_substr()
-                       $text = substr( $text, 0, $length ) . trim( $tail );
-               } else {
-                       // Just return the first line
-                       $lines = explode( "\n", $text, 2 );
-                       $text = trim( $lines[0] );
-               }
-               return $text;
+               $sentences = self::explodeSentences( $text );
+               return implode( '', array_slice( $sentences, 0, 
$requestedSentenceCount ) );
        }
 
        /**
diff --git a/tests/phpunit/ExtractFormatterTest.php 
b/tests/phpunit/ExtractFormatterTest.php
index 97826ec..4c0318c 100644
--- a/tests/phpunit/ExtractFormatterTest.php
+++ b/tests/phpunit/ExtractFormatterTest.php
@@ -110,6 +110,12 @@
                                1,
                                "<p>It's good to stay at the Y.M.C.A. 
establishment.</p>"
                        ],
+                       // brackets are ignored
+                       [
+                               "<p><span>Jon Robson (b. 1985) wrote this 
test.</span><span>Feel free to refactor it.</span><span>honestly</span></p>",
+                               1,
+                               "<p><span>Jon Robson (b. 1985) wrote this 
test.</span></p>"
+                       ],
                        // Inappropriate use of `...` is fine
                        [
                                '<p>a. b... c. d. e. f. g.</p>',

-- 
To view, visit https://gerrit.wikimedia.org/r/363083
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I928299d5f008c37d3bc7d28ce226bf4f6be02bcc
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/TextExtracts
Gerrit-Branch: master
Gerrit-Owner: Jdlrobson <jrob...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to