Jdlrobson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/363083 )
Change subject: WIP: more test cases ...................................................................... WIP: more test cases Change-Id: I928299d5f008c37d3bc7d28ce226bf4f6be02bcc --- M includes/ExtractFormatter.php M tests/phpunit/ExtractFormatterTest.php 2 files changed, 8 insertions(+), 25 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/TextExtracts refs/changes/83/363083/1 diff --git a/includes/ExtractFormatter.php b/includes/ExtractFormatter.php index 48a4e88..dd714bb 100644 --- a/includes/ExtractFormatter.php +++ b/includes/ExtractFormatter.php @@ -212,31 +212,8 @@ * @return string */ protected static function getFirstSentencesPlain( $text, $requestedSentenceCount ) { - // Based on code from OpenSearchXml by Brion Vibber - $endchars = [ - '[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // regular ASCII - '。', // full-width ideographic full-stop - '.', '!', '?', // double-width roman forms - '。', // half-width ideographic full stop - ]; - - $endgroup = implode( '|', $endchars ); - $regexp = "/($endgroup)+/u"; - - $matches = []; - $res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE ); - - if ( $res ) { - $index = min( $requestedSentenceCount, $res ) - 1; - list( $tail, $length ) = $matches[0][ $index ]; - // PCRE returns raw offsets, so using substr() instead of mb_substr() - $text = substr( $text, 0, $length ) . trim( $tail ); - } else { - // Just return the first line - $lines = explode( "\n", $text, 2 ); - $text = trim( $lines[0] ); - } - return $text; + $sentences = self::explodeSentences( $text ); + return implode( '', array_slice( $sentences, 0, $requestedSentenceCount ) ); } /** diff --git a/tests/phpunit/ExtractFormatterTest.php b/tests/phpunit/ExtractFormatterTest.php index 97826ec..4c0318c 100644 --- a/tests/phpunit/ExtractFormatterTest.php +++ b/tests/phpunit/ExtractFormatterTest.php @@ -110,6 +110,12 @@ 1, "<p>It's good to stay at the Y.M.C.A. 
establishment.</p>" ], + // brackets are ignored + [ + "<p><span>Jon Robson (b. 1985) wrote this test.</span><span>Feel free to refactor it.</span><span>honestly</span></p>", + 1, + "<p><span>Jon Robson (b. 1985) wrote this test.</span></p>" + ], // Inappropriate use of `...` is fine [ '<p>a. b... c. d. e. f. g.</p>', -- To view, visit https://gerrit.wikimedia.org/r/363083 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I928299d5f008c37d3bc7d28ce226bf4f6be02bcc Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/TextExtracts Gerrit-Branch: master Gerrit-Owner: Jdlrobson <jrob...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits