jenkins-bot has submitted this change and it was merged.
Change subject: Remove parts of rendered page from search.
......................................................................
Remove parts of rendered page from search.
We remove the toc, the edit tokens, and the contents of <video> tags
from search. Like the contents of the script tags that are stripped
when removing HTML tags, we just don't want to search (or highlight!)
these.
Bug: 52906
Change-Id: Idf7b6b4e56422e0510bdcd3a4c7aecf24f8d72ca
---
M CirrusSearch.body.php
1 file changed, 29 insertions(+), 1 deletion(-)
Approvals:
Demon: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CirrusSearch.body.php b/CirrusSearch.body.php
index 458b9da..199d348 100644
--- a/CirrusSearch.body.php
+++ b/CirrusSearch.body.php
@@ -26,6 +26,13 @@
const CONTENT_INDEX_TYPE = 'content';
const GENERAL_INDEX_TYPE = 'general';
/**
+ * Regex to remove text we don't want to search but that isn't already
+ * removed when stripping HTML or the toc.
+ */
+ const SANITIZE = '/
+ <video .*?<\/video> # remove the sorry, not supported message
+ /x';
+ /**
* Maximum title length that we'll check in prefix search. Since
titles can
* be 255 bytes in length we're setting this to 255 characters but this
* might cause bloat in the title's prefix index so we'll have to keep
an
@@ -450,7 +457,11 @@
switch ( $c->getModel() ) {
case CONTENT_MODEL_WIKITEXT:
$article = new Article( $t, 0 );
- $text =
$article->getParserOutput()->getText();
+ $parserOutput =
$article->getParserOutput();
+ $parserOutput->setEditSectionTokens(
false ); // Don't add edit tokens
+ $text = $parserOutput->getText();
// Fetch the page
+ $text = $this->stripToc( $text );
// Strip the table of contents
+ $text = preg_replace( self::SANITIZE,
'', $text ); // Strip other non-searchable text
break;
default:
$text = SearchUpdate::updateText( $text
);
@@ -463,6 +474,23 @@
public function textAlreadyUpdatedForIndex() {
return true;
}
+
+ /**
+ * Strip the table of contents from a rendered page. Note that we
don't use
+ * regexes for this because we're removing whole lines.
+ *
+ * @param string $text the rendered page
+ * @return string the rendered page without the toc
+ */
+ private function stripToc( $text ) {
+ $t = explode( "\n", $text );
+ $t = array_filter( $t, function( $line ) {
+ return strpos( $line, 'id="toctitle"' ) === false &&
// Strip the beginning of the toc
+ strpos( $line, 'class="tocnumber"') === false
&& // And any lines with toc numbers
+ strpos( $line, 'class="toctext"') === false;
// And finally lines with toc text
+ });
+ return implode( "\n", $t );
+ }
}
/**
--
To view, visit https://gerrit.wikimedia.org/r/80018
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Idf7b6b4e56422e0510bdcd3a4c7aecf24f8d72ca
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Demon <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits