EBernhardson has submitted this change and it was merged. Change subject: Ignore results that aren't valid wiki articles ......................................................................
Ignore results that arnt valid wiki articles Occasionally some engines return urls that point to search, or to the top level domain. Ignore them rather than bailing out. Change-Id: Ifadfc60cf96cbe6463457c217d72764c9d79749b --- M src/RelevanceScoring/Import/HtmlResultGetter.php A tests/unit/RelevanceScoring/Import/HtmlResultGetter.php 2 files changed, 131 insertions(+), 9 deletions(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/src/RelevanceScoring/Import/HtmlResultGetter.php b/src/RelevanceScoring/Import/HtmlResultGetter.php index c287881..6db5bb2 100644 --- a/src/RelevanceScoring/Import/HtmlResultGetter.php +++ b/src/RelevanceScoring/Import/HtmlResultGetter.php @@ -58,10 +58,6 @@ ]); } - private function getWikiDomain($wiki) - { - return parse_url($this->wikis[$wiki], PHP_URL_HOST); - } /** * @param ResponseInterface $response * @param string $wiki @@ -74,7 +70,6 @@ public function handleResponse(ResponseInterface $response, $wiki, $query) { if ($response->getStatusCode() !== 200) { - var_dump($response); throw new RuntimeException('Failed search'); } @@ -87,16 +82,14 @@ throw new RuntimeException('No results section'); } - $domain = strtolower($this->getWikiDomain($wiki)); $results = []; foreach ($doc[$this->selectors['results']] as $result) { $pq = \pq($result); $url = $pq[$this->selectors['url']]->attr('href'); - $urlDomain = strtolower(parse_url($url, PHP_URL_HOST)); - if ($urlDomain === $domain) { + if ($this->isValidWikiArticle($wiki, $url)) { $results[] = ImportedResult::createFromURL( $this->source, - $pq[$this->selectors['url']]->attr('href'), + $url, $pq[$this->selectors['snippet']]->text(), count($results) ); @@ -105,4 +98,40 @@ return $results; } + + /** + * @param string $wiki + * @return string + */ + private function getWikiDomain($wiki) + { + return parse_url($this->wikis[$wiki], PHP_URL_HOST); + } + + /** + * @param string $url + * @return bool + */ + private function isValidWikiArticle($wiki, $url) + { + + $parts = 
parse_url($url); + + $domain = strtolower($this->getWikiDomain($wiki)); + $urlDomain = strtolower($parts['host']); + if ($urlDomain !== $domain) { + return false; + } + + if (strlen($parts['path']) > 6 && substr($parts['path'], 0, 6) === '/wiki/') { + return true; + } + + if (empty($parts['query'])) { + return false; + } + + parse_str($parts['query'], $query); + return !empty($query['title']); + } } diff --git a/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php b/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php new file mode 100644 index 0000000..93dda2c --- /dev/null +++ b/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php @@ -0,0 +1,93 @@ +<?php + +namespace WikiMedia\RelevanceScoring\Import; + +class HtmlResultGetterTest extends \PHPUnit_Framework_TestCase +{ + + public static function somethingProvider() + { + $selectors = [ + 'is_valid' => 'body', + 'results' => 'li', + 'url' => 'a', + 'snippet' => 'p', + ]; + + $genHtml = function (array $results) { + $content = ''; + foreach ($results as $url => $snippet) { + $content .= "<li><a href='$url'>some text</a>"; + $content .= "<p>$snippet</p></li>"; + } + return "<html><body><ul>$content</ul></body></html>"; + }; + + return [ + 'simple wiki article' => [ + $selectors, + $genHtml(['https://test.wikipedia.org/wiki/Subject' => 'blah blah blah']), + // expected results + [new ImportedResult('unittest', 'Subject', 'blah blah blah', 0)] + ], + 'article in query string' => [ + $selectors, + $genHtml(['https://test.wikipedia.org/w/index.php?title=Other' => 'foo bar baz']), + [new ImportedResult('unittest', 'Other', 'foo bar baz', 0)] + ], + 'multiple articles' => [ + $selectors, + $genHtml([ + 'https://test.wikipedia.org/wiki/Other' => 'foo bar baz', + 'https://test.wikipedia.org/w/index.php?title=Thing' => 'bamboozle', + ]), + [ + new ImportedResult('unittest', 'Other', 'foo bar baz', 0), + new ImportedResult('unittest', 'Thing', 'bamboozle', 1), + ] + ], + 'decodes entities' => [ + $selectors, + 
$genHtml(['https://test.wikipedia.org/wiki/This_%26_That' => 'a > b']), + [new ImportedResult('unittest', 'This & That', 'a > b', 0)] + ], + 'ignores unexpected urls' => [ + $selectors, + $genHtml([ + 'https://test.wikipedia.org/?search=stuff' => 'fofofofo', + 'https://not.us/wiki/Coffee' => 'tea', + 'https://test.wikipedia.org/wiki/' => 'still wrong', + ]), + [] + ], + ]; + } + + /** + * @dataProvider somethingProvider + */ + public function testSomething(array $selectors, $html, $expected) + { + $client = $this->getMock('GuzzleHTTP\\Client'); + $response = $this->getMock('Psr\\Http\\Message\\ResponseInterface'); + $response->expects($this->any()) + ->method('getStatusCode') + ->will($this->returnValue(200)); + $response->expects($this->any()) + ->method('getBody') + ->will($this->returnValue($html)); + + + $getter = new HtmlResultGetter( + $client, + ['testwiki' => 'https://test.wikipedia.org/w/api.php'], + 'unittest', + 'https://test.wikipedia.org/w/index.php', + $selectors, + [] + ); + + $this->assertEquals($expected, $getter->handleResponse($response, 'testwiki', '')); + } +} + -- To view, visit https://gerrit.wikimedia.org/r/286758 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ifadfc60cf96cbe6463457c217d72764c9d79749b Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/discovery/discernatron Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits