EBernhardson has submitted this change and it was merged.

Change subject: Ignore results that aren't valid wiki articles
......................................................................


Ignore results that aren't valid wiki articles

Occasionally some engines return URLs that point to a search page or to the
top-level domain. Ignore them rather than bailing out.

Change-Id: Ifadfc60cf96cbe6463457c217d72764c9d79749b
---
M src/RelevanceScoring/Import/HtmlResultGetter.php
A tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
2 files changed, 131 insertions(+), 9 deletions(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/src/RelevanceScoring/Import/HtmlResultGetter.php 
b/src/RelevanceScoring/Import/HtmlResultGetter.php
index c287881..6db5bb2 100644
--- a/src/RelevanceScoring/Import/HtmlResultGetter.php
+++ b/src/RelevanceScoring/Import/HtmlResultGetter.php
@@ -58,10 +58,6 @@
         ]);
     }
 
-    private function getWikiDomain($wiki)
-    {
-        return parse_url($this->wikis[$wiki], PHP_URL_HOST);
-    }
     /**
      * @param ResponseInterface $response
      * @param string            $wiki
@@ -74,7 +70,6 @@
     public function handleResponse(ResponseInterface $response, $wiki, $query)
     {
         if ($response->getStatusCode() !== 200) {
-            var_dump($response);
             throw new RuntimeException('Failed search');
         }
 
@@ -87,16 +82,14 @@
             throw new RuntimeException('No results section');
         }
 
-        $domain = strtolower($this->getWikiDomain($wiki));
         $results = [];
         foreach ($doc[$this->selectors['results']] as $result) {
             $pq = \pq($result);
             $url = $pq[$this->selectors['url']]->attr('href');
-            $urlDomain = strtolower(parse_url($url, PHP_URL_HOST));
-            if ($urlDomain === $domain) {
+            if ($this->isValidWikiArticle($wiki, $url)) {
                 $results[] = ImportedResult::createFromURL(
                     $this->source,
-                    $pq[$this->selectors['url']]->attr('href'),
+                    $url,
                     $pq[$this->selectors['snippet']]->text(),
                     count($results)
                 );
@@ -105,4 +98,40 @@
 
         return $results;
     }
+
+    /**
+     * @param string $wiki
+     * @return string
+     */
+    private function getWikiDomain($wiki)
+    {
+        return parse_url($this->wikis[$wiki], PHP_URL_HOST);
+    }
+
+    /**
+     * @param string $url
+     * @return bool
+     */
+    private function isValidWikiArticle($wiki, $url)
+    {
+
+        $parts = parse_url($url);
+
+        $domain = strtolower($this->getWikiDomain($wiki));
+        $urlDomain = strtolower($parts['host']);
+        if ($urlDomain !== $domain) {
+            return false;
+        }
+
+        if (strlen($parts['path']) > 6 && substr($parts['path'], 0, 6) === 
'/wiki/') {
+            return true;
+        }
+
+        if (empty($parts['query'])) {
+            return false;
+        }
+
+        parse_str($parts['query'], $query);
+        return !empty($query['title']);
+    }
 }
diff --git a/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php 
b/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
new file mode 100644
index 0000000..93dda2c
--- /dev/null
+++ b/tests/unit/RelevanceScoring/Import/HtmlResultGetter.php
@@ -0,0 +1,93 @@
+<?php
+
+namespace WikiMedia\RelevanceScoring\Import;
+
+class HtmlResultGetterTest extends \PHPUnit_Framework_TestCase
+{
+
+       public static function somethingProvider()
+       {
+               $selectors = [
+                       'is_valid' => 'body',
+                       'results' => 'li',
+                       'url' => 'a',
+                       'snippet' => 'p',
+               ];
+
+               $genHtml = function (array $results) {
+                       $content = '';
+                       foreach ($results as $url => $snippet) {
+                               $content .= "<li><a href='$url'>some text</a>";
+                               $content .= "<p>$snippet</p></li>";
+                       }
+                       return "<html><body><ul>$content</ul></body></html>";
+               };
+
+               return [
+                       'simple wiki article' => [
+                               $selectors,
+                               
$genHtml(['https://test.wikipedia.org/wiki/Subject' => 'blah blah blah']),
+                               // expected results
+                               [new ImportedResult('unittest', 'Subject', 
'blah blah blah', 0)]
+                       ],
+                       'article in query string' => [
+                               $selectors,
+                               
$genHtml(['https://test.wikipedia.org/w/index.php?title=Other' => 'foo bar 
baz']),
+                               [new ImportedResult('unittest', 'Other', 'foo 
bar baz', 0)]
+                       ],
+                       'multiple articles' => [
+                               $selectors,
+                               $genHtml([
+                                       'https://test.wikipedia.org/wiki/Other' 
=> 'foo bar baz',
+                                       
'https://test.wikipedia.org/w/index.php?title=Thing' => 'bamboozle',
+                               ]),
+                               [
+                                       new ImportedResult('unittest', 'Other', 
'foo bar baz', 0),
+                                       new ImportedResult('unittest', 'Thing', 
'bamboozle', 1),
+                               ]
+                       ],
+                       'decodes entities' => [
+                               $selectors,
+                               
$genHtml(['https://test.wikipedia.org/wiki/This_%26_That' => 'a &gt; b']),
+                               [new ImportedResult('unittest', 'This & That', 
'a > b', 0)]
+                       ],
+                       'ignores unexpected urls' => [
+                               $selectors,
+                               $genHtml([
+                                       
'https://test.wikipedia.org/?search=stuff' => 'fofofofo',
+                                       'https://not.us/wiki/Coffee' => 'tea',
+                                       'https://test.wikipedia.org/wiki/' => 
'still wrong',
+                               ]),
+                               []
+                       ],
+               ];
+       }
+
+       /**
+        * @dataProvider somethingProvider
+        */
+       public function testSomething(array $selectors, $html, $expected)
+       {
+               $client = $this->getMock('GuzzleHTTP\\Client');
+               $response = 
$this->getMock('Psr\\Http\\Message\\ResponseInterface');
+               $response->expects($this->any())
+                       ->method('getStatusCode')
+                       ->will($this->returnValue(200));
+               $response->expects($this->any())
+                       ->method('getBody')
+                       ->will($this->returnValue($html));
+
+
+               $getter = new HtmlResultGetter(
+                       $client,
+                       ['testwiki' => 'https://test.wikipedia.org/w/api.php'],
+                       'unittest',
+                       'https://test.wikipedia.org/w/index.php',
+                       $selectors,
+                       []
+               );
+
+               $this->assertEquals($expected, 
$getter->handleResponse($response, 'testwiki', ''));
+       }
+}
+

-- 
To view, visit https://gerrit.wikimedia.org/r/286758
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ifadfc60cf96cbe6463457c217d72764c9d79749b
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/discernatron
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to