Florianschmidtwelzow has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/363033 )

Change subject: Rewrite domain host parsing part of EmailDomain
......................................................................

Rewrite domain host parsing part of EmailDomain

This also adds some tests for EmailDomain (written before I refactored
the domain host parsing part).

The domain list (public suffix list) is not downloaded "on-the-fly"
anymore, but is now required to be downloaded beforehand using the
new maintenance script. The parsing part is also simplified; credit
goes to mgutt [1], whose answer I used as inspiration.

[1] https://stackoverflow.com/a/9632782/3394281

Change-Id: I006de79ca6217ec7da662a04b5baff000058c98c
---
M .gitignore
M includes/Constants.php
M includes/alloweddomains/EmailDomain.php
A maintenance/updatePublicSuffixArray.php
A tests/phpunit/includes/alloweddomains/EmailDomainTest.php
5 files changed, 155 insertions(+), 107 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/GoogleLogin 
refs/changes/33/363033/1

diff --git a/.gitignore b/.gitignore
index d62d8a9..8a3f223 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 composer.phar
 cache
 composer.lock
+publicSuffixArray.php
\ No newline at end of file
diff --git a/includes/Constants.php b/includes/Constants.php
index 710b7e8..c7055da 100644
--- a/includes/Constants.php
+++ b/includes/Constants.php
@@ -10,4 +10,5 @@
 
 class Constants {
        const SERVICE_ALLOWED_DOMAINS_STORE = 'GoogleLogin.AllowedDomainsStore';
+       const PUBLIC_SUFFIX_ARRAY_FILE = 'publicSuffixArray.php';
 }
diff --git a/includes/alloweddomains/EmailDomain.php 
b/includes/alloweddomains/EmailDomain.php
index cd9c4fb..e483e13 100644
--- a/includes/alloweddomains/EmailDomain.php
+++ b/includes/alloweddomains/EmailDomain.php
@@ -2,7 +2,7 @@
 
 namespace GoogleLogin\AllowedDomains;
 
-use MWException;
+use GoogleLogin\Constants;
 
 /**
  * Represents a single E-Mail address.
@@ -11,7 +11,12 @@
  */
 class EmailDomain {
        private $emailAddress;
-       private $domainHost;
+       private $domainHost = '';
+
+       /**
+        * @var array
+        */
+       private $publicSuffixes;
 
        /**
         * EmailDomain constructor.
@@ -21,8 +26,14 @@
         *  t...@test.example.com will be converted to example.com if this is 
false)
         */
        public function __construct( $mail, $strict = false ) {
+               $this->publicSuffixes =
+                       array_flip( include __DIR__ . '/../../' . 
Constants::PUBLIC_SUFFIX_ARRAY_FILE );
+
                $this->emailAddress = $mail;
-               $this->domainHost = $this->parseHost( $mail, $strict );
+               $domain = explode( '@', $mail );
+               if ( isset( $domain[1] ) ) {
+                       $this->domainHost = $this->parseHost( $domain[1], 
$strict );
+               }
        }
 
        /**
@@ -46,121 +57,32 @@
         * Returns the domain and tld (without subdomains) of the provided 
E-Mailadress
         * @param string $domain The domain part of the email address to 
extract from.
         * @return string The Tld and domain of $domain without subdomains
-        * @see 
http://www.programmierer-forum.de/domainnamen-ermitteln-t244185.htm
         */
        private function parseHost( $domain = '', $strict ) {
-               $dir = __DIR__ . "/../..";
                if ( $strict ) {
-                       $domain = explode( '@', $domain );
                        // we can trust google to give us only valid email 
address, so give the last element
-                       return array_pop( $domain );
+                       return $domain;
                }
-               // for parse_url()
-               $domain =
-                       !isset( $domain[5] ) ||
-                       (
-                               $domain[3] != ':' &&
-                               $domain[4] != ':' &&
-                               $domain[5] != ':'
-                       ) ? 'http://' . $domain : $domain;
-               // remove "/path/file.html", "/:80", etc.
-               $domain = parse_url( $domain, PHP_URL_HOST );
-               // separate domain level
-               $lvl = explode( '.', $domain ); // 0 => www, 1 => example, 2 => 
co, 3 => uk
-               // set levels
-               krsort( $lvl ); // 3 => uk, 2 => co, 1 => example, 0 => www
-               $lvl = array_values( $lvl ); // 0 => uk, 1 => co, 2 => example, 
3 => www
-               $_1st = $lvl[0];
-               $_2nd = isset( $lvl[1] ) ? $lvl[1] . '.' . $_1st : false;
-               $_3rd = isset( $lvl[2] ) ? $lvl[2] . '.' . $_2nd : false;
-               $_4th = isset( $lvl[3] ) ? $lvl[3] . '.' . $_3rd : false;
 
-               // tld extract
-               if ( !file_exists( "$dir/cache/tld.txt" ) ) {
-                       $this->createTLDCache( "$dir/cache/tld.txt" );
-               }
-               require "$dir/cache/tld.txt";
-               $tlds = array_flip( $tlds );
-               if ( // fourth level is TLD
-                       $_4th &&
-                       !isset( $tlds[ '!' . $_4th ] ) &&
-                       (
-                               isset( $tlds[ $_4th ] ) ||
-                               isset( $tlds[ '*.' . $_3rd ] )
-                       )
-               ) {
-                       $domain = isset( $lvl[4] ) ? $lvl[4] . '.' . $_4th : 
false;
-               } elseif ( // third level is TLD
-                       $_3rd &&
-                       !isset( $tlds[ '!' . $_3rd ] ) &&
-                       (
-                               isset( $tlds[ $_3rd ] ) ||
-                               isset( $tlds[ '*.' . $_2nd ] )
-                       )
-               ) {
-                       $domain = $_4th;
-               } elseif ( // second level is TLD
-                       !isset( $tlds[ '!' . $_2nd ] ) &&
-                       (
-                               isset( $tlds[ $_2nd ] ) ||
-                               isset( $tlds[ '*.' . $_1st ] )
-                       )
-               ) {
-                       $domain = $_3rd;
-               } else { // first level is TLD
-                       $domain = $_2nd;
-               }
-               return $domain;
+               $url = explode( '.', $domain );
+
+               return $this->getDomainPart( $url );
        }
 
        /**
-        * Creates the TLD cache from which the valid tld of mail domain comes 
from.
-        * @param string $cacheFile The file to create the cache too (must be 
writeable for the
-        * webserver!)
-        * @param int $max_tl How deep the domain list is (enclude 
example.co.uk (2) or
-        * example.lib.wy.us (3)?)
-        * @see 
http://www.programmierer-forum.de/domainnamen-ermitteln-t244185.htm
-        * @throws MWException
+        * @param string[] $url The host split into its dot-separated labels
+        * @return string
         */
-       private function createTLDCache( $cacheFile, $max_tl = 2 ) {
-               $cacheFolder = str_replace( basename( $cacheFile ), '', 
$cacheFile );
-               if ( !is_writable( $cacheFolder ) ) {
-                       throw new MWException( $cacheFolder . ' is not 
writeable!' );
-               }
-               $tlds = file(
-                       
'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
-               );
-               if ( $tlds === false ) {
-                       throw new MWException( 'Domainlist can not be 
downloaded!' );
-               }
-               $i = 0;
-               // remove unnecessary lines
-               foreach ( $tlds as $tld ) {
-                       $tlds[ $i ] = trim( $tld );
-                       /**
-                        *      empty
-                        *      comments
-                        *      top level domains
-                        *      is overboard
-                        */
-                       if (
-                               !$tlds[ $i ] ||
-                               $tld[0] == '/' ||
-                               strpos( $tld, '.' ) === false ||
-                               substr_count( $tld, '.' ) >= $max_tl
-                       ) {
-                               unset( $tlds[ $i ] );
+       private function getDomainPart( $url ) {
+               $parts = array_reverse( $url );
+               foreach ( $parts as $key => $part ) {
+                       $tld = implode( '.', $parts );
+                       if ( isset( $this->publicSuffixes[$tld] ) ) {
+                               return implode( '.', array_slice( $url, $key - 
1 ) );
                        }
-                       $i++;
+                       array_pop( $parts );
                }
-               $tlds = array_values( $tlds );
-               file_put_contents(
-                       $cacheFile,
-                       "<?php\n" . '$tlds = ' . str_replace(
-                               [ ' ', "\n" ],
-                               '',
-                               var_export( $tlds, true )
-                       ) . ";\n?" . ">"
-               );
+
+               return implode( '.', $url );
        }
 }
diff --git a/maintenance/updatePublicSuffixArray.php 
b/maintenance/updatePublicSuffixArray.php
new file mode 100644
index 0000000..0644ebf
--- /dev/null
+++ b/maintenance/updatePublicSuffixArray.php
@@ -0,0 +1,66 @@
+<?php
+/**
+ * Updates the public suffix array used by GoogleLogin for domain recognition
+ *
+ * @ingroup Maintenance
+ */
+
+require_once getenv( 'MW_INSTALL_PATH' ) !== false
+       ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php'
+       : __DIR__ . '/../../../maintenance/Maintenance.php';
+
+/**
+ * Maintenance script that updates the public suffix array.
+ *
+ * @ingroup Maintenance
+ */
+class UpdatePublicSuffixArray extends Maintenance {
+       public function __construct() {
+               parent::__construct();
+
+               $this->mDescription = 'Updates the list of public suffixes used 
for domain recognition.';
+               $this->requireExtension( 'GoogleLogin' );
+       }
+
+       public function execute() {
+               $arrayDirectory = __DIR__ . '/../';
+               if ( !is_writable( $arrayDirectory ) ) {
+                       throw new MWException( $arrayDirectory . ' is not 
writeable!' );
+               }
+               $publicSuffixList = file(
+                       'https://publicsuffix.org/list/public_suffix_list.dat'
+               );
+               if ( $publicSuffixList === false ) {
+                       throw new MWException( 'Domainlist can not be 
downloaded!' );
+               }
+               $publicSuffixes = [];
+
+               foreach ( $publicSuffixList as $suffix ) {
+                       $suffix = trim( $suffix );
+
+                       if ( !$suffix || strpos( $suffix, '/' ) === 0 ) {
+                               continue;
+                       }
+                       if ( strpos( $suffix, '*.' ) !== false ) {
+                               $suffix = substr( $suffix, 2 );
+                       }
+                       if ( strpos( $suffix, '!' ) === 0 ) {
+                               $suffix = substr( $suffix, 1 );
+                       }
+                       $suffix = implode( '.', array_reverse(
+                               explode(
+                                       '.',
+                                       $suffix
+                               )
+                       ) );
+                       $publicSuffixes[] = $suffix;
+               }
+
+               file_put_contents(
+                       $arrayDirectory . 
\GoogleLogin\Constants::PUBLIC_SUFFIX_ARRAY_FILE,
+                       "<?php\n" . 'return [ "' . implode( "\",\n\"", 
$publicSuffixes ) . '" ];'
+               );
+       }
+}
+$maintClass = 'UpdatePublicSuffixArray'; // Tells it to run the class
+require_once RUN_MAINTENANCE_IF_MAIN;
\ No newline at end of file
diff --git a/tests/phpunit/includes/alloweddomains/EmailDomainTest.php 
b/tests/phpunit/includes/alloweddomains/EmailDomainTest.php
new file mode 100644
index 0000000..798a8f5
--- /dev/null
+++ b/tests/phpunit/includes/alloweddomains/EmailDomainTest.php
@@ -0,0 +1,58 @@
+<?php
+
+namespace GoogleLogin\AllowedDomains;
+
+class EmailDomainTest extends \MediaWikiTestCase {
+       /**
+        * @var EmailDomain
+        */
+       private $googleMail;
+
+       /**
+        * @var EmailDomain
+        */
+       private $subdomainMail;
+
+       /**
+        * @var EmailDomain
+        */
+       private $emptyMail;
+
+       /**
+        * @var EmailDomain
+        */
+       private $twoSuffixMail;
+
+       public function setUp() {
+               parent::setUp();
+               $this->googleMail = new EmailDomain( 't...@gmail.com', false );
+               $this->emptyMail = new EmailDomain( '', false );
+               $this->subdomainMail = new EmailDomain( 
't...@my.subdomain.com', false );
+               $this->twoSuffixMail = new EmailDomain( 
't...@my.subdomain.co.us', false );
+       }
+
+       public function testGetEMail() {
+               $this->assertEquals( 't...@gmail.com', 
$this->googleMail->getEmail() );
+               $this->assertEquals( '', $this->emptyMail->getEmail() );
+               $this->assertEquals( 't...@my.subdomain.com', 
$this->subdomainMail->getEmail() );
+               $this->assertEquals( 't...@my.subdomain.co.us', 
$this->twoSuffixMail->getEmail() );
+       }
+
+       public function testGetHost() {
+               $this->assertEquals( '', $this->emptyMail->getHost() );
+               $this->assertEquals( 'gmail.com', $this->googleMail->getHost() 
);
+               $this->assertEquals( 'subdomain.com', 
$this->subdomainMail->getHost() );
+               $this->assertEquals( 'subdomain.co.us', 
$this->twoSuffixMail->getHost() );
+       }
+
+       public function testGetHostStrict() {
+               $emptyMailStrict = new EmailDomain( '', true );
+               $googleMailStrict = new EmailDomain( 't...@gmail.com', true );
+               $subdomainMailStrict = new EmailDomain( 
't...@my.subdomain.com', true );
+               $twoSuffixMailStrict = new EmailDomain( 
't...@my.subdomain.co.us', true );
+               $this->assertEquals( '', $emptyMailStrict->getHost() );
+               $this->assertEquals( 'gmail.com', $googleMailStrict->getHost() 
);
+               $this->assertEquals( 'my.subdomain.com', 
$subdomainMailStrict->getHost() );
+               $this->assertEquals( 'my.subdomain.co.us', 
$twoSuffixMailStrict->getHost() );
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/363033
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I006de79ca6217ec7da662a04b5baff000058c98c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/GoogleLogin
Gerrit-Branch: master
Gerrit-Owner: Florianschmidtwelzow <florian.schmidt.stargatewis...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to