Florianschmidtwelzow has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/363033 )
Change subject: Rewrite domain host parsing part of EmailDomain ...................................................................... Rewrite domain host parsing part of EmailDomain This also adds some tests to EmailDomain (written before the domain host parsing part was refactored). The domain list (public suffix list) is not downloaded "on-the-fly" anymore, but is now required to be downloaded beforehand using the new maintenance script. The parsing part is also simplified; credit goes to mgutt [1], whose answer served as inspiration. [1] https://stackoverflow.com/a/9632782/3394281 Change-Id: I006de79ca6217ec7da662a04b5baff000058c98c --- M .gitignore M includes/Constants.php M includes/alloweddomains/EmailDomain.php A maintenance/updatePublicSuffixArray.php A tests/phpunit/includes/alloweddomains/EmailDomainTest.php 5 files changed, 155 insertions(+), 107 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/GoogleLogin refs/changes/33/363033/1 diff --git a/.gitignore b/.gitignore index d62d8a9..8a3f223 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ composer.phar cache composer.lock +publicSuffixArray.php \ No newline at end of file diff --git a/includes/Constants.php b/includes/Constants.php index 710b7e8..c7055da 100644 --- a/includes/Constants.php +++ b/includes/Constants.php @@ -10,4 +10,5 @@ class Constants { const SERVICE_ALLOWED_DOMAINS_STORE = 'GoogleLogin.AllowedDomainsStore'; + const PUBLIC_SUFFIX_ARRAY_FILE = 'publicSuffixArray.php'; } diff --git a/includes/alloweddomains/EmailDomain.php b/includes/alloweddomains/EmailDomain.php index cd9c4fb..e483e13 100644 --- a/includes/alloweddomains/EmailDomain.php +++ b/includes/alloweddomains/EmailDomain.php @@ -2,7 +2,7 @@ namespace GoogleLogin\AllowedDomains; -use MWException; +use GoogleLogin\Constants; /** * Represents a single E-Mail address.
@@ -11,7 +11,12 @@ */ class EmailDomain { private $emailAddress; - private $domainHost; + private $domainHost = ''; + + /** + * @var array + */ + private $publicSuffixes; /** * EmailDomain constructor. @@ -21,8 +26,14 @@ * t...@test.example.com will be converted to example.com if this is false) */ public function __construct( $mail, $strict = false ) { + $this->publicSuffixes = + array_flip( include __DIR__ . '/../../' . Constants::PUBLIC_SUFFIX_ARRAY_FILE ); + $this->emailAddress = $mail; - $this->domainHost = $this->parseHost( $mail, $strict ); + $domain = explode( '@', $mail ); + if ( isset( $domain[1] ) ) { + $this->domainHost = $this->parseHost( $domain[1], $strict ); + } } /** @@ -46,121 +57,32 @@ * Returns the domain and tld (without subdomains) of the provided E-Mailadress * @param string $domain The domain part of the email address to extract from. * @return string The Tld and domain of $domain without subdomains - * @see http://www.programmierer-forum.de/domainnamen-ermitteln-t244185.htm */ private function parseHost( $domain = '', $strict ) { - $dir = __DIR__ . "/../.."; if ( $strict ) { - $domain = explode( '@', $domain ); // we can trust google to give us only valid email address, so give the last element - return array_pop( $domain ); + return $domain; } - // for parse_url() - $domain = - !isset( $domain[5] ) || - ( - $domain[3] != ':' && - $domain[4] != ':' && - $domain[5] != ':' - ) ? 'http://' . $domain : $domain; - // remove "/path/file.html", "/:80", etc. - $domain = parse_url( $domain, PHP_URL_HOST ); - // separate domain level - $lvl = explode( '.', $domain ); // 0 => www, 1 => example, 2 => co, 3 => uk - // set levels - krsort( $lvl ); // 3 => uk, 2 => co, 1 => example, 0 => www - $lvl = array_values( $lvl ); // 0 => uk, 1 => co, 2 => example, 3 => www - $_1st = $lvl[0]; - $_2nd = isset( $lvl[1] ) ? $lvl[1] . '.' . $_1st : false; - $_3rd = isset( $lvl[2] ) ? $lvl[2] . '.' . $_2nd : false; - $_4th = isset( $lvl[3] ) ? $lvl[3] . '.' . 
$_3rd : false; - // tld extract - if ( !file_exists( "$dir/cache/tld.txt" ) ) { - $this->createTLDCache( "$dir/cache/tld.txt" ); - } - require "$dir/cache/tld.txt"; - $tlds = array_flip( $tlds ); - if ( // fourth level is TLD - $_4th && - !isset( $tlds[ '!' . $_4th ] ) && - ( - isset( $tlds[ $_4th ] ) || - isset( $tlds[ '*.' . $_3rd ] ) - ) - ) { - $domain = isset( $lvl[4] ) ? $lvl[4] . '.' . $_4th : false; - } elseif ( // third level is TLD - $_3rd && - !isset( $tlds[ '!' . $_3rd ] ) && - ( - isset( $tlds[ $_3rd ] ) || - isset( $tlds[ '*.' . $_2nd ] ) - ) - ) { - $domain = $_4th; - } elseif ( // second level is TLD - !isset( $tlds[ '!' . $_2nd ] ) && - ( - isset( $tlds[ $_2nd ] ) || - isset( $tlds[ '*.' . $_1st ] ) - ) - ) { - $domain = $_3rd; - } else { // first level is TLD - $domain = $_2nd; - } - return $domain; + $url = explode( '.', $domain ); + + return $this->getDomainPart( $url ); } /** - * Creates the TLD cache from which the valid tld of mail domain comes from. - * @param string $cacheFile The file to create the cache too (must be writeable for the - * webserver!) - * @param int $max_tl How deep the domain list is (enclude example.co.uk (2) or - * example.lib.wy.us (3)?) - * @see http://www.programmierer-forum.de/domainnamen-ermitteln-t244185.htm - * @throws MWException + * @param $url + * @return string */ - private function createTLDCache( $cacheFile, $max_tl = 2 ) { - $cacheFolder = str_replace( basename( $cacheFile ), '', $cacheFile ); - if ( !is_writable( $cacheFolder ) ) { - throw new MWException( $cacheFolder . ' is not writeable!' ); - } - $tlds = file( - 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1' - ); - if ( $tlds === false ) { - throw new MWException( 'Domainlist can not be downloaded!' 
); - } - $i = 0; - // remove unnecessary lines - foreach ( $tlds as $tld ) { - $tlds[ $i ] = trim( $tld ); - /** - * empty - * comments - * top level domains - * is overboard - */ - if ( - !$tlds[ $i ] || - $tld[0] == '/' || - strpos( $tld, '.' ) === false || - substr_count( $tld, '.' ) >= $max_tl - ) { - unset( $tlds[ $i ] ); + private function getDomainPart( $url ) { + $parts = array_reverse( $url ); + foreach ( $parts as $key => $part ) { + $tld = implode( '.', $parts ); + if ( isset( $this->publicSuffixes[$tld] ) ) { + return implode( '.', array_slice( $url, $key - 1 ) ); } - $i++; + array_pop( $parts ); } - $tlds = array_values( $tlds ); - file_put_contents( - $cacheFile, - "<?php\n" . '$tlds = ' . str_replace( - [ ' ', "\n" ], - '', - var_export( $tlds, true ) - ) . ";\n?" . ">" - ); + + return implode( '.', $url ); } } diff --git a/maintenance/updatePublicSuffixArray.php b/maintenance/updatePublicSuffixArray.php new file mode 100644 index 0000000..0644ebf --- /dev/null +++ b/maintenance/updatePublicSuffixArray.php @@ -0,0 +1,66 @@ +<?php +/** + * Update the public suffix array file used by GoogleLogin for domain recognition + * + * @ingroup Maintenance + */ + +require_once getenv( 'MW_INSTALL_PATH' ) !== false + ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php' + : __DIR__ . '/../../../maintenance/Maintenance.php'; + +/** + * Maintenance script that updates the public suffix array. + * + * @ingroup Maintenance + */ +class UpdatePublicSuffixArray extends Maintenance { + public function __construct() { + parent::__construct(); + + $this->mDescription = 'Updates the list of public suffixes used for domain recognition.'; + $this->requireExtension( 'GoogleLogin' ); + } + + public function execute() { + $arrayDirectory = __DIR__ . '/../'; + if ( !is_writable( $arrayDirectory ) ) { + throw new MWException( $arrayDirectory . ' is not writeable!'
); + } + $publicSuffixList = file( + 'https://publicsuffix.org/list/public_suffix_list.dat' + ); + if ( $publicSuffixList === false ) { + throw new MWException( 'Domainlist can not be downloaded!' ); + } + $publicSuffixes = []; + + foreach ( $publicSuffixList as $suffix ) { + $suffix = trim( $suffix ); + + if ( !$suffix || strpos( $suffix, '/' ) === 0 ) { + continue; + } + if ( strpos( $suffix, '*.' ) !== false ) { + $suffix = substr( $suffix, 2 ); + } + if ( strpos( $suffix, '!' ) === 0 ) { + $suffix = substr( $suffix, 1 ); + } + $suffix = implode( '.', array_reverse( + explode( + '.', + $suffix + ) + ) ); + $publicSuffixes[] = $suffix; + } + + file_put_contents( + $arrayDirectory . \GoogleLogin\Constants::PUBLIC_SUFFIX_ARRAY_FILE, + "<?php\n" . 'return [ "' . implode( "\",\n\"", $publicSuffixes ) . '" ];' + ); + } +} +$maintClass = 'UpdatePublicSuffixArray'; // Tells it to run the class +require_once RUN_MAINTENANCE_IF_MAIN; \ No newline at end of file diff --git a/tests/phpunit/includes/alloweddomains/EmailDomainTest.php b/tests/phpunit/includes/alloweddomains/EmailDomainTest.php new file mode 100644 index 0000000..798a8f5 --- /dev/null +++ b/tests/phpunit/includes/alloweddomains/EmailDomainTest.php @@ -0,0 +1,58 @@ +<?php + +namespace GoogleLogin\AllowedDomains; + +class EmailDomainTest extends \MediaWikiTestCase { + /** + * @var EmailDomain + */ + private $googleMail; + + /** + * @var EmailDomain + */ + private $subdomainMail; + + /** + * @var EmailDomain + */ + private $emptyMail; + + /** + * @var EmailDomain + */ + private $twoSuffixMail; + + public function setUp() { + parent::setUp(); + $this->googleMail = new EmailDomain( 't...@gmail.com', false ); + $this->emptyMail = new EmailDomain( '', false ); + $this->subdomainMail = new EmailDomain( 't...@my.subdomain.com', false ); + $this->twoSuffixMail = new EmailDomain( 't...@my.subdomain.co.us', false ); + } + + public function testGetEMail() { + $this->assertEquals( 't...@gmail.com', 
$this->googleMail->getEmail() ); + $this->assertEquals( '', $this->emptyMail->getEmail() ); + $this->assertEquals( 't...@my.subdomain.com', $this->subdomainMail->getEmail() ); + $this->assertEquals( 't...@my.subdomain.co.us', $this->twoSuffixMail->getEmail() ); + } + + public function testGetHost() { + $this->assertEquals( '', $this->emptyMail->getHost() ); + $this->assertEquals( 'gmail.com', $this->googleMail->getHost() ); + $this->assertEquals( 'subdomain.com', $this->subdomainMail->getHost() ); + $this->assertEquals( 'subdomain.co.us', $this->twoSuffixMail->getHost() ); + } + + public function testGetHostStrict() { + $emptyMailStrict = new EmailDomain( '', true ); + $googleMailStrict = new EmailDomain( 't...@gmail.com', true ); + $subdomainMailStrict = new EmailDomain( 't...@my.subdomain.com', true ); + $twoSuffixMailStrict = new EmailDomain( 't...@my.subdomain.co.us', true ); + $this->assertEquals( '', $emptyMailStrict->getHost() ); + $this->assertEquals( 'gmail.com', $googleMailStrict->getHost() ); + $this->assertEquals( 'my.subdomain.com', $subdomainMailStrict->getHost() ); + $this->assertEquals( 'my.subdomain.co.us', $twoSuffixMailStrict->getHost() ); + } +} -- To view, visit https://gerrit.wikimedia.org/r/363033 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I006de79ca6217ec7da662a04b5baff000058c98c Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/GoogleLogin Gerrit-Branch: master Gerrit-Owner: Florianschmidtwelzow <florian.schmidt.stargatewis...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits