BryanDavis has uploaded a new change for review. https://gerrit.wikimedia.org/r/172101
Change subject: Avoid glibc iconv bug by using mb_convert_encoding ...................................................................... Avoid glibc iconv bug by using mb_convert_encoding Iconv from Glibc 2.14 has a bug in handling the '//IGNORE' directive which causes valid characters following the first invalid charter to be skipped if the caller follows the documented semantics of their EILSEQ error code [0]. PHP has effectively WONTFIX'ed working around this bug in their iconv() implementation [1]. This makes the use of '//IGNORE' with iconv basically useless. Luckily there is a work alike solution documented by "orrd101 at gmail dot com" [2] that can be used to achieve the desired result of stripping invalid characters from a string. [0]: https://sourceware.org/bugzilla/show_bug.cgi?id=13541 [1]: https://bugs.php.net/bug.php?id=48147 [2]: http://php.net/manual/en/function.iconv.php#108643 Bug: 37665 Change-Id: Icf9e7deb73fdcea6ad074eb56153c5353064a929 --- M includes/Revision.php M includes/media/Exif.php M includes/media/IPTC.php M includes/media/JpegMetadataExtractor.php M includes/media/XMP.php M includes/upload/UploadBase.php M languages/Language.php 7 files changed, 31 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/01/172101/1 diff --git a/includes/Revision.php b/includes/Revision.php index e81ed75..0ef70b4 100644 --- a/includes/Revision.php +++ b/includes/Revision.php @@ -1313,8 +1313,7 @@ # Upconvert on demand. # ("utf8" checked for compatibility with some broken # conversion scripts 2008-12-30) - global $wgContLang; - $text = $wgContLang->iconv( $wgLegacyEncoding, 'UTF-8', $text ); + $text = Language::iconv( $wgLegacyEncoding, 'UTF-8', $text ); } return $text; diff --git a/includes/media/Exif.php b/includes/media/Exif.php index 018b58c..58c302b 100644 --- a/includes/media/Exif.php +++ b/includes/media/Exif.php @@ -471,17 +471,13 @@ break; } if ( $charset ) { - wfSuppressWarnings(); - $val = iconv( $charset, 'UTF-8//IGNORE', $val ); - wfRestoreWarnings(); + $val = Language::iconv( $charset, 'UTF-8', $val ); } else { // if valid utf-8, assume that, otherwise assume windows-1252 $valCopy = $val; UtfNormal::quickIsNFCVerify( $valCopy ); //validates $valCopy. if ( $valCopy !== $val ) { - wfSuppressWarnings(); - $val = iconv( 'Windows-1252', 'UTF-8//IGNORE', $val ); - wfRestoreWarnings(); + $val = Language::iconv( 'Windows-1252', 'UTF-8', $val ); } } diff --git a/includes/media/IPTC.php b/includes/media/IPTC.php index 478249f..f9189f4 100644 --- a/includes/media/IPTC.php +++ b/includes/media/IPTC.php @@ -445,9 +445,7 @@ */ private static function convIPTCHelper( $data, $charset ) { if ( $charset ) { - wfSuppressWarnings(); - $data = iconv( $charset, "UTF-8//IGNORE", $data ); - wfRestoreWarnings(); + $data = Language::iconv( $charset, 'UTF-8', $data ); if ( $data === false ) { $data = ""; wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" ); diff --git a/includes/media/JpegMetadataExtractor.php b/includes/media/JpegMetadataExtractor.php index 8c5b46b..21cdcad 100644 --- a/includes/media/JpegMetadataExtractor.php +++ b/includes/media/JpegMetadataExtractor.php @@ -102,9 +102,7 @@ // turns $com to valid utf-8. // thus if no change, its utf-8, otherwise its something else. if ( $com !== $oldCom ) { - wfSuppressWarnings(); - $com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom ); - wfRestoreWarnings(); + $com = $oldCom = Language::iconv( 'windows-1252', 'UTF-8', $oldCom ); } // Try it again, if its still not a valid string, then probably // binary junk or some really weird encoding, so don't extract. diff --git a/includes/media/XMP.php b/includes/media/XMP.php index cdbd5ab..b119822 100644 --- a/includes/media/XMP.php +++ b/includes/media/XMP.php @@ -300,9 +300,7 @@ } if ( $this->charset !== 'UTF-8' ) { //don't convert if already utf-8 - wfSuppressWarnings(); - $content = iconv( $this->charset, 'UTF-8//IGNORE', $content ); - wfRestoreWarnings(); + $content = Language::iconv( $this->charset, 'UTF-8', $content ); } $ok = xml_parse( $this->xmlParser, $content, $allOfIt ); diff --git a/includes/upload/UploadBase.php b/includes/upload/UploadBase.php index a278652..db8694e 100644 --- a/includes/upload/UploadBase.php +++ b/includes/upload/UploadBase.php @@ -1130,7 +1130,7 @@ } if ( $enc ) { - $chunk = iconv( $enc, "ASCII//IGNORE", $chunk ); + $chunk = Language::iconv( $enc, 'ASCII', $chunk ); } $chunk = trim( $chunk ); diff --git a/languages/Language.php b/languages/Language.php index fb04255..b1d8884 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2517,17 +2517,30 @@ * @param string $string * @return string */ - function iconv( $in, $out, $string ) { + public static function iconv( $in, $out, $string ) { # This is a wrapper for iconv in all languages except esperanto, # which does some nasty x-conversions beforehand - # Even with //IGNORE iconv can whine about illegal characters in - # *input* string. We just ignore those too. - # REF: http://bugs.php.net/bug.php?id=37166 - # REF: https://bugzilla.wikimedia.org/show_bug.cgi?id=16885 - wfSuppressWarnings(); - $text = iconv( $in, $out . '//IGNORE', $string ); - wfRestoreWarnings(); + if ( function_exists( 'mb_convert_encoding' ) ) { + // Bug 37665: work around problem with glibc's iconv that keeps + // '//IGNORE' from returning characters following an invalid + // byte sequence. + // REF: http://php.net/manual/en/function.iconv.php#108643 + // REF: https://sourceware.org/bugzilla/show_bug.cgi?id=13541 + // REF: https://bugs.php.net/bug.php?id=48147 + $oldSubst = mb_substitute_character(); + mb_substitute_character( 'none' ); + $text = mb_convert_encoding( $string, $out, $in ); + mb_substitute_character( $oldSubst ); + } else { + # Even with //IGNORE iconv can whine about illegal characters in + # *input* string. We just ignore those too. + # REF: http://bugs.php.net/bug.php?id=37166 + # REF: https://bugzilla.wikimedia.org/show_bug.cgi?id=16885 + wfSuppressWarnings(); + $text = iconv( $in, $out . '//IGNORE', $string ); + wfRestoreWarnings(); + } return $text; } @@ -2792,7 +2805,7 @@ return $s; } - return $this->iconv( $this->fallback8bitEncoding(), 'utf-8', $s ); + return self::iconv( $this->fallback8bitEncoding(), 'utf-8', $s ); } /** @@ -2957,7 +2970,7 @@ if ( $wgEditEncoding == '' || $wgEditEncoding == 'UTF-8' ) { return $s; } else { - return $this->iconv( 'UTF-8', $wgEditEncoding, $s ); + return self::iconv( 'UTF-8', $wgEditEncoding, $s ); } } @@ -2976,7 +2989,7 @@ if ( $enc == 'UTF-8' ) { return $s; } else { - return $this->iconv( $enc, 'UTF-8', $s ); + return self::iconv( $enc, 'UTF-8', $s ); } } -- To view, visit https://gerrit.wikimedia.org/r/172101 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Icf9e7deb73fdcea6ad074eb56153c5353064a929 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: BryanDavis <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
