jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/381229 )
Change subject: Skip simple strings in N3Quoter::escapeLiteral
......................................................................
Skip simple strings in N3Quoter::escapeLiteral
This makes the function about 50% faster on the first 5,000 Wikidata
entities. That makes the truthy dump about 7% faster in general.
This might have more or less effect on the entire truthy dump, but
I'm not willing to benchmark this in detail.
I've also added some more tests here that pass with both the old
and the new version.
Bug: T176844
Change-Id: I7141a58a022a98373c1390ee4e336fb9ee54f6c2
---
M src/N3Quoter.php
M tests/phpunit/N3QuoterTest.php
2 files changed, 23 insertions(+), 1 deletion(-)
Approvals:
Smalyshev: Looks good to me, but someone else must approve
jenkins-bot: Verified
Thiemo Mättig (WMDE): Looks good to me, approved
diff --git a/src/N3Quoter.php b/src/N3Quoter.php
index f7f1899..c264224 100644
--- a/src/N3Quoter.php
+++ b/src/N3Quoter.php
@@ -50,6 +50,12 @@
* @return string
*/
public function escapeLiteral( $s ) {
+ // Performance: If the entire string is just (a safe subset of)
ASCII, let it through.
+ // Ok are space (32), ! (33), # (35) - [ (91) and ] (93) to ~
(126), excludes " (34) and \ (92).
+ if ( preg_match( '/^[ !#-[\]-~]*\z/', $s ) ) {
+ return $s;
+ }
+
// String escapes. Note that the N3 spec is more restrictive
than the Turtle and TR
// specifications, see
<https://www.w3.org/TeamSubmission/n3/#escaping>
// and <https://www.w3.org/TR/turtle/#string>
diff --git a/tests/phpunit/N3QuoterTest.php b/tests/phpunit/N3QuoterTest.php
index e6ac69c..980c1ad 100644
--- a/tests/phpunit/N3QuoterTest.php
+++ b/tests/phpunit/N3QuoterTest.php
@@ -4,6 +4,7 @@
use PHPUnit_Framework_TestCase;
use Wikimedia\Purtle\N3Quoter;
+use Wikimedia\Purtle\UnicodeEscaper;
/**
* @covers Wikimedia\Purtle\N3Quoter
@@ -52,8 +53,16 @@
}
public function provideEscapeLiteral() {
+ $shortCircuitedChars = '
!#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ' .
+ '[]^_`abcdefghijklmnopqrstuvwxyz{|}~';
+
return [
[ 'Hello World', 'Hello World' ],
+ [ 'Hello#World', 'Hello#World' ],
+ [ 'Hello[<f>a+n%cy]World^', 'Hello[<f>a+n%cy]World^' ],
+ [ $shortCircuitedChars, $shortCircuitedChars ],
+ [ $shortCircuitedChars, $shortCircuitedChars, true ],
+ [ 'Hello"World', 'Hello\\"World' ],
[ "Hello\nWorld", 'Hello\nWorld' ],
[ "Hello\tWorld", 'Hello\tWorld' ],
[ 'Hällo Wörld', 'Hällo Wörld', false ],
@@ -70,7 +79,14 @@
$quoter = new N3Quoter();
$quoter->setEscapeUnicode( $escapeUnicode );
- $this->assertEquals( $expected, $quoter->escapeLiteral(
$literal ) );
+ $actual = $quoter->escapeLiteral( $literal );
+ $this->assertSame( $expected, $actual );
+
+ if ( $escapeUnicode ) {
+ // Make sure unicode escaping was correctly applied:
+ $escaper = new UnicodeEscaper();
+ $this->assertSame( $escaper->escapeString( $expected ),
$actual );
+ }
}
}
--
To view, visit https://gerrit.wikimedia.org/r/381229
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7141a58a022a98373c1390ee4e336fb9ee54f6c2
Gerrit-PatchSet: 4
Gerrit-Project: purtle
Gerrit-Branch: master
Gerrit-Owner: Hoo man <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Hoo man <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Thiemo Mättig (WMDE) <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits