Dr0ptp4kt has uploaded a new change for review. https://gerrit.wikimedia.org/r/196298
Change subject: Address nested parentheses in Share a Fact ...................................................................... Address nested parentheses in Share a Fact Additionally, trim whitespace before semicolons. Finally, use autoreleasepools, as a reviewer had requested. Change-Id: I3c26877c7e4e220e84af9f192423384ea3b5c64e --- M WikipediaUnitTests/NSString+WMFHTMLParsingTests.m M wikipedia/Categories/NSString+WMFHTMLParsing.m 2 files changed, 112 insertions(+), 93 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/apps/ios/wikipedia refs/changes/98/196298/1 diff --git a/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m b/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m index 698180d..f4dd719 100644 --- a/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m +++ b/WikipediaUnitTests/NSString+WMFHTMLParsingTests.m @@ -33,8 +33,10 @@ } - (void)testAdequateSnippet { - NSString *string = @"<p>Dog (woof) [horse] adequately long string</p>"; - XCTAssertEqualObjects([string wmf_getStringSnippetWithoutHTML], @"Dog adequately long string"); + NSString *string = @"<p>Dog (woof (w00t)) [horse] adequately long string historically 40 characters.</p>"; + NSString *result = [string wmf_getStringSnippetWithoutHTML]; + XCTAssertEqualObjects([string wmf_getStringSnippetWithoutHTML], + @"Dog adequately long string historically 40 characters."); } @end diff --git a/wikipedia/Categories/NSString+WMFHTMLParsing.m b/wikipedia/Categories/NSString+WMFHTMLParsing.m index 9c06619..0e5663b 100644 --- a/wikipedia/Categories/NSString+WMFHTMLParsing.m +++ b/wikipedia/Categories/NSString+WMFHTMLParsing.m @@ -42,100 +42,117 @@ } + (NSString*)wmf_stringSnippetSimplifiedInString:(NSString*)string { - NSString* result = [string stringByReplacingOccurrencesOfString:@"&amp;" withString:@"&"]; - NSError* err = nil; - NSRegularExpression* newlinesRegex = [NSRegularExpression - regularExpressionWithPattern:@"\n{2,}" - options:0 - error:&err]; - NSRange range = NSMakeRange(0, result.length); - result 
= [newlinesRegex stringByReplacingMatchesInString:result + @autoreleasepool { + NSString* result = [string stringByReplacingOccurrencesOfString:@"&amp;" withString:@"&"]; + NSError* err = nil; + NSRegularExpression* newlinesRegex = [NSRegularExpression + regularExpressionWithPattern:@"\n{2,}" + options:0 + error:&err]; + NSRange range = NSMakeRange(0, result.length); + result = [newlinesRegex stringByReplacingMatchesInString:result + options:0 + range:range + withTemplate:@"\n"]; + + + // We probably don't want to try to handle ideographic parens + err = nil; + NSRegularExpression* parensRegex = [NSRegularExpression + regularExpressionWithPattern:@"[(][^()]+[)]" + options:0 + error:&err]; + + result = [NSString wmf_recursivelyUpdateString:result withRegex:parensRegex]; + + // Nor do we want to try to handle ideographic brackets + err = nil; + NSRegularExpression* bracketsRegex = [NSRegularExpression + regularExpressionWithPattern:@"\\[[^]]+]" + options:0 + error:&err]; + + range = NSMakeRange(0, result.length); + result = [bracketsRegex stringByReplacingMatchesInString:result + options:0 + range:range + withTemplate:@""]; + + // Unlike parens and brackets and unlike doubled up space in general, + // we do not want whitespace preceding the comma, ideographic comma, + // or semicolon + err = nil; + NSRegularExpression* whitespaceCommaSemicolonRegex = [NSRegularExpression + regularExpressionWithPattern:@"\\s+([,、;])" options:0 - range:range - withTemplate:@"\n"]; - - - // We probably don't want to try to handle ideographic parens - err = nil; - NSRegularExpression* parensRegex = [NSRegularExpression - regularExpressionWithPattern:@"[(][^)]+[)]" - options:0 - error:&err]; - range = NSMakeRange(0, result.length); - result = [parensRegex stringByReplacingMatchesInString:result - options:0 - range:range - withTemplate:@""]; - - // Nor do we want to try to handle ideographic brackets - err = nil; - NSRegularExpression* bracketsRegex = [NSRegularExpression - 
regularExpressionWithPattern:@"\\[[^]]+]" - options:0 - error:&err]; - range = NSMakeRange(0, result.length); - result = [bracketsRegex stringByReplacingMatchesInString:result - options:0 - range:range - withTemplate:@""]; - - // Unlike parens and brackets and unlike doubled up space in general, - // we do not want whitespace preceding the comma or ideographic comma - err = nil; - NSRegularExpression* whitespaceCommaRegex = [NSRegularExpression - regularExpressionWithPattern:@"\\s+([,、])" + error:&err]; + range = NSMakeRange(0, result.length); + result = [whitespaceCommaSemicolonRegex stringByReplacingMatchesInString:result + options:0 + range:range + withTemplate:@"$1"]; + + // Ideographic stops from TextExtracts, which were from OpenSearch + err = nil; + NSRegularExpression* whitespacePeriodRegex = [NSRegularExpression + regularExpressionWithPattern:@"\\s+([\\.|。|.|。])" + options:0 + error:&err]; + range = NSMakeRange(0, result.length); + result = [whitespacePeriodRegex stringByReplacingMatchesInString:result + options:0 + range:range + withTemplate:@"$1"]; + + // In practice, we rarely care about doubled up whitespace in the + // string except for the actual space character + err = nil; + NSRegularExpression* spacesRegex = [NSRegularExpression + regularExpressionWithPattern:@" {2,}" + options:0 + error:&err]; + range = NSMakeRange(0, result.length); + result = [spacesRegex stringByReplacingMatchesInString:result + options:0 + range:range + withTemplate:@" "]; + + // Note about trailing colon characters: they usually look strange if kept, + // and removing them (plus spaces and newlines) doesn't often create merged + // words that look bad - these are usually at tag boundaries. For Latinized + // langs sometimes this means words like "include" finish the snippet. + // But as a matter of markup structure, something like a <p> tag + // shouldn't be </p> closed until something like <ul>...</ul> is closed. 
+ // In fact, some sections have this layout, and some do not. + err = nil; + NSRegularExpression* leadingTrailingWhitespaceNewlineRegex = [NSRegularExpression + regularExpressionWithPattern:@"^[\\s\n]+|[\\s\n:]+$" options:0 - error:&err]; - range = NSMakeRange(0, result.length); - result = [whitespaceCommaRegex stringByReplacingMatchesInString:result - options:0 - range:range - withTemplate:@"$1"]; + error:&err]; + range = NSMakeRange(0, result.length); + result = [leadingTrailingWhitespaceNewlineRegex stringByReplacingMatchesInString:result + options:0 + range:range + withTemplate:@""]; + + return result; + } +} - // Ideographic stops from TextExtracts, which were from OpenSearch - err = nil; - NSRegularExpression* whitespacePeriodRegex = [NSRegularExpression - regularExpressionWithPattern:@"\\s+([\\.|。|.|。])" - options:0 - error:&err]; - range = NSMakeRange(0, result.length); - result = [whitespacePeriodRegex stringByReplacingMatchesInString:result - options:0 - range:range - withTemplate:@"$1"]; - - // In practice, we rarely care about doubled up whitespace in the - // string except for the actual space character - err = nil; - NSRegularExpression* spacesRegex = [NSRegularExpression - regularExpressionWithPattern:@" {2,}" - options:0 - error:&err]; - range = NSMakeRange(0, result.length); - result = [spacesRegex stringByReplacingMatchesInString:result - options:0 - range:range - withTemplate:@" "]; - - // Note about trailing colon characters: they usually look strange if kept, - // and removing them (plus spaces and newlines) doesn't often create merged - // words that look bad - these are usually at tag boundaries. For Latinized - // langs sometimes this means words like "include" finish the snippet. - // But as a matter of markup structure, something like a <p> tag - // shouldn't be </p> closed until something like <ul>...</ul> is closed. - // In fact, some sections have this layout, and some do not. 
- err = nil; - NSRegularExpression* leadingTrailingWhitespaceNewlineRegex = [NSRegularExpression - regularExpressionWithPattern:@"^[\\s\n]+|[\\s\n:]+$" - options:0 - error:&err]; - range = NSMakeRange(0, result.length); - result = [leadingTrailingWhitespaceNewlineRegex stringByReplacingMatchesInString:result - options:0 - range:range - withTemplate:@""]; - - return result; ++wmf_recursivelyUpdateString : (NSString*)string withRegex : (NSRegularExpression*)regex { + NSString* oldResult; + NSRange range; + @autoreleasepool { + do { + oldResult = [string copy]; + range = NSMakeRange(0, string.length); + string = [regex stringByReplacingMatchesInString:string + options:0 + range:range + withTemplate:@""]; + } while (![oldResult isEqualToString:string]); + return string; + } } @end -- To view, visit https://gerrit.wikimedia.org/r/196298 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3c26877c7e4e220e84af9f192423384ea3b5c64e Gerrit-PatchSet: 1 Gerrit-Project: apps/ios/wikipedia Gerrit-Branch: master Gerrit-Owner: Dr0ptp4kt <ab...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits