Mhurd has uploaded a new change for review.
https://gerrit.wikimedia.org/r/197823
Change subject: WIP: speed up html parse. Prune html before HPPLE parse.
......................................................................
WIP: speed up html parse. Prune html before HPPLE parse.
~50% faster on Obama article.
Change-Id: I3f407bbf7e75a130a8d4944f4dd75a35cf68bb88
---
M wikipedia/Networking/Fetchers/ArticleFetcher.m
1 file changed, 37 insertions(+), 24 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/apps/ios/wikipedia refs/changes/23/197823/1
diff --git a/wikipedia/Networking/Fetchers/ArticleFetcher.m b/wikipedia/Networking/Fetchers/ArticleFetcher.m
index a623786..fdbfc1b 100644
--- a/wikipedia/Networking/Fetchers/ArticleFetcher.m
+++ b/wikipedia/Networking/Fetchers/ArticleFetcher.m
@@ -99,11 +99,16 @@
return;
}
+ CFTimeInterval startTime = CACurrentMediaTime();
+
//[self applyResultsForLeadSection:leadSectionResults];
for (int n = 0; n < [self.article.sections count]; n++) {
(void)self.article.sections[n].images; // hack
[self createImageRecordsForSection:n];
}
+
+ CFTimeInterval elapsedTime = CACurrentMediaTime() - startTime;
+ NSLog(@"createImageRecordsForSection elapsedTime = %f",
elapsedTime);
[self associateThumbFromTempDirWithArticle];
@@ -232,8 +237,32 @@
[requestSerializer setValue:nil forHTTPHeaderField:@"X-MCCMNC"];
}
+- (NSString*)fastReduceToImgTagsOnlyFromHTML:(NSString*)html {
+ NSString* marker = @"<img ";
+ NSArray* stringsStartingWithImgTags = [html
componentsSeparatedByString:marker];
+ if (stringsStartingWithImgTags.count == 0) {
+ return @"";
+ }
+ NSMutableArray* output = [NSMutableArray
arrayWithCapacity:stringsStartingWithImgTags.count];
+ NSInteger iStart = [html hasPrefix:marker] ? 0 : 1; // Make it work even if html starts with "<img ".
+ NSInteger counter = 0;
+ for (NSInteger i = iStart; i < stringsStartingWithImgTags.count; i++) {
+ NSString* thisImgTagPlusStuff = [marker
stringByAppendingString:stringsStartingWithImgTags[i]];
+ NSRange endOfImgTag = [thisImgTagPlusStuff
rangeOfString:@">"];
+ output[counter++] = (endOfImgTag.location != NSNotFound) ?
+ [thisImgTagPlusStuff substringToIndex :
endOfImgTag.location + 1]
+ :
+ thisImgTagPlusStuff;
+ }
+ return [output componentsJoinedByString:@""];
+}
+
- (void)createImageRecordsForSection:(int)sectionId {
NSString* html = self.article.sections[sectionId].text;
+
+ // Brute force strip out non-img tags as quickly as possible.
+ // This reduction makes "searchWithXPathQuery" ~2x faster on large articles.
+ html = [self fastReduceToImgTagsOnlyFromHTML:html];
// Parse the section html extracting the image urls (in order)
// See: http://www.raywenderlich.com/14172/how-to-parse-html-on-ios
@@ -241,37 +270,21 @@
// Call *after* article record created but before section html sent across
bridge.
- // Reminder: don't do "context performBlockAndWait" here -
createImageRecordsForHtmlOnContext gets
- // called in a loop which is encompassed by such a block already!
-
if (html.length == 0) {
return;
}
- NSData* sectionHtmlData = [html
dataUsingEncoding:NSUTF8StringEncoding];
- TFHpple* sectionParser = [TFHpple
hppleWithHTMLData:sectionHtmlData];
- NSString* imageLinkElementsXpathQuery = @"//a[@class='image']";
- // ^ the navbox exclusion prevents images from the hidden navbox table
from appearing
- // in the last section's TOC cell.
+ NSData* sectionHtmlData = [html dataUsingEncoding:NSUTF8StringEncoding];
- NSArray* imageLinks = [sectionParser
searchWithXPathQuery:imageLinkElementsXpathQuery];
+ TFHpple* sectionParser = [TFHpple hppleWithHTMLData:sectionHtmlData];
+
+ NSString* imageElementsXpathQuery = @"//img[@src]";
+
+ NSArray* imageNodes = [sectionParser
searchWithXPathQuery:imageElementsXpathQuery];
+
NSUInteger imageIndexInSection = 0;
- for (TFHppleElement* linkNode in imageLinks) {
- NSInteger imageNodeIndex = [linkNode.children
indexOfObjectPassingTest:^BOOL (TFHppleElement* child, NSUInteger idx, BOOL*
stop) {
- if ([child.tagName isEqualToString:@"img"]) {
- *stop = YES;
- return YES;
- } else {
- return NO;
- }
- }];
- NSParameterAssert(imageNodeIndex != NSNotFound);
- if (imageNodeIndex == NSNotFound) {
- // TODO: handle this error somehow, for now, go to the next
linkNode
- continue;
- }
- TFHppleElement* imageNode = linkNode.children[imageNodeIndex];
+ for (TFHppleElement* imageNode in imageNodes) {
NSString* heightFromImgTag = imageNode.attributes[@"height"];
NSString* widthFromImgTag = imageNode.attributes[@"width"];
--
To view, visit https://gerrit.wikimedia.org/r/197823
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I3f407bbf7e75a130a8d4944f4dd75a35cf68bb88
Gerrit-PatchSet: 1
Gerrit-Project: apps/ios/wikipedia
Gerrit-Branch: master
Gerrit-Owner: Mhurd <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits