Revision: 28120
http://sourceforge.net/p/bibdesk/svn/28120
Author: hofman
Date: 2022-12-15 17:26:56 +0000 (Thu, 15 Dec 2022)
Log Message:
-----------
Rewrite IACR web parser. The site pages have significantly changed. Get BibTeX
from the pre element of article pages. For search or day results, download
individual paper pages.
Modified Paths:
--------------
trunk/bibdesk/BDSKIACRParser.h
trunk/bibdesk/BDSKIACRParser.m
Modified: trunk/bibdesk/BDSKIACRParser.h
===================================================================
--- trunk/bibdesk/BDSKIACRParser.h 2022-12-15 15:14:02 UTC (rev 28119)
+++ trunk/bibdesk/BDSKIACRParser.h 2022-12-15 17:26:56 UTC (rev 28120)
@@ -37,8 +37,8 @@
*/
#import <Cocoa/Cocoa.h>
-#import "BDSKWebParser.h"
+#import "BDSKAsynchronousWebParser.h"
-@interface BDSKIACRParser : BDSKWebParser
+@interface BDSKIACRParser : BDSKAsynchronousWebParser
@end
Modified: trunk/bibdesk/BDSKIACRParser.m
===================================================================
--- trunk/bibdesk/BDSKIACRParser.m 2022-12-15 15:14:02 UTC (rev 28119)
+++ trunk/bibdesk/BDSKIACRParser.m 2022-12-15 17:26:56 UTC (rev 28120)
@@ -40,6 +40,7 @@
#import "BibItem.h"
#import "DOMNode_BDSKExtensions.h"
#import "NSURL_BDSKExtensions.h"
+#import "NSString_BDSKExtensions.h"
#import <AGRegex/AGRegex.h>
@@ -50,100 +51,102 @@
if ([url hasDomain:@"eprint.iacr.org"] == NO)
return NO;
- if ([[[url path] lowercaseString] isEqualToString:@"/cgi-bin/search.pl"])
+ if ([url hasFirstPathComponent:@"search"] || [url
hasFirstPathComponent:@"days"])
return YES;
-
+
AGRegex *absRegex = [AGRegex regexWithPattern:@"^/[0-9]{4}/[0-9]+$"];
if ([absRegex findInString:[url path]])
return YES;
+ DOMNode *node = [[domDocument documentElement] singleNodeForXPath:[self
citationNodeXPath]];
+
+ if (node)
+ return YES;
+
return NO;
}
++ (NSString *)citationNodeXPath { return @"./body//a[@class='paperlink']"; }
+
- (NSArray *)itemsReturningError:(NSError **)outError {
-
- NSMutableArray *items = [NSMutableArray array];
-
NSURL *url = [self URL];
+ DOMElement *rootElement = [[self domDocument] documentElement];
+ AGRegex *yrnRegex = [AGRegex regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"];
+ AGRegexMatch *yrnMatch = [yrnRegex findInString:[url path]];
+
+ if (yrnMatch) {
+ // individual article
+
+ DOMNode *bibtexNode = [rootElement
singleNodeForXPath:@"./body//pre[@id='bibtex']"];
+
+ if (bibtexNode) {
+ NSString *bibtexString = [bibtexNode stringValue];
+ BibItem *item = [[self itemsFromBibTeXString:bibtexString
error:NULL] firstObject];
+
+ if (item) {
+ NSString *baseURLString = [url absoluteString];
+ [item setField:BDSKUrlString toValue:nil];
+ [item addURLString:[baseURLString
stringByAppendingPathExtension:@"pdf"]];
+ [item addURLString:baseURLString];
+
+ return [NSArray arrayWithObjects:item, nil];
+ }
+ }
+
+ } else {
+ // search results or articles of previous days
+
+ NSString *paperNodeXPath = @"./body//a[@class='paperlink']";
+ NSArray *paperNodes = [rootElement nodesForXPath:[[self class]
citationNodeXPath]];
+
+ for (DOMNode *paperNode in paperNodes) {
+ NSString *path = [paperNode stringValueOfAttribute:@"href"];
+ yrnMatch = [yrnRegex findInString:path];
+ NSString *year = [yrnMatch groupAtIndex:1];
+ NSString *reportNum = [yrnMatch groupAtIndex:2];
+
+ NSURL *bibtexURL = [[NSURL URLWithString:path relativeToURL:url]
absoluteURL];
+ NSURLRequest *request = [NSURLRequest requestWithURL:bibtexURL];
+ NSDictionary *contextInfo = [NSDictionary
dictionaryWithObjectsAndKeys:[bibtexURL absoluteString], @"baseURLString", nil];
+
+ [self addDownloadWithRequest:request contextInfo:contextInfo];
+ }
+
+ }
+
+ return nil;
+}
- // is this a search results page or an individual article?
- BOOL isSearch = [[[url path] lowercaseString]
isEqualToString:@"/cgi-bin/search.pl"];
+- (NSArray *)itemsFromDownload:(BDSKCitationDownload *)download error:(NSError
**)outError {
+ NSXMLDocument *xmlDoc = [[[NSXMLDocument alloc] initWithData:[download
data] options:NSXMLDocumentTidyHTML error:outError] autorelease];
- // construct the source item(s) to parse
- NSArray *sources = nil;
- DOMElement *rootElement = [[self domDocument] documentElement];
- if (isSearch)
- sources = [rootElement nodesForXPath:@"./body//dt"];
- else
- sources = [NSArray arrayWithObjects:rootElement, nil];
-
- if ([sources count] == 0)
+ if (xmlDoc == nil)
return nil;
- DOMXPathExpression *titleNodePath = nil;
- DOMXPathExpression *authorNodePath = nil;
- DOMXPathExpression *pathToSearchNodePath = nil;
+ NSXMLNode *bibtexNode = [[[xmlDoc rootElement]
nodesForXPath:@"./body//pre[@id='bibtex']" error:NULL] firstObject];
- if (isSearch) {
- titleNodePath = [[self domDocument]
createExpression:@"following-sibling::dd/b" resolver:nil];
- authorNodePath = [[self domDocument]
createExpression:@"following-sibling::dd[position()=2]/em" resolver:nil];
- pathToSearchNodePath = [[self domDocument]
createExpression:@".//a/@href" resolver:nil];
- } else {
- titleNodePath = [[self domDocument] createExpression:@".//b"
resolver:nil];
- authorNodePath = [[self domDocument] createExpression:@".//i"
resolver:nil];
- }
-
- for (DOMNode *sourceNode in sources) {
-
- NSMutableDictionary *pubFields = [NSMutableDictionary
dictionary];
- NSArray *urlsArray = nil;
- NSString *pathToSearch = nil;
- DOMNode *node;
- NSString *string;
+ if (bibtexNode) {
+ NSString *baseURLString = [[download contextInfo]
objectForKey:@"baseURLString"];
+ AGRegex *yrnRegex = [AGRegex
regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"];
+ AGRegexMatch *yrnMatch = [yrnRegex findInString:[[NSURL
URLWithString:baseURLString] path]];
+ NSString *year = [yrnMatch groupAtIndex:1];
+ NSString *reportNum = [yrnMatch groupAtIndex:2];
- // set title
- node = [sourceNode singleNodeForXPathExpression:titleNodePath];
- if ((string = [node stringValue]))
- [pubFields setObject:string forKey:BDSKTitleString];
- // set authors
- node = [sourceNode singleNodeForXPathExpression:authorNodePath];
- if ((string = [node stringValue]))
- [pubFields setObject:string forKey:BDSKAuthorString];
- // to get year and report number
- if (pathToSearchNodePath) {
- node = [sourceNode
singleNodeForXPathExpression:pathToSearchNodePath];
- if ((string = [node stringValue]))
- pathToSearch = string;
- } else {
- pathToSearch = [url path];
+ NSString *bibtexString = [[bibtexNode stringValue]
stringByTrimmingCharactersInSet:[NSCharacterSet
whitespaceAndNewlineCharacterSet]];
+ BibItem *item = [[self itemsFromBibTeXString:bibtexString error:NULL]
firstObject];
+
+ if (item) {
+ [item setField:BDSKUrlString toValue:nil];
+ [item addURLString:[baseURLString
stringByAppendingPathExtension:@"pdf"]];
+ [item addURLString:baseURLString];
}
- // compute year and report number
- AGRegex *yrnRegex = [AGRegex
regexWithPattern:@"^/([0-9]{4})/([0-9]+)$"];
- AGRegexMatch *yrnMatch = [yrnRegex findInString:pathToSearch];
- NSString *year = [yrnMatch groupAtIndex:1];
- NSString *reportNum = [yrnMatch groupAtIndex:2];
- NSString *urlBaseString = [NSString stringWithFormat:@"%@://%@/%@/%@",
[url scheme], [url host], year, reportNum];
-
- // set year, report number, PDF url, eprint
- if ((year != nil) && (reportNum != nil)) {
- [pubFields setValue:year forKey:BDSKYearString];
- [pubFields setValue:[NSString
stringWithFormat:@"Cryptology ePrint Archive, Report %@/%@", year, reportNum]
forKey:@"Note"];
- urlsArray = [NSArray arrayWithObjects:
- [urlBaseString stringByAppendingPathExtension:@"pdf"],
urlBaseString, nil];
- [pubFields setValue:[NSString
stringWithFormat:@"\\url{%@}", urlBaseString] forKey:@"Eprint"];
- }
-
- // add item
- BibItem *item = [[BibItem alloc] initWithType:BDSKMiscString
citeKey:nil pubFields:pubFields URLStrings:urlsArray];
- [items addObject:item];
- [item release];
-
- }
-
- return items;
+ return [NSArray arrayWithObjects:item, nil];
+ }
+
+ return nil;
}
+ (NSString *)address { return @"https://eprint.iacr.org/"; }
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
_______________________________________________
Bibdesk-commit mailing list
bibdesk-commit@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bibdesk-commit