A url like this might give accurate results:
http://markmail.org/browse?q=list:wso2+from:[EMAIL PROTECTED]:jonathan.
Here's an updated stylesheet. The username has to be the alias (e.g.
jonathan) rather than the full name (e.g. Jonathan Marsh). The other
operations seem to work fine on the alias too.
The XSLT version also does sorting and totaling now.
Jonathan Marsh - http://www.wso2.com - http://auburnmarshes.spaces.live.com
> -----Original Message-----
> From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED]
> On Behalf Of Samisa Abeysinghe
> Sent: Friday, May 23, 2008 10:24 AM
> To: [email protected]
> Subject: Re: [mashup-dev] Regex with Scrapers
>
> Jonathan Marsh wrote:
> > Here's something to get you going, which scrapes the pages in two
> different
> > ways:
> > - using regexp (slightly more elaborate than yours) (no sorting, no
> > totaling)
> > - using XSLT (sorting, no totaling)
> >
> > Interestingly, the XSLT version seems substantially faster, even with
> all
> > the parsing that is entailed.
> >
> > P.S. To me the "checkins" reported by markmail don't seem correct,
> way too
> > low.
> >
>
> Yes, that is because of infra problems I suspect, we seem to have
> missed
> check-in mails from time to time.
> I too noticed this and I aggregated mails, check-ins and Jira to check
> if I get the total number right, and they tallied, so I suppose my
> logic is correct :)
>
> Samisa...
>
> > Jonathan Marsh - http://www.wso2.com -
> http://auburnmarshes.spaces.live.com
> >
> >
> >> -----Original Message-----
> >> From: [EMAIL PROTECTED] [mailto:mashup-dev-
> [EMAIL PROTECTED]
> >> On Behalf Of Samisa Abeysinghe
> >> Sent: Monday, May 19, 2008 2:07 PM
> >> To: [EMAIL PROTECTED]; [email protected]
> >> Subject: Re: [mashup-dev] Regex with Scrapers
> >>
> >> Keith Chapman wrote:
> >>
> >>> Hi Samisa,
> >>>
> >>> Yes the scraper supports regex. We haven't got a specific sample
> for
> >>> this in the Mashup Server. You can use regex in the scraper config
> as
> >>> explained in http://web-harvest.sourceforge.net/manual.php#regexp
> >>>
> >> I will look into that.
> >>
> >> What I want is to implement [1] with Mashups server. As you can see,
> >> with PHP, I scrape and then sort and display.
> >> I want to scrape and sort with my Mashup service :)
> >>
> >> Samisa...
> >>
> >> [1] http://ww2.wso2.org/~samisa/wso2_mailing_lists.phps
> >>
> >>
> >>> Thanks,
> >>> Keith.
> >>>
> >>>
> >>> Samisa Abeysinghe wrote:
> >>>
> >>>> Can I use a regex when scraping html content?
> >>>>
> >>>> What I have seen in samples is XSLT.
> >>>>
> >>>> Thanks,
> >>>> Samisa...
> >>>>
> >>>>
> >>> _______________________________________________
> >>> Mashup-dev mailing list
> >>> [email protected]
> >>> http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev
> >>> -------------------------------------------------------------------
> --
> >>>
> >> ---
> >>
> >>> No virus found in this incoming message.
> >>> Checked by AVG.
> >>> Version: 8.0.100 / Virus Database: 269.23.20/1453 - Release Date:
> >>>
> >> 5/18/2008 9:31 AM
> >>
> >> --
> >> Samisa Abeysinghe
> >> Director, Engineering; WSO2 Inc.
> >>
> >> http://www.wso2.com/ - "The Open Source SOA Company"
> >>
> >>
> >> _______________________________________________
> >> Mashup-dev mailing list
> >> [email protected]
> >> http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev
> >>
> >> --------------------------------------------------------------------
> ----
> >>
> >> _______________________________________________
> >> Mashup-dev mailing list
> >> [email protected]
> >> http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev
> >> --------------------------------------------------------------------
> ----
> >>
> >>
> >> No virus found in this incoming message.
> >> Checked by AVG.
> >> Version: 8.0.100 / Virus Database: 269.24.0/1459 - Release Date:
> 5/21/2008 5:34 PM
>
>
> --
> Samisa Abeysinghe
> Director, Engineering; WSO2 Inc.
>
> http://www.wso2.com/ - "The Open Source SOA Company"
>
>
> _______________________________________________
> Mashup-dev mailing list
> [email protected]
> http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev
/*
* Copyright 2008 WSO2, Inc. http://www.wso2.org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
Created 2008-05 Jonathan Marsh; [EMAIL PROTECTED]
*/
/**
 * Summarizes a user's mailing-list activity by scraping three markmail.org
 * browse pages with a regular expression (E4X / ECMAScript-for-XML source).
 *
 * Three queries are issued in turn, each through a fresh Scraper built from
 * the same config object: messages not matching "subject:jira", messages
 * matching an "author:" query (check-ins), and messages matching
 * "subject:jira". No sorting or totaling is done here.
 *
 * NOTE(review): every occurrence of "[EMAIL PROTECTED]" in this function is
 * residue of the mail archive's address obfuscation, which overwrote real
 * code. The three assignments of the form config["var-def"[EMAIL PROTECTED]
 * presumably set the url attribute of the <http> element inside the config
 * (its placeholder value is 'filled-in-later') -- TODO restore from the
 * original attachment. As shown, those lines are not valid syntax.
 *
 * @param user  value concatenated into the query URLs; also emitted as the
 *              "user" attribute of the result.
 * @returns an E4X <summary> element with "user" and "elapsed" attributes
 *          (elapsed = difference of two Date millisecond values) and three
 *          <filter> children of type "normal", "checkins" and "jiras".
 */
function summarizeActivity(user) {
// Converts a scraped string of the form "name:count,name:count,..." into an
// XMLList of <list name=... count=.../> elements. Only the first two
// colon-separated fields of each entry are used.
function scrape2xmllist(scrape) {
var lists = scrape.split(",");
// Start from an empty XMLList; += appends one element per entry.
var response = <></>;
for (var i in lists) {
// The result template in the config below ends every match with a comma,
// so the split yields an empty entry after the last one; skip empties.
if (lists[i] != "") {
var item = lists[i].split(":");
response += <list name={item[0]} count={item[1]}/>;
}
}
return response;
}
// Captured up front so the "elapsed" attribute covers all three scrapes.
var timestamp = new Date().valueOf();
var scraper;
// Scraper configuration: defines a variable named "summary" whose value is
// built by applying the regexp pattern to the result of an HTTP GET, emitting
// "<group 1>:<group 2>," per match. The pattern captures the text of a link
// inside a table cell and the digits in the cell that follows it.
// Scraper itself is not defined in this file -- presumably a Mashup Server
// host object; verify.
var config =
<config>
<var-def name='summary'>
<regexp>
<regexp-pattern><![CDATA[<td[^>]*><a[^>]*>(.*?)</a></td><td[^>]*>(\d+)</td>]]></regexp-pattern>
<regexp-source>
<http method='get' url='filled-in-later' />
</regexp-source>
<regexp-result>
<template>${_1}:${_2},</template>
</regexp-result>
</regexp>
</var-def>
</config>;
// Query 1: messages from the user, excluding subjects matching "jira".
// NOTE(review): left-hand side is damaged by address obfuscation (see header).
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " -subject:jira";
scraper = new Scraper(config);
var normal = scrape2xmllist(scraper.summary);
// Query 2: check-ins attributed to the user via "author:".
// NOTE(review): the "from:" value inside this string literal was also
// replaced by the obfuscation; the original address is not recoverable here.
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:[EMAIL PROTECTED] author:" + user;
scraper = new Scraper(config);
var checkins= scrape2xmllist(scraper.summary);
// Query 3: messages from the user whose subject matches "jira".
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " subject:jira";
scraper = new Scraper(config);
var jiras = scrape2xmllist(scraper.summary);
// Assemble the result; each XMLList is embedded as the children of its filter.
var summary = <summary user={user} elapsed={new Date().valueOf() - timestamp}>
<filter type="normal">{normal}</filter>
<filter type="checkins">{checkins}</filter>
<filter type="jiras">{jiras}</filter>
</summary>;
return summary;
}
/**
 * Summarizes a user's mailing-list activity by scraping three markmail.org
 * browse pages, converting each page from HTML to XML and transforming it
 * with an embedded XSLT 1.0 stylesheet (E4X / ECMAScript-for-XML source).
 *
 * The stylesheet emits one <filter> element per page whose "total" attribute
 * is the sum of all tr/td cells that do not contain an <a>, and which holds
 * one <list name=... count=.../> per tr/td cell that does contain an <a>,
 * sorted numerically in descending order by the following sibling cell.
 *
 * NOTE(review): every occurrence of "[EMAIL PROTECTED]" in this function is
 * residue of the mail archive's address obfuscation, which overwrote real
 * code. The config["var-def"[EMAIL PROTECTED] assignments presumably set the
 * url attribute of the <http> element in the config, and the obfuscated
 * operands/attributes further down presumably read the "total" attribute of
 * the normal / checkins / jiras results -- TODO restore from the original
 * attachment. As shown, those lines are not valid syntax.
 *
 * @param user  value concatenated into the query URLs; also emitted as the
 *              "user" attribute of the result.
 * @returns an E4X <summary> element with "user", "elapsed" (difference of
 *          two Date millisecond values) and "total" attributes, and three
 *          <filter> children of type "normal", "checkins" and "jiras".
 */
function summarizeActivityXSLT(user) {
// Captured up front so the "elapsed" attribute covers all three scrapes.
var timestamp = new Date().valueOf();
var scraper;
// Scraper configuration: defines a variable named "summary" produced by
// running the stylesheet over the html-to-xml conversion of an HTTP GET.
// Scraper itself is not defined in this file -- presumably a Mashup Server
// host object; verify.
var config =
<config>
<var-def name='summary'>
<xslt>
<xml>
<html-to-xml>
<http method='get' url='filled-in-later' />
</html-to-xml>
</xml>
<stylesheet><![CDATA[
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" omit-xml-declaration="yes"/>
<xsl:template match="/">
<filter>
<xsl:attribute name="total"><xsl:value-of select="sum(//tr/td[not(a)])"/></xsl:attribute>
<xsl:for-each select="//tr/td[a]">
<xsl:sort select="following-sibling::td" data-type="number" order="descending"/>
<list name="{a}" count="{following-sibling::td}"/>
</xsl:for-each>
</filter>
</xsl:template>
</xsl:stylesheet>
]]></stylesheet>
</xslt>
</var-def>
</config>;
// Query 1: messages from the user, excluding subjects matching "jira".
// NOTE(review): left-hand side is damaged by address obfuscation (see header).
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " -subject:jira";
scraper = new Scraper(config);
// The scraper result is parsed into an E4X XML object (the <filter> element
// emitted by the stylesheet).
var normal = new XML(scraper.summary);
// Query 2: check-ins attributed to the user via "author:".
// NOTE(review): the "from:" value inside this string literal was also
// replaced by the obfuscation; the original address is not recoverable here.
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:[EMAIL PROTECTED] author:" + user;
scraper = new Scraper(config);
var checkins = new XML(scraper.summary);
// Query 3: messages from the user whose subject matches "jira".
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " subject:jira";
scraper = new Scraper(config);
var jiras = new XML(scraper.summary);
// Overall total: two values added and one subtracted. NOTE(review): the three
// operands were destroyed by the obfuscation, so which result is subtracted
// cannot be determined from this text -- confirm against the original.
var total = parseInt([EMAIL PROTECTED]) + parseInt([EMAIL PROTECTED]) - parseInt([EMAIL PROTECTED]);
// Re-wrap each result: the children (.*) of every scraped <filter> are copied
// into a new <filter> tagged with its type. NOTE(review): the obfuscated
// token in each start tag presumably carried a total attribute -- confirm.
var summary = <summary user={user} elapsed={new Date().valueOf() - timestamp} total={total}>
<filter type="normal" [EMAIL PROTECTED]>{normal.*}</filter>
<filter type="checkins" [EMAIL PROTECTED]>{checkins.*}</filter>
<filter type="jiras" [EMAIL PROTECTED]>{jiras.*}</filter>
</summary>;
return summary;
// NOTE(review): the run of underscores fused onto the closing brace below is
// the mailing-list footer separator, not part of the program.
}_______________________________________________
Mashup-dev mailing list
[email protected]
http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev