[
https://issues.apache.org/jira/browse/PIG-505?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12642269#action_12642269
]
David Ciemiewicz commented on PIG-505:
--------------------------------------
The problem is this - I think we've not done a good job of communicating this
assumption of full typing for Maps. In fact, it was the Pig 2.0 development
team that ported this user defined function from Pig 1.4.
The function is string.URLPARSE and it uses a generic Map<Object,Object> return
type rather than, say, Map<String,String> which would seem more appropriate.
Also, this function DOES have a defined outputSchema of DataType.MAP.
Maybe one of the solutions to working around this problem is making another
pass through the example user defined functions and making sure they are FULLY
2.0 compliant for types.
Here's the example code for string.URLPARSE. Please advise on what you think
the correct definition for the return type and the outputSchema should be.
Thanks.
package string;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.data.DataType;
/**
* string.URLPARSE parses a URL into the parts as described in RFC 1738 and RFC
1808.
*
* According to the RFCs, a URL may be described as:
*
* scheme://authority/path;params?query#fragment
* scheme://[EMAIL PROTECTED]:port/path;params?query#fragment
*
* ULRPARSE also supports parsing non-conformant ULRs that have scheme://
stripped.
*
* <dl>
* <dt><b>Parameters:</b></dt>
* <dd><code>url</code> - <code>String</code> url string to parse.</dd>
*
* <dt><b>Return Value:</b></dt>
* <dd><code>Map</code> parsed url</dd>
*
* <dt><b>Return Schema:</b></dt>
* <dd>urlparse</dd>
* <dd>#'scheme' String</dd>
* <dd>#'authority' String</dd>
* <dd>#'userinfo' String</dd>
* <dd>#'host' String</dd>
* <dd>#'port' String</dd>
* <dd>#'path' String</dd>
* <dd>#'params' String</dd>
* <dd>#'paramsFields' Map</dd>
* <dd>#'query' String</dd>
* <dd>#'queryFields' Map</dd>
* <dd>#'fragment' String</dd>
*
* <p>Note that none of the returned values are decoded so the values may
contain
* escaped characters such as %2F and +</p>
*
* <dt><b>Example:</b></dt>
* <dd><code>
* register string.jar;<br/>
* A = load 'mydata' using PigStorage() as ( url );<br/>
* B = foreach A generate url, string.URLPARSE(url) as parsedurl;
* C = foreach B generate url, parsedurl#'host' as host;
* </code></dd>
* </dl>
*
* @author David (Ciemo) Ciemiewicz
*/
public class URLPARSE extends EvalFunc<Map<Object, Object> > {
// @Override
static private String strOrNullStr(String str) {
return (str == null) ? "" : str;
};
static private Pattern scheme_rest =
Pattern.compile("^(([A-Za-z][A-Za-z0-9.+-]*):)?(.*)$");
static private Pattern authority_rest =
Pattern.compile("^((//)?((([^@:/]+)@)?([^:/]+)(:([0-9]+))?))(.*)$");
static private Pattern path_params_query_fragment =
Pattern.compile("^(/?[^;?#]*)?(;([^?#]*))?(\\?([^#]*))?(#(.*))?$");
static private Pattern field_value_pairs =
Pattern.compile("(([^=&]+)=([^=&]*))&*");
public Map<Object, Object> exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return null;
String url;
try{
url = (String)input.get(0);
} catch(Exception e){
System.out.println("Can't convert field to a string;
error = " + e.getMessage());
return null;
}
//Pattern p;
HashMap<Object, Object> output = new HashMap<Object, Object>();
Matcher m;
String scheme = "";
String authority = "";
String userinfo = "";
String host = "";
String port = "";
String path = "";
String params = "";
String query = "";
String fragment = "";
String rest = "";
if(url == null) {
return null;
}
m = scheme_rest.matcher(url);
if (m.find()) {
scheme = strOrNullStr(m.group(2));
rest = strOrNullStr(m.group(3));
m = authority_rest.matcher(rest);
if (m.find()) {
//String doubleslash = strOrNullStr(m.group(2));
authority = strOrNullStr(m.group(3));
userinfo = strOrNullStr(m.group(5));
host = strOrNullStr(m.group(6));
port = strOrNullStr(m.group(8));
rest = strOrNullStr(m.group(9));
m = path_params_query_fragment.matcher(rest);
if (m.find()) {
path = strOrNullStr(m.group(1));
params = strOrNullStr(m.group(3));
query = strOrNullStr(m.group(5));
fragment = strOrNullStr(m.group(7));
}
}
}
HashMap<Object, Object> paramsFieldsMap = new HashMap<Object,
Object>();
m = field_value_pairs.matcher(params);
while (m.find()) {
String field = strOrNullStr(m.group(2));
String value = strOrNullStr(m.group(3));
paramsFieldsMap.put(field, value);
}
HashMap<Object, Object> queryFieldsMap = new HashMap<Object,
Object>();
m = field_value_pairs.matcher(query);
while (m.find()) {
String field = strOrNullStr(m.group(2));
String value = strOrNullStr(m.group(3));
queryFieldsMap.put(field, value);
}
output.put("url", url);
output.put("scheme", scheme);
output.put("authority", authority);
output.put("userinfo", userinfo);
output.put("host", host);
output.put("port", port);
output.put("path", path);
output.put("params", params);
output.put("paramsFields", paramsFieldsMap);
output.put("query", query);
output.put("queryFields", queryFieldsMap);
output.put("fragment", fragment);
return output;
}
@Override
public Schema outputSchema(Schema input) {
return new Schema(new
Schema.FieldSchema(getSchemaName("urlparse", input), DataType.MAP));
}
}
> Lineage for UDFs that do not return bytearray
> ---------------------------------------------
>
> Key: PIG-505
> URL: https://issues.apache.org/jira/browse/PIG-505
> Project: Pig
> Issue Type: Bug
> Affects Versions: types_branch
> Reporter: Santhosh Srinivasan
> Assignee: Santhosh Srinivasan
> Fix For: types_branch
>
>
> In Pig-335, the lineage design states that UDFs that return bytearrays could
> cause problems in tracing the lineage. For UDFs that do not return bytearray,
> the lineage design should pick up the right load function to use as long as
> there is no ambiguity. In the current implementation, we could have issues
> with scripts like:
> {code}
> a = load 'input' as (field1);
> b = foreach a generate myudf_to_double(field1);
> c = foreach b generate $0 + 2.0;
> {code}
> When $0 has to be cast to a double, the lineage code will complain that it
> hit a UDF and hence cannot determine the right load function to use.
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.