Hi
I am using LZO compression in our Hive scripts, but one script is still failing
with the error below.
Diagnostic Messages for this Task:
Error: java.io.IOException: java.io.EOFException: Premature EOF from inputStream
at
org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
at
org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
at
org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:243)
at
org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:522)
at
org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:160)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:381)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:334)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:152)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1332)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:147)
Caused by: java.io.EOFException: Premature EOF from inputStream
at
com.hadoop.compression.lzo.LzopInputStream.readFully(LzopInputStream.java:75)
at
com.hadoop.compression.lzo.LzopInputStream.readHeader(LzopInputStream.java:114)
at
com.hadoop.compression.lzo.LzopInputStream.<init>(LzopInputStream.java:54)
at
com.hadoop.compression.lzo.LzopCodec.createInputStream(LzopCodec.java:83)
at org.apache.hadoop.io.SequenceFile$Reader.init(SequenceFile.java:1871)
at
org.apache.hadoop.io.SequenceFile$Reader.initialize(SequenceFile.java:1765)
at
org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1714)
at
org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1728)
at
org.apache.hadoop.mapred.SequenceFileRecordReader.<init>(SequenceFileRecordReader.java:49)
at
org.apache.hadoop.mapred.SequenceFileInputFormat.getRecordReader(SequenceFileInputFormat.java:64)
at
org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:240)
... 9 more
SCRIPT
=======
-- ---------------------------------------------------------------------------
-- Session settings: compression and job sizing.
-- ---------------------------------------------------------------------------

-- Final job output: compressed with the lzop container codec.
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
-- The original line read: set hiveconf mapred.output.compression.type=BLOCK
-- The space after hiveconf (instead of the hiveconf: namespace colon) meant the
-- compression type property was never actually set. Written here as a plain
-- property, using the mapreduce.* name like the rest of this block.
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

-- Map output (shuffle) compression. Uses the mapreduce.* key instead of the
-- deprecated mapred.map.output.compression.codec for consistency.
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;

-- Intermediate files written between the stages of this multi-stage query.
-- Without an explicit intermediate codec these SequenceFiles appear to inherit
-- the output codec (LzopCodec). The reported stack trace fails inside
-- SequenceFile.Reader while LzopInputStream is reading an lzop file header,
-- which a SequenceFile compression block does not carry. LzoCodec is the
-- header-less LZO stream codec from the same hadoop-lzo package.
-- NOTE(review): believed to be the cause of the Premature EOF - confirm by
-- re-running the failing script with these two settings.
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=com.hadoop.compression.lzo.LzoCodec;
set hive.intermediate.compression.type=BLOCK;

-- Job sizing hints, values unchanged from the original script.
set mapreduce.job.maps=500;
set mapreduce.job.reduces=8;
-- NOTE(review): the stack trace shows tasks running as YarnChild. The two
-- tasktracker slot limits below are presumably ignored under YARN - confirm
-- before relying on them.
set mapreduce.tasktracker.map.tasks.maximum=12;
set mapreduce.tasktracker.reduce.tasks.maximum=8;
-- ---------------------------------------------------------------------------
-- UDF registration: load the project UDF jar and bind session-scoped function
-- names to its classes. Each statement is kept on a single line.
-- ---------------------------------------------------------------------------
ADD JAR /home/nextag/sasubramanian/mycode/impressions/jar/impressions-hiveudfs-1.0-20130615-155038.jar;

-- Functions referenced by the query in this script.
CREATE TEMPORARY FUNCTION isnextagip AS 'com.wizecommerce.utils.hive.udf.IsNextagIP';
CREATE TEMPORARY FUNCTION isfrombot AS 'com.wizecommerce.utils.hive.udf.IsFromBot';
CREATE TEMPORARY FUNCTION getValidHiddenSellers AS 'com.wizecommerce.utils.hive.udf.GetValidHiddenSellers';

-- Registered but not referenced by the query visible in this script.
CREATE TEMPORARY FUNCTION collect AS 'com.wizecommerce.utils.hive.udf.GenericUDAFCollect';
CREATE TEMPORARY FUNCTION processblankkeyword AS 'com.wizecommerce.utils.hive.udf.ProcessBlankKeyword';
-- ---------------------------------------------------------------------------
-- Export one row per impression header for partition 2013-03-19.
--
-- Output columns, in order:
--   1. header_date
--   2. impression_id
--   3. header_searchsessionid
--   4. cached_visit_id
--   5. first dot-separated component of header_servername
--   6. cached_ip
--   7. header_adnode
--   8. comma-separated set of seller_id|pricetier|price|ptitle_rank strings,
--      or NULL when that string equals the literal -1|-1
--   9. comma-separated result of getValidHiddenSellers over the set of
--      seller_id|ptitle_id|tag_id|price_tier strings, or NULL when empty
--
-- Structure:
--   h  : headers for the partition with header_rbabsentsellers = 1, a non-NULL
--        cached_recordid, and both isnextagip and isfrombot returning FALSE
--   po : ptitle rows for the partition with a non-NULL seller_id
--   pi : (impression_id, ptitle_ptitleid, ptitle_rank) groups of those rows
--        having exactly one distinct seller/pricetier/price/rank combination
--   p  : po restricted to the groups kept by pi
--   hp : h left-joined to p, so headers without ptitle rows are retained
--   sh : hidden-seller rows for the partition, left-joined on impression
--
-- Change from the original: sh now lists the six columns the outer query uses
-- instead of SELECT *. LEFT OUTER JOIN, bare JOIN and IF() are kept as written.
-- ---------------------------------------------------------------------------
INSERT OVERWRITE DIRECTORY '/user/beeswax/warehouse/keyword_impressions_ptitles_log/2013-03-19'
SELECT
    hp.header_date,
    hp.impression_id,
    hp.header_searchsessionid,
    hp.cached_visit_id,
    split(hp.header_servername, '[\.]')[0],
    hp.cached_ip,
    hp.header_adnode,
    -- NOTE(review): -1|-1 is presumably what remains when no ptitle row matched
    -- (seller_id and seller_price NULL, both IF defaults applied) - confirm.
    IF(
        concat_ws(',', collect_set(concat_ws('|',
            cast(hp.seller_id AS STRING),
            cast(IF(hp.seller_pricetier IS NULL, -1L, hp.seller_pricetier) AS STRING),
            cast(hp.seller_price AS STRING),
            cast(IF(hp.ptitle_rank IS NULL, -1L, hp.ptitle_rank) AS STRING)
        ))) = '-1|-1',
        NULL,
        concat_ws(',', collect_set(concat_ws('|',
            cast(hp.seller_id AS STRING),
            cast(IF(hp.seller_pricetier IS NULL, -1L, hp.seller_pricetier) AS STRING),
            cast(hp.seller_price AS STRING),
            cast(IF(hp.ptitle_rank IS NULL, -1L, hp.ptitle_rank) AS STRING)
        )))
    ),
    IF(
        concat_ws(',', getValidHiddenSellers(collect_set(concat_ws('|',
            cast(sh.seller_id AS STRING),
            cast(sh.ptitle_id AS STRING),
            cast(sh.tag_id AS STRING),
            cast(IF(sh.price_tier IS NULL, -1L, sh.price_tier) AS STRING)
        )))) = '',
        NULL,
        concat_ws(',', getValidHiddenSellers(collect_set(concat_ws('|',
            cast(sh.seller_id AS STRING),
            cast(sh.ptitle_id AS STRING),
            cast(sh.tag_id AS STRING),
            cast(IF(sh.price_tier IS NULL, -1L, sh.price_tier) AS STRING)
        ))))
    )
FROM
    (
        SELECT
            h.header_date,
            h.header_servername,
            h.impression_id,
            h.header_searchsessionid,
            h.cached_visit_id,
            h.cached_ip,
            h.header_adnode,
            p.ptitle_ptitleid,
            p.seller_id,
            p.seller_pricetier,
            p.seller_price,
            p.ptitle_rank
        FROM
            (
                SELECT
                    header_date,
                    header_servername,
                    impression_id,
                    header_searchsessionid,
                    cached_ip,
                    header_adnode,
                    cached_recordid,
                    cached_visit_id
                FROM
                    outpdir_impressions_header
                WHERE
                    header_date_partition = '2013-03-19'
                    AND header_rbabsentsellers = 1L
                    AND cached_recordid IS NOT NULL
                    AND isnextagip(cached_ip) = FALSE
                    AND isfrombot(cached_visit_id) = FALSE
            ) h
        LEFT OUTER JOIN
            (
                SELECT
                    po.impression_id,
                    po.ptitle_ptitleid,
                    po.header_date,
                    po.seller_id,
                    po.seller_pricetier,
                    po.seller_price,
                    po.ptitle_rank
                FROM
                    (
                        SELECT
                            impression_id,
                            ptitle_ptitleid,
                            header_date,
                            seller_id,
                            seller_pricetier,
                            seller_price,
                            ptitle_rank
                        FROM
                            outpdir_impressions_ptitle
                        WHERE
                            header_date_partition = '2013-03-19'
                            AND seller_id IS NOT NULL
                    ) po
                JOIN
                    (
                        SELECT
                            impression_id,
                            ptitle_ptitleid,
                            ptitle_rank,
                            COUNT(DISTINCT seller_id, seller_pricetier, seller_price, ptitle_rank)
                        FROM
                            outpdir_impressions_ptitle pi
                        WHERE
                            header_date_partition = '2013-03-19'
                            AND seller_id IS NOT NULL
                        GROUP BY
                            impression_id,
                            ptitle_ptitleid,
                            ptitle_rank
                        HAVING
                            COUNT(DISTINCT seller_id, seller_pricetier, seller_price, ptitle_rank) = 1
                    ) pi
                ON
                    po.impression_id = pi.impression_id
                    AND po.ptitle_ptitleid = pi.ptitle_ptitleid
                    AND po.ptitle_rank = pi.ptitle_rank
            ) p
        ON
            h.impression_id = p.impression_id
            AND h.header_date = p.header_date
    ) hp
LEFT OUTER JOIN
    (
        SELECT
            impression_id,
            header_date,
            seller_id,
            ptitle_id,
            tag_id,
            price_tier
        FROM
            outpdir_seller_hidden
        WHERE
            header_date_partition = '2013-03-19'
    ) sh
ON
    hp.impression_id = sh.impression_id
    AND hp.header_date = sh.header_date
GROUP BY
    hp.header_date,
    hp.impression_id,
    hp.header_searchsessionid,
    hp.cached_visit_id,
    hp.header_servername,
    hp.cached_ip,
    hp.header_adnode
CONFIDENTIALITY NOTICE
======================
This email message and any attachments are for the exclusive use of the
intended recipient(s) and may contain confidential and privileged information.
Any unauthorized review, use, disclosure or distribution is prohibited. If you
are not the intended recipient, please contact the sender by reply email and
destroy all copies of the original message along with any attachments, from
your computer system. If you are the intended recipient, please be advised that
the content of this message is subject to access, review and disclosure by the
sender's Email System Administrator.