Hi,
Can you please suggest how I can improve the performance of the Python code
being called from Pig? Also, can the same logic be implemented in Pig itself?
PIG script:
-- Register the Python UDF module; its functions are called as smsiuc_udfs.<name>
-- and run in a separate streaming_python process.
REGISTER '/inreport/bigdata-smsiuc/bigdata_scripts/smsiuc_udf.py' using streaming_python as smsiuc_udfs;

-- Load the pipe-delimited CDR files. '-tagFile' prepends the source file name
-- as column $0, which is kept below as cdrfname.
cdrs = load '/mtsmscdrs/201604/processing/ZTE_2016040111*' USING PigStorage('|','-tagFile') ;

-- Project only the columns the report needs, upper-casing the party and GT
-- fields, mapping $29 to an S/F status flag and three known $12 values to
-- SMSC names (any other $12 value is passed through unchanged).
cdrrecord = FOREACH cdrs GENERATE
    (chararray) UPPER($1) as aparty,
    (chararray) UPPER($2) as bparty,
    $3 as smssentdate,
    $4 as smssenttime,
    ($29=='6' ? 'S' : 'F') as status,
    (chararray) UPPER($26) as srcgt,
    (chararray) UPPER($27) as destgt,
    ($12=='405899136999995' ? 'MTSDEL-CDMA' : ($12=='919875089998' ? 'MTSRAJ-GSM' : ($12=='405899150999995' ? 'MTSCHN-CDMA' : $12) ) ) as smscgt,
    (chararray)$0 as cdrfname,
    (chararray) $13 as prepost;

-- Keep successful CDRs of month $MON that the UDF classifies as P2P.
-- The cheap built-in predicates come first so that rows failing them are
-- rejected before the comparatively expensive streaming Python UDF is invoked.
filteredp2pcdrs = FILTER cdrrecord by
    status == 'S' and
    SUBSTRING(smssentdate,4,6) == '$MON' and
    smsiuc_udfs.pullp2pcdrs(aparty,bparty,srcgt,destgt);

-- Map every remaining CDR to its report key via the Python UDF.
p2preportmap = FOREACH filteredp2pcdrs GENERATE
    smsiuc_udfs.p2preport(srcgt,destgt,aparty,bparty),smscgt,status,prepost;

-- Count the CDRs per report key.
grpp2preportmap = GROUP p2preportmap by (p2pmappedreport);
p2preport = FOREACH grpp2preportmap GENERATE group as (grpp2preportmap),COUNT(p2preportmap);

-- Write both the aggregated report and the filtered CDRs as comma-separated text.
STORE p2preport into '/testalnumout/p2preport/' using PigStorage(',');
store filteredp2pcdrs into '/testalnumout/filteredp2pcdrs/' using PigStorage(',');
Python UDF:
# Parsed master files keyed by (filename, position). Each file is read at most
# once per UDF process instead of once per call, which matters because the
# lookups run for every input record.
_READFILEINBAG_CACHE = {}


# NOTE(review): the declared schema is a bag, but the function returns a dict;
# the decorator is kept unchanged because only Python callers are visible here.
@outputSchema('output_bag_field_name:bag{mstbag:(inner_field_name_1:chararray, inner_field_name_2:chararray)}')
def readfileinbag(filename, position):
    """Load a comma-separated master file into a lookup dict.

    Each non-blank line is stripped and split on ",". The first column
    becomes the key and the columns from index ``position`` onwards, re-joined
    with ",", become the value. When a key occurs more than once, the last
    line wins.

    The parsed dict is cached per ``(filename, position)`` and the same object
    is returned on later calls, so callers must treat it as read-only.

    Args:
        filename: path of the master file to read.
        position: index of the first column to include in the value.

    Returns:
        dict mapping first-column key to the joined remaining columns.

    Raises:
        IOError/OSError: if the file cannot be opened (nothing is cached).
    """
    cache_key = (filename, position)
    cached = _READFILEINBAG_CACHE.get(cache_key)
    if cached is not None:
        return cached

    masterbag = {}
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise register the empty string as a
                # key, which then matches any empty-prefix lookup.
                continue
            splitline = stripped.split(",")
            masterbag[splitline[0]] = ",".join(splitline[position:])

    _READFILEINBAG_CACHE[cache_key] = masterbag
    return masterbag
# Master lookup tables, parsed once per UDF process and reused for every
# record. Checking and re-reading both master files on each call is what made
# the per-record cost so high.
_P2P_MASTER_TABLES = {}

# Row returned when no mapping can be produced: one NULL per field of the
# six-field output schema (the normal rows also carry six fields).
_P2P_NULL_ROW = 'NULL,NULL,NULL,NULL,NULL,NULL'


def _master_file_usable(path):
    """Return True when *path* is a regular, readable, non-empty file."""
    return (os.path.isfile(path) and os.access(path, os.R_OK)
            and os.stat(path).st_size > 0)


def _load_master_tables():
    """Return ``(mastergt, masterlrn, mastergtcircle)`` or None if unavailable.

    The tables are cached only after a successful load, so a missing master
    file is re-checked on the next call. Once loaded, later changes to the
    files are not picked up by the running process.
    """
    if not _P2P_MASTER_TABLES:
        if not (_master_file_usable(MASTERLRN)
                and _master_file_usable(MASTERGT)):
            return None
        _P2P_MASTER_TABLES['tables'] = (
            readfileinbag(MASTERGT, 1),   # key -> MASTERGT columns 1..end
            readfileinbag(MASTERLRN, 2),  # key -> MASTERLRN columns 2..end
            readfileinbag(MASTERGT, 2),   # key -> MASTERGT columns 2..end
        )
    return _P2P_MASTER_TABLES['tables']


def _longest_prefix_value(table, key, longest, shortest=4):
    """Return the value of the longest prefix of *key* present in *table*.

    Prefix lengths are tried from *longest* down to *shortest* inclusive;
    None is returned when none of those prefixes is a key of *table*.
    """
    for width in range(longest, shortest - 1, -1):
        prefix = key[0:width]
        if prefix in table:
            return table[prefix]
    return None


@outputSchema('p2pmappedreport:tuple(OriginOperator:chararray,OriginCircle:chararray,DestinationOperator:chararray,DestinationCircle:chararray,apartycircle:chararray,bpartycircle)')
def p2preport(srcgt, destgt, aparty, bparty):
    """Build the comma-separated report key for one CDR.

    The result is four parts joined with ",":

    1. source GT mapping: exact ``srcgt`` match in MASTERGT, else its longest
       prefix of length 9..4, else ``srcgt + ","``;
    2. destination GT mapping: same rule applied to ``destgt``;
    3. A-party circle: longest prefix (9..4) of the normalised A-party number
       in MASTERGT, else an exact match, else the number itself;
    4. B-party circle: characters 2..5 of the normalised B-party number looked
       up in MASTERLRN, else its longest prefix (7..4) in MASTERGT, else an
       exact match, else the number itself.

    Args:
        srcgt: source global title.
        destgt: destination global title.
        aparty: originating number; must be parseable by ``int``.
        bparty: destination number; must be parseable by ``int``.

    Returns:
        The joined string, or a row of six ``NULL`` fields when either master
        file is missing, unreadable or empty.

    Raises:
        ValueError/TypeError: if ``aparty`` or ``bparty`` is not numeric.
    """
    # int() round-trip drops leading zeros and surrounding whitespace.
    origno = str(int(aparty))
    destno = str(int(bparty))
    try:
        tables = _load_master_tables()
        if tables is None:
            return _P2P_NULL_ROW
        mastergt, masterlrn, mastergtcircle = tables

        src_fields = mastergt.get(srcgt)
        if src_fields is None:
            src_fields = _longest_prefix_value(mastergt, srcgt, 9)
        if src_fields is None:
            src_fields = srcgt + ","

        dest_fields = mastergt.get(destgt)
        if dest_fields is None:
            dest_fields = _longest_prefix_value(mastergt, destgt, 9)
        if dest_fields is None:
            # Joined with "," below like every other branch; previously this
            # fallback was appended without a separator, fusing two fields.
            dest_fields = destgt + ","

        orig_circle = _longest_prefix_value(mastergtcircle, origno, 9)
        if orig_circle is None:
            orig_circle = mastergtcircle.get(origno, origno)

        lrn_code = destno[2:6]
        if lrn_code in masterlrn:
            dest_circle = masterlrn[lrn_code]
        else:
            dest_circle = _longest_prefix_value(mastergtcircle, destno, 7)
            if dest_circle is None:
                dest_circle = mastergtcircle.get(destno, destno)

        return ",".join([src_fields, dest_fields, orig_circle, dest_circle])
    except AttributeError:
        # Previously swallowed with an implicit None result; return the same
        # fallback row as the "master files unavailable" path instead.
        return _P2P_NULL_ROW
MASTERLRN has the following contents:
3078,IDEA,OR
3079,IDEA,PB
3080,IDEA,RAJ
3081,IDEA,CHN
3082,IDEA,UPE
MASTERGT has the following contents:
MTS_VASSLFCARE,MTS,DL
NEWTC,MTS,DL
NOIDA_ALARM,MTS,DL
OCG1CTO,MTS,DL
OCG3CTO,MTS,DL
OCG4CTO,MTS,DL
OCGAR,MTS,DL
OCGCNT,MTS,DL
OCGERROR,MTS,DL
OCGEXCEPTION,MTS,DL
ODP,MTS,DL
OMCR,MTS,DL
Best regards
Amit Sharma
________________________________
This E-Mail may contain Confidential and/or legally privileged Information and
is meant for the intended recipient(s) only. If you have received this e-mail
in error and are not the intended recipient/s, Kindly notify the sender and
then delete this e-mail immediately from your system. You are also hereby
notified that any use, any form of reproduction, dissemination, copying,
disclosure, modification, distribution and/or publication of this e-mail, its
contents or its attachment/s other than by its intended recipient/s is strictly
prohibited and may be unlawful.
Internet Communications cannot be guaranteed to be secure or error-free as
information could be delayed, intercepted, corrupted, lost, or contain viruses.
Sistema Shyam Teleservices Limited does not accept any liability for any
errors, omissions, viruses or computer problems experienced by any recipient as
a result of this e-mail.