Repository: incubator-hawq Updated Branches: refs/heads/master a81ae771e -> cb7caf540
HAWQ-503. Fix the bug of failed temporary directory and GRM host/rack conflicts in resource pool in YARN mode Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/cb7caf54 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/cb7caf54 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/cb7caf54 Branch: refs/heads/master Commit: cb7caf5408b8677bfd189fdea9654825c6ce370d Parents: a81ae77 Author: Wen Lin <w...@pivotal.io> Authored: Thu Mar 10 14:17:04 2016 +0800 Committer: Wen Lin <w...@pivotal.io> Committed: Thu Mar 10 14:17:04 2016 +0800 ---------------------------------------------------------------------- .../resourcebroker_LIBYARN_proc.c | 2 + src/backend/resourcemanager/resourcepool.c | 112 ++++++++++++++----- 2 files changed, 87 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/cb7caf54/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c ---------------------------------------------------------------------- diff --git a/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c b/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c index d3028c3..3afae7b 100644 --- a/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c +++ b/src/backend/resourcemanager/resourcebroker/resourcebroker_LIBYARN_proc.c @@ -1500,6 +1500,8 @@ int RB2YARN_getClusterReport(DQueue hosts) segstat->Info.GRMRackNameLen = racknamelen; segstat->Info.GRMRackNameOffset = segstat->Info.GRMHostNameOffset + __SIZE_ALIGN64(hostnamelen+1); + segstat->Info.FailedTmpDirOffset = 0; + segstat->Info.FailedTmpDirLen = 0; segstat->Info.Size = segsize; memcpy((char *)&(segstat->Info) + sizeof(SegInfoData), http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/cb7caf54/src/backend/resourcemanager/resourcepool.c ---------------------------------------------------------------------- diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c index 6cc53eb..a0f64eb 100644 --- a/src/backend/resourcemanager/resourcepool.c +++ b/src/backend/resourcemanager/resourcepool.c @@ -1004,47 +1004,69 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) int old = segresource->Stat->Info.FailedTmpDirLen == 0 ? 0 :__SIZE_ALIGN64(segresource->Stat->Info.FailedTmpDirLen+1); - int new = segstat->Info.FailedTmpDirLen == 0 ? + int new = segstat->Info.FailedTmpDirLen == 0 ? 0 : __SIZE_ALIGN64(segstat->Info.FailedTmpDirLen+1); - if (new > old && - segresource->Stat->Info.Size - - (segresource->Stat->Info.HostNameOffset + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1)) - < new) + + int current = segresource->Stat->Info.Size - + (segresource->Stat->Info.HostNameOffset + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1)); + if (segresource->Stat->Info.GRMHostNameLen != 0 && segresource->Stat->Info.GRMHostNameOffset != 0) + current -= __SIZE_ALIGN64(segresource->Stat->Info.GRMHostNameLen+1); + if (segresource->Stat->Info.GRMRackNameLen != 0 && segresource->Stat->Info.GRMRackNameOffset != 0) + current -= __SIZE_ALIGN64(segresource->Stat->Info.GRMRackNameLen+1); + + /* + * repalloc memory if new size exceeds the old one. + * we don't shrink memory size if new size is less than the old one. + */ + if (new > old && current < new) { SegStat newSegStat = rm_repalloc(PCONTEXT, segresource->Stat, offsetof(SegStatData, Info) + segresource->Stat->Info.Size + (new - old)); segresource->Stat = newSegStat; - memset((char*)&segresource->Stat->Info + segresource->Stat->Info.Size, 0, (new - old)); segresource->Stat->Info.Size += (new - old); } - if (segstat->FailedTmpDirNum != 0) + if (segresource->Stat->Info.FailedTmpDirOffset == 0) { + Assert(segresource->Stat->FailedTmpDirNum == 0); segresource->Stat->Info.FailedTmpDirOffset = segresource->Stat->Info.HostNameOffset + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1); + if (segresource->Stat->Info.GRMHostNameLen != 0 && segresource->Stat->Info.GRMHostNameOffset != 0) + segresource->Stat->Info.FailedTmpDirOffset += __SIZE_ALIGN64(segresource->Stat->Info.GRMHostNameLen+1); + if (segresource->Stat->Info.GRMRackNameLen != 0 && segresource->Stat->Info.GRMRackNameOffset != 0) + segresource->Stat->Info.FailedTmpDirOffset += __SIZE_ALIGN64(segresource->Stat->Info.GRMRackNameLen+1); + } + + /* clear old failed temporary directory string in SegInfoData */ + memset((char *)&segresource->Stat->Info + + segresource->Stat->Info.FailedTmpDirOffset, + 0, + segresource->Stat->Info.Size - + segresource->Stat->Info.FailedTmpDirOffset); + + if (segstat->FailedTmpDirNum != 0) + { memcpy((char *)&segresource->Stat->Info + segresource->Stat->Info.FailedTmpDirOffset, GET_SEGINFO_FAILEDTMPDIR(&segstat->Info), strlen(GET_SEGINFO_FAILEDTMPDIR(&segstat->Info))); - memset((char *)&segresource->Stat->Info + - segresource->Stat->Info.FailedTmpDirOffset + - segstat->Info.FailedTmpDirLen, - 0, - segresource->Stat->Info.Size - - segresource->Stat->Info.FailedTmpDirOffset - - segstat->Info.FailedTmpDirLen); } else { - memset((char *)&segresource->Stat->Info + segresource->Stat->Info.FailedTmpDirOffset, - 0, - segresource->Stat->Info.Size - segresource->Stat->Info.FailedTmpDirOffset); segresource->Stat->Info.FailedTmpDirOffset = 0; } segresource->Stat->Info.FailedTmpDirLen = segstat->Info.FailedTmpDirLen; segresource->Stat->FailedTmpDirNum = segstat->FailedTmpDirNum; + elog(RMLOG, "After resource manager " + "updates segment failed temporary directory, " + "GRM hostname:%s, GRM rackname:%s", + segresource->Stat->Info.GRMHostNameLen == 0 ? + "":GET_SEGINFO_GRMHOSTNAME(&(segresource->Stat->Info)), + segresource->Stat->Info.GRMRackNameLen == 0 ? + "":GET_SEGINFO_GRMRACKNAME(&(segresource->Stat->Info))); + setSegResHAWQAvailability(segresource, segstat->FTSAvailable); if (Gp_role != GP_ROLE_UTILITY) { @@ -1205,27 +1227,60 @@ int updateHAWQSegWithGRMSegStat( SegStat segstat) int oldgracklen = segres->Stat->Info.GRMRackNameLen == 0 ? 0 : __SIZE_ALIGN64(segres->Stat->Info.GRMRackNameLen+1); + + Assert(segres->Stat->Info.HostNameOffset != 0); + int current = segres->Stat->Info.Size - + (segres->Stat->Info.HostNameOffset + + __SIZE_ALIGN64(segres->Stat->Info.HostNameLen+1)); + if (segres->Stat->FailedTmpDirNum != 0) + current -= __SIZE_ALIGN64(segres->Stat->Info.FailedTmpDirLen +1); + + /* + * If new GRM hostname and rackname length exceeds the old one, + * repalloc memory. But never shrink memory. + */ int change = ghostlen + gracklen - oldghostlen - oldgracklen; - if (change > 0) + if (change > 0 && current < (ghostlen + gracklen)) { newSegStat = rm_repalloc(PCONTEXT, segres->Stat, offsetof(SegStatData, Info) + segres->Stat->Info.Size + change); segres->Stat = newSegStat; + segres->Stat->Info.Size += change; } else newSegStat = segres->Stat; - Assert(newSegStat != NULL); - /* Reset the memory area for GRM host and rack name zero filled. */ - memset((char*)newSegStat + - offsetof(SegStatData, Info) + segres->Stat->Info.Size - - (oldghostlen + oldgracklen), - '\0', - ghostlen + gracklen); + /* Refill failed temporary directory string */ + if (segres->Stat->FailedTmpDirNum != 0 && change > 0 + && current < (ghostlen + gracklen)) + { + Assert(newSegStat->Info.FailedTmpDirOffset != 0 && + newSegStat->Info.FailedTmpDirLen != 0); + memmove((char*)newSegStat + offsetof(SegStatData, Info) + + newSegStat->Info.FailedTmpDirOffset + change, + (char*)newSegStat + offsetof(SegStatData, Info) + + newSegStat->Info.FailedTmpDirOffset, + __SIZE_ALIGN64(segres->Stat->Info.FailedTmpDirLen + 1)); + memset((char*)newSegStat + offsetof(SegStatData, Info) + + newSegStat->Info.FailedTmpDirOffset, + 0, + change); + newSegStat->Info.FailedTmpDirOffset += change; + } Assert(newSegStat != NULL); + /* Reset the memory area for GRM host and rack name zero filled. */ + Assert(newSegStat->Info.HostNameLen != 0); + memset((char *)&newSegStat->Info + newSegStat->Info.HostNameOffset + + __SIZE_ALIGN64(newSegStat->Info.HostNameLen + 1), + 0, + segres->Stat->Info.Size - + newSegStat->Info.HostNameOffset - + __SIZE_ALIGN64(newSegStat->Info.HostNameLen + 1) - + (segres->Stat->Info.FailedTmpDirLen == 0) ? + 0:__SIZE_ALIGN64(segres->Stat->Info.FailedTmpDirLen + 1)); newSegStat->Info.GRMHostNameLen = segstat->Info.GRMHostNameLen; newSegStat->Info.GRMHostNameOffset = newSegStat->Info.HostNameOffset + @@ -1233,8 +1288,6 @@ int updateHAWQSegWithGRMSegStat( SegStat segstat) newSegStat->Info.GRMRackNameLen = segstat->Info.GRMRackNameLen; newSegStat->Info.GRMRackNameOffset = newSegStat->Info.GRMHostNameOffset + __SIZE_ALIGN64(newSegStat->Info.GRMHostNameLen+1); - newSegStat->Info.Size = newSegStat->Info.GRMRackNameOffset + - __SIZE_ALIGN64(newSegStat->Info.GRMRackNameLen+1); strcpy(GET_SEGINFO_GRMHOSTNAME(&(newSegStat->Info)), GET_SEGINFO_GRMHOSTNAME(&(segstat->Info))); @@ -1249,6 +1302,11 @@ int updateHAWQSegWithGRMSegStat( SegStat segstat) GET_SEGINFO_GRMHOSTNAME(&(newSegStat->Info)), GET_SEGINFO_GRMRACKNAME(&(newSegStat->Info))); + elog(RMLOG, "After resource manager " + "updates segment info's GRM host name and rack name, " + "failed temporary directory: %s", + segres->Stat->FailedTmpDirNum == 0 ? "":GET_SEGINFO_FAILEDTMPDIR(&(segres->Stat->Info))); + /* Always set segment global resource manager available. */ setSegResGLOBAvailability(segres, RESOURCE_SEG_STATUS_AVAILABLE);