pcc updated this revision to Diff 207379.
pcc added a comment.

- Simplify tagAlloca
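
  For reference, the short granule check that PossiblyShortTagMatches in
  compiler-rt/lib/hwasan/hwasan_checks.h implements can be summarized by the
  following standalone C++ sketch (illustrative only, not the patch code; it
  assumes 64-bit pointers with the tag in the top byte and a 16-byte granule):

    #include <cstddef>
    #include <cstdint>

    // Returns true if the access [ptr, ptr + sz) is permitted by the shadow
    // tag of the granule containing ptr.
    static bool TagMatchesPossiblyShort(uint8_t mem_tag, uintptr_t ptr,
                                        size_t sz) {
      uint8_t ptr_tag = ptr >> 56;        // address tag lives in the top byte
      if (ptr_tag == mem_tag)
        return true;                      // ordinary full-granule match
      if (mem_tag > 15)
        return false;                     // not a short granule marker
      if ((ptr & 15) + sz > mem_tag)
        return false;                     // access runs past the used bytes
      // For a short granule, the real tag is stored in its last byte.
      uintptr_t untagged = ptr & 0x00ffffffffffffffULL;
      return *reinterpret_cast<const uint8_t *>(untagged | 15) == ptr_tag;
    }

  The inline IR sequence in HWAddressSanitizer.cpp and the outlined AArch64
  check in AArch64AsmPrinter.cpp open-code the same logic.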


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D63908/new/

https://reviews.llvm.org/D63908

Files:
  clang/docs/HardwareAssistedAddressSanitizerDesign.rst
  compiler-rt/lib/hwasan/hwasan_allocator.cpp
  compiler-rt/lib/hwasan/hwasan_checks.h
  compiler-rt/lib/hwasan/hwasan_flags.inc
  compiler-rt/lib/hwasan/hwasan_report.cpp
  compiler-rt/lib/hwasan/hwasan_report.h
  compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
  compiler-rt/test/hwasan/TestCases/random-align-right.c
  compiler-rt/test/hwasan/TestCases/stack-oob.c
  compiler-rt/test/hwasan/TestCases/tail-magic.c
  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
  llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
  llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
  llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
  llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
  llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll

Index: llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll
@@ -14,9 +14,10 @@
 ; CHECK: %[[B:[^ ]*]] = lshr i64 %[[A]], 20
 ; CHECK: %[[BASE_TAG:[^ ]*]] = xor i64 %[[A]], %[[B]]
 
-; CHECK: %[[X:[^ ]*]] = alloca i32, align 16
+; CHECK: %[[X:[^ ]*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK: %[[X_GEP:[^ ]*]] = getelementptr { i32, [12 x i8] }, { i32, [12 x i8] }* %[[X]], i32 0, i32 0
 ; CHECK: %[[X_TAG:[^ ]*]] = xor i64 %[[BASE_TAG]], 0
-; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[C:[^ ]*]] = shl i64 %[[X_TAG]], 56
 ; CHECK: %[[D:[^ ]*]] = or i64 %[[C]], 72057594037927935
 ; CHECK: %[[E:[^ ]*]] = and i64 %[[X1]], %[[D]]
Index: llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -28,11 +28,35 @@
 ; RECOVER-ZERO-BASED-SHADOW: %[[E:[^ ]*]] = inttoptr i64 %[[D]] to i8*
 ; RECOVER: %[[MEMTAG:[^ ]*]] = load i8, i8* %[[E]]
 ; RECOVER: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
-; RECOVER: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
+; RECOVER: br i1 %[[F]], label %[[MISMATCH:[0-9]*]], label %[[CONT:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[MISMATCH]]:
+; RECOVER: %[[NOTSHORT:[^ ]*]] = icmp ugt i8 %[[MEMTAG]], 15
+; RECOVER: br i1 %[[NOTSHORT]], label %[[FAIL:[0-9]*]], label %[[SHORT:[0-9]*]], !prof {{.*}}
 
+; RECOVER: [[FAIL]]:
 ; RECOVER: call void asm sideeffect "brk #2336", "{x0}"(i64 %[[A]])
 ; RECOVER: br label
 
+; RECOVER: [[SHORT]]:
+; RECOVER: %[[LOWBITS:[^ ]*]] = and i64 %[[A]], 15
+; RECOVER: %[[LOWBITS_I8:[^ ]*]] = trunc i64 %[[LOWBITS]] to i8
+; RECOVER: %[[LAST:[^ ]*]] = add i8 %[[LOWBITS_I8]], 0
+; RECOVER: %[[OOB:[^ ]*]] = icmp uge i8 %[[LAST]], %[[MEMTAG]]
+; RECOVER: br i1 %[[OOB]], label %[[FAIL]], label %[[INBOUNDS:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[INBOUNDS]]:
+; RECOVER: %[[EOG_ADDR:[^ ]*]] = or i64 %[[C]], 15
+; RECOVER: %[[EOG_PTR:[^ ]*]] = inttoptr i64 %[[EOG_ADDR]] to i8*
+; RECOVER: %[[EOGTAG:[^ ]*]] = load i8, i8* %[[EOG_PTR]]
+; RECOVER: %[[EOG_MISMATCH:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[EOGTAG]]
+; RECOVER: br i1 %[[EOG_MISMATCH]], label %[[FAIL]], label %[[CONT1:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[CONT1]]:
+; RECOVER: br label %[[CONT]]
+
+; RECOVER: [[CONT]]:
+
 ; ABORT-DYNAMIC-SHADOW: call void @llvm.hwasan.check.memaccess(i8* %.hwasan.shadow, i8* %a, i32 0)
 ; ABORT-ZERO-BASED-SHADOW: call void @llvm.hwasan.check.memaccess(i8* null, i8* %a, i32 0)
 
@@ -55,11 +79,35 @@
 ; RECOVER-ZERO-BASED-SHADOW: %[[E:[^ ]*]] = inttoptr i64 %[[D]] to i8*
 ; RECOVER: %[[MEMTAG:[^ ]*]] = load i8, i8* %[[E]]
 ; RECOVER: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
-; RECOVER: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
+; RECOVER: br i1 %[[F]], label %[[MISMATCH:[0-9]*]], label %[[CONT:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[MISMATCH]]:
+; RECOVER: %[[NOTSHORT:[^ ]*]] = icmp ugt i8 %[[MEMTAG]], 15
+; RECOVER: br i1 %[[NOTSHORT]], label %[[FAIL:[0-9]*]], label %[[SHORT:[0-9]*]], !prof {{.*}}
 
+; RECOVER: [[FAIL]]:
 ; RECOVER: call void asm sideeffect "brk #2337", "{x0}"(i64 %[[A]])
 ; RECOVER: br label
 
+; RECOVER: [[SHORT]]:
+; RECOVER: %[[LOWBITS:[^ ]*]] = and i64 %[[A]], 15
+; RECOVER: %[[LOWBITS_I8:[^ ]*]] = trunc i64 %[[LOWBITS]] to i8
+; RECOVER: %[[LAST:[^ ]*]] = add i8 %[[LOWBITS_I8]], 1
+; RECOVER: %[[OOB:[^ ]*]] = icmp uge i8 %[[LAST]], %[[MEMTAG]]
+; RECOVER: br i1 %[[OOB]], label %[[FAIL]], label %[[INBOUNDS:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[INBOUNDS]]:
+; RECOVER: %[[EOG_ADDR:[^ ]*]] = or i64 %[[C]], 15
+; RECOVER: %[[EOG_PTR:[^ ]*]] = inttoptr i64 %[[EOG_ADDR]] to i8*
+; RECOVER: %[[EOGTAG:[^ ]*]] = load i8, i8* %[[EOG_PTR]]
+; RECOVER: %[[EOG_MISMATCH:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[EOGTAG]]
+; RECOVER: br i1 %[[EOG_MISMATCH]], label %[[FAIL]], label %[[CONT1:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[CONT1]]:
+; RECOVER: br label %[[CONT]]
+
+; RECOVER: [[CONT]]:
+
 ; ABORT: %[[A:[^ ]*]] = bitcast i16* %a to i8*
 ; ABORT-DYNAMIC-SHADOW: call void @llvm.hwasan.check.memaccess(i8* %.hwasan.shadow, i8* %[[A]], i32 1)
 ; ABORT-ZERO-BASED-SHADOW: call void @llvm.hwasan.check.memaccess(i8* null, i8* %[[A]], i32 1)
Index: llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
@@ -16,24 +16,29 @@
 ; CHECK: %[[B:[^ ]*]] = lshr i64 %[[A]], 20
 ; CHECK: %[[BASE_TAG:[^ ]*]] = xor i64 %[[A]], %[[B]]
 
-; CHECK: %[[X:[^ ]*]] = alloca i32, align 16
+; CHECK: %[[X:[^ ]*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK: %[[X_GEP:[^ ]*]] = getelementptr { i32, [12 x i8] }, { i32, [12 x i8] }* %[[X]], i32 0, i32 0
 ; CHECK: %[[X_TAG:[^ ]*]] = xor i64 %[[BASE_TAG]], 0
-; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[C:[^ ]*]] = shl i64 %[[X_TAG]], 56
 ; CHECK: %[[D:[^ ]*]] = or i64 %[[X1]], %[[C]]
 ; CHECK: %[[X_HWASAN:[^ ]*]] = inttoptr i64 %[[D]] to i32*
 
 ; CHECK: %[[X_TAG2:[^ ]*]] = trunc i64 %[[X_TAG]] to i8
-; CHECK: %[[E:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[E:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[F:[^ ]*]] = lshr i64 %[[E]], 4
 ; DYNAMIC-SHADOW: %[[X_SHADOW:[^ ]*]] = getelementptr i8, i8* %.hwasan.shadow, i64 %[[F]]
 ; ZERO-BASED-SHADOW: %[[X_SHADOW:[^ ]*]] = inttoptr i64 %[[F]] to i8*
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %[[X_SHADOW]], i8 %[[X_TAG2]], i64 1, i1 false)
+; CHECK: %[[X_SHADOW_GEP:[^ ]*]] = getelementptr i8, i8* %[[X_SHADOW]], i32 0
+; CHECK: store i8 4, i8* %[[X_SHADOW_GEP]]
+; CHECK: %[[X_I8:[^ ]*]] = bitcast i32* %[[X_GEP]] to i8*
+; CHECK: %[[X_I8_GEP:[^ ]*]] = getelementptr i8, i8* %[[X_I8]], i32 15
+; CHECK: store i8 %[[X_TAG2]], i8* %[[X_I8_GEP]]
 ; CHECK: call void @use32(i32* nonnull %[[X_HWASAN]])
 
 ; UAR-TAGS: %[[BASE_TAG_COMPL:[^ ]*]] = xor i64 %[[BASE_TAG]], 255
 ; UAR-TAGS: %[[X_TAG_UAR:[^ ]*]] = trunc i64 %[[BASE_TAG_COMPL]] to i8
-; CHECK: %[[E2:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[E2:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[F2:[^ ]*]] = lshr i64 %[[E2]], 4
 ; DYNAMIC-SHADOW: %[[X_SHADOW2:[^ ]*]] = getelementptr i8, i8* %.hwasan.shadow, i64 %[[F2]]
 ; ZERO-BASED-SHADOW: %[[X_SHADOW2:[^ ]*]] = inttoptr i64 %[[F2]] to i8*
Index: llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
@@ -9,9 +9,10 @@
 
 define void @test_alloca() sanitize_hwaddress {
 ; CHECK-LABEL: @test_alloca(
+; CHECK: %[[GEP:[^ ]*]] = getelementptr { i32, [12 x i8] }, { i32, [12 x i8] }* %x, i32 0, i32 0
 ; CHECK: %[[T1:[^ ]*]] = call i8 @__hwasan_generate_tag()
 ; CHECK: %[[A:[^ ]*]] = zext i8 %[[T1]] to i64
-; CHECK: %[[B:[^ ]*]] = ptrtoint i32* %x to i64
+; CHECK: %[[B:[^ ]*]] = ptrtoint i32* %[[GEP]] to i64
 ; CHECK: %[[C:[^ ]*]] = shl i64 %[[A]], 56
 ; CHECK: or i64 %[[B]], %[[C]]
 
Index: llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
===================================================================
--- llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
+++ llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
@@ -40,8 +40,20 @@
 ; CHECK-NEXT: ldrb w16, [x9, x16]
 ; CHECK-NEXT: cmp x16, x0, lsr #56
 ; CHECK-NEXT: b.ne .Ltmp0
+; CHECK-NEXT: .Ltmp1:
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: cmp w16, #15
+; CHECK-NEXT: b.hi .Ltmp2
+; CHECK-NEXT: and x17, x0, #0xf
+; CHECK-NEXT: add x17, x17, #255
+; CHECK-NEXT: cmp w16, w17
+; CHECK-NEXT: b.ls .Ltmp2
+; CHECK-NEXT: orr x16, x0, #0xf
+; CHECK-NEXT: ldrb w16, [x16]
+; CHECK-NEXT: cmp x16, x0, lsr #56
+; CHECK-NEXT: b.eq .Ltmp1
+; CHECK-NEXT: .Ltmp2:
 ; CHECK-NEXT: stp x0, x1, [sp, #-256]!
 ; CHECK-NEXT: stp x29, x30, [sp, #232]
 ; CHECK-NEXT: mov x1, #456
@@ -58,9 +70,21 @@
 ; CHECK-NEXT: ubfx x16, x1, #4, #52
 ; CHECK-NEXT: ldrb w16, [x9, x16]
 ; CHECK-NEXT: cmp x16, x1, lsr #56
-; CHECK-NEXT: b.ne .Ltmp1
+; CHECK-NEXT: b.ne .Ltmp3
+; CHECK-NEXT: .Ltmp4:
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: cmp w16, #15
+; CHECK-NEXT: b.hi .Ltmp5
+; CHECK-NEXT: and x17, x1, #0xf
+; CHECK-NEXT: add x17, x17, #2047
+; CHECK-NEXT: cmp w16, w17
+; CHECK-NEXT: b.ls .Ltmp5
+; CHECK-NEXT: orr x16, x1, #0xf
+; CHECK-NEXT: ldrb w16, [x16]
+; CHECK-NEXT: cmp x16, x1, lsr #56
+; CHECK-NEXT: b.eq .Ltmp4
+; CHECK-NEXT: .Ltmp5:
 ; CHECK-NEXT: stp x0, x1, [sp, #-256]!
 ; CHECK-NEXT: stp x29, x30, [sp, #232]
 ; CHECK-NEXT: mov x0, x1
Index: llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
===================================================================
--- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -203,7 +203,7 @@
                                    Value **MaybeMask);
 
   bool isInterestingAlloca(const AllocaInst &AI);
-  bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag);
+  bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
   Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
   Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
   bool instrumentStack(
@@ -621,10 +621,35 @@
   }
 
   Instruction *CheckTerm =
-      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
+      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
                                 MDBuilder(*C).createBranchWeights(1, 100000));
 
   IRB.SetInsertPoint(CheckTerm);
+  Value *OutOfShortGranuleTagRange =
+      IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
+  Instruction *CheckFailTerm =
+      SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
+                                MDBuilder(*C).createBranchWeights(1, 100000));
+
+  IRB.SetInsertPoint(CheckTerm);
+  Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
+  PtrLowBits = IRB.CreateAdd(
+      PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
+  Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
+  SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
+                            MDBuilder(*C).createBranchWeights(1, 100000),
+                            nullptr, nullptr, CheckFailTerm->getParent());
+
+  IRB.SetInsertPoint(CheckTerm);
+  Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
+  InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
+  Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
+  Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
+  SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
+                            MDBuilder(*C).createBranchWeights(1, 100000),
+                            nullptr, nullptr, CheckFailTerm->getParent());
+
+  IRB.SetInsertPoint(CheckFailTerm);
   InlineAsm *Asm;
   switch (TargetTriple.getArch()) {
     case Triple::x86_64:
@@ -648,6 +673,8 @@
       report_fatal_error("unsupported architecture");
   }
   IRB.CreateCall(Asm, PtrLong);
+  if (Recover)
+    cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
 }
 
 void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
@@ -724,15 +751,14 @@
 }
 
 bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
-                                   Value *Tag) {
-  size_t Size = (getAllocaSizeInBytes(*AI) + Mapping.getAllocaAlignment() - 1) &
-                ~(Mapping.getAllocaAlignment() - 1);
+                                   Value *Tag, size_t Size) {
+  size_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment());
 
   Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
   if (ClInstrumentWithCalls) {
     IRB.CreateCall(HwasanTagMemoryFunc,
                    {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
-                    ConstantInt::get(IntptrTy, Size)});
+                    ConstantInt::get(IntptrTy, AlignedSize)});
   } else {
     size_t ShadowSize = Size >> Mapping.Scale;
     Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
@@ -742,7 +768,16 @@
     // FIXME: the interceptor is not as fast as real memset. Consider lowering
     // llvm.memset right here into either a sequence of stores, or a call to
     // hwasan_tag_memory.
-    IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+    if (ShadowSize)
+      IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+    if (Size != AlignedSize) {
+      IRB.CreateStore(
+          ConstantInt::get(Int8Ty, Size % Mapping.getAllocaAlignment()),
+          IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
+      IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
+                                   Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
+                                   AlignedSize - 1));
+    }
   }
   return true;
 }
@@ -1041,14 +1076,15 @@
       DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr));
     }
 
-    tagAlloca(IRB, AI, Tag);
+    size_t Size = getAllocaSizeInBytes(*AI);
+    tagAlloca(IRB, AI, Tag, Size);
 
     for (auto RI : RetVec) {
       IRB.SetInsertPoint(RI);
 
       // Re-tag alloca memory with the special UAR tag.
       Value *Tag = getUARTag(IRB, StackTag);
-      tagAlloca(IRB, AI, Tag);
+      tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getAllocaAlignment()));
     }
   }
 
@@ -1089,11 +1125,6 @@
     for (auto &Inst : BB) {
       if (ClInstrumentStack)
         if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
-          // Realign all allocas. We don't want small uninteresting allocas to
-          // hide in instrumented alloca's padding.
-          if (AI->getAlignment() < Mapping.getAllocaAlignment())
-            AI->setAlignment(Mapping.getAllocaAlignment());
-          // Instrument some of them.
           if (isInterestingAlloca(*AI))
             AllocasToInstrument.push_back(AI);
           continue;
@@ -1149,6 +1180,45 @@
                                StackTag);
   }
 
+  // Pad and align each of the allocas that we instrumented to stop small
+  // uninteresting allocas from hiding in instrumented alloca's padding and so
+  // that we have enough space to store real tags for short granules.
+  DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
+  for (AllocaInst *AI : AllocasToInstrument) {
+    uint64_t Size = getAllocaSizeInBytes(*AI);
+    uint64_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment());
+    AI->setAlignment(std::max(AI->getAlignment(), 16u));
+    if (Size != AlignedSize) {
+      Type *TypeWithPadding = StructType::get(
+          AI->getAllocatedType(), ArrayType::get(Int8Ty, AlignedSize - Size));
+      auto *NewAI = new AllocaInst(
+          TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
+      NewAI->takeName(AI);
+      NewAI->setAlignment(AI->getAlignment());
+      NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
+      NewAI->setSwiftError(AI->isSwiftError());
+      NewAI->copyMetadata(*AI);
+      Value *Zero = ConstantInt::get(Int32Ty, 0);
+      auto *GEP = GetElementPtrInst::Create(TypeWithPadding, NewAI,
+                                            {Zero, Zero}, "", AI);
+      AI->replaceAllUsesWith(GEP);
+      AllocaToPaddedAllocaMap[AI] = NewAI;
+    }
+  }
+
+  if (!AllocaToPaddedAllocaMap.empty()) {
+    for (auto &BB : F)
+      for (auto &Inst : BB)
+        if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
+          if (auto *AI =
+                  dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
+            if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
+              DVI->setArgOperand(
+                  0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
+    for (auto &P : AllocaToPaddedAllocaMap)
+      P.first->eraseFromParent();
+  }
+
   // If we split the entry block, move any allocas that were originally in the
   // entry block back into the entry block so that they aren't treated as
   // dynamic allocas.
Index: llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -304,17 +304,82 @@
             .addReg(Reg)
             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
         *STI);
-    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+    MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::Bcc)
             .addImm(AArch64CC::NE)
-            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+            .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
         *STI);
+    MCSymbol *ReturnSym = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(ReturnSym);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
 
-    OutStreamer->EmitLabel(HandleMismatchSym);
+    OutStreamer->EmitLabel(HandlePartialSym);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+                                     .addReg(AArch64::WZR)
+                                     .addReg(AArch64::W16)
+                                     .addImm(15)
+                                     .addImm(0),
+                                 *STI);
+    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::HI)
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ANDXri)
+            .addReg(AArch64::X17)
+            .addReg(Reg)
+            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+        *STI);
+    size_t Size = 1 << (AccessInfo & 0xf);
+    if (Size != 1)
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+                                       .addReg(AArch64::X17)
+                                       .addReg(AArch64::X17)
+                                       .addImm(Size - 1)
+                                       .addImm(0),
+                                   *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+                                     .addReg(AArch64::WZR)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::W17)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::LS)
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ORRXri)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+        *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::X16)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::SUBSXrs)
+            .addReg(AArch64::XZR)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+        *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::EQ)
+            .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+        *STI);
 
+    OutStreamer->EmitLabel(HandleMismatchSym);
     OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
                                      .addReg(AArch64::SP)
                                      .addReg(AArch64::X0)
Index: compiler-rt/test/hwasan/TestCases/tail-magic.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/tail-magic.c
+++ compiler-rt/test/hwasan/TestCases/tail-magic.c
@@ -10,19 +10,25 @@
 #include <stdio.h>
 #include <sanitizer/hwasan_interface.h>
 
-static volatile void *sink;
+static volatile char *sink;
+
+// Overwrite the tail in a non-hwasan function so that we don't detect the
+// stores as OOB.
+__attribute__((no_sanitize("hwaddress"))) void overwrite_tail() {
+  sink[20] = 0x42;
+  sink[24] = 0x66;
+}
 
 int main(int argc, char **argv) {
   __hwasan_enable_allocator_tagging();
 
   char *p = (char*)malloc(20);
-  sink = p;
-  p[20] = 0x42;
-  p[24] = 0x66;
+  sink = (char *)((uintptr_t)(p) & 0xffffffffffffff);
+  overwrite_tail();
   free(p);
 // CHECK: ERROR: HWAddressSanitizer: alocation-tail-overwritten; heap object [{{.*}}) of size 20
 // CHECK: in main {{.*}}tail-magic.c:[[@LINE-2]]
 // CHECK: allocated here:
-// CHECK: in main {{.*}}tail-magic.c:[[@LINE-8]]
+// CHECK: in main {{.*}}tail-magic.c:[[@LINE-7]]
 // CHECK: Tail contains: .. .. .. .. 42 {{.. .. ..}} 66
 }
Index: compiler-rt/test/hwasan/TestCases/stack-oob.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/stack-oob.c
+++ compiler-rt/test/hwasan/TestCases/stack-oob.c
@@ -1,3 +1,4 @@
+// RUN: %clang_hwasan -DSIZE=15 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clang_hwasan -DSIZE=16 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clang_hwasan -DSIZE=64 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clang_hwasan -DSIZE=0x1000 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
@@ -17,7 +18,7 @@
 int main() {
   return f();
   // CHECK: READ of size 1 at
-  // CHECK: #0 {{.*}} in f{{.*}}stack-oob.c:14
+  // CHECK: #0 {{.*}} in f{{.*}}stack-oob.c:15
 
   // CHECK: is located in stack of threa
 
Index: compiler-rt/test/hwasan/TestCases/random-align-right.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/random-align-right.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// Tests malloc_align_right=1 and 8 (randomly aligning right).
-// RUN: %clang_hwasan  %s -o %t
-//
-// RUN: %run %t 20
-// RUN: %run %t 30
-// RUN: %env_hwasan_opts=malloc_align_right=1 not %run %t 20 2>&1 | FileCheck %s --check-prefix=CHECK20
-// RUN: %env_hwasan_opts=malloc_align_right=1 not %run %t 30 2>&1 | FileCheck %s --check-prefix=CHECK30
-// RUN: %env_hwasan_opts=malloc_align_right=8 not %run %t 30 2>&1 | FileCheck %s --check-prefix=CHECK30
-
-// REQUIRES: stable-runtime
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <sanitizer/hwasan_interface.h>
-
-static volatile void *sink;
-
-int main(int argc, char **argv) {
-  __hwasan_enable_allocator_tagging();
-  int index = atoi(argv[1]);
-
-  // Perform 1000 buffer overflows within the 16-byte granule,
-  // so that random right-alignment has a very high chance of
-  // catching at least one of them.
-  for (int i = 0; i < 1000; i++) {
-    char *p = (char*)malloc(20);
-    sink = p;
-    p[index] = 0;
-// index=20 requires malloc_align_right=1 to catch
-// CHECK20: HWAddressSanitizer: tag-mismatch
-// index=30 requires malloc_align_right={1,8} to catch
-// CHECK30: HWAddressSanitizer: tag-mismatch
-  }
-}
-
Index: compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
+++ compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
@@ -1,21 +1,13 @@
 // RUN: %clang_hwasan  %s -o %t
-// RUN:                                       not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40-LEFT
-// RUN: %env_hwasan_opts=malloc_align_right=2 not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40-RIGHT
-// RUN:                                       not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80-LEFT
-// RUN: %env_hwasan_opts=malloc_align_right=2 not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80-RIGHT
+// RUN: not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40
+// RUN: not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80
 // RUN: not %run %t -30 2>&1 | FileCheck %s --check-prefix=CHECKm30
 // RUN: not %run %t -30 1000000 2>&1 | FileCheck %s --check-prefix=CHECKMm30
 // RUN: not %run %t 1000000 1000000 2>&1 | FileCheck %s --check-prefix=CHECKM
 
 // Test OOB within the granule.
-// Misses the bug when malloc is left-aligned, catches it otherwise.
-// RUN:                                           %run %t 31
-// RUN: %env_hwasan_opts=malloc_align_right=2 not %run %t 31 2>&1 | FileCheck %s --check-prefix=CHECK31
-
-// RUN:                                           %run %t 30 20
-// RUN: %env_hwasan_opts=malloc_align_right=9 not %run %t 30 20 2>&1 | FileCheck %s --check-prefix=CHECK20-RIGHT8
-
-// RUN: %env_hwasan_opts=malloc_align_right=42 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WRONG-FLAG
+// RUN: not %run %t 31 2>&1 | FileCheck %s --check-prefix=CHECK31
+// RUN: not %run %t 30 20 2>&1 | FileCheck %s --check-prefix=CHECK20
 
 // REQUIRES: stable-runtime
 
@@ -33,15 +25,11 @@
   fprintf(stderr, "base: %p access: %p\n", x, &x[offset]);
   sink = x[offset];
 
-// CHECK40-LEFT: allocated heap chunk; size: 32 offset: 8
-// CHECK40-LEFT: is located 10 bytes to the right of 30-byte region
-// CHECK40-RIGHT: allocated heap chunk; size: 32 offset:
-// CHECK40-RIGHT: is located 10 bytes to the right of 30-byte region
+// CHECK40: allocated heap chunk; size: 32 offset: 8
+// CHECK40: is located 10 bytes to the right of 30-byte region
 //
-// CHECK80-LEFT: allocated heap chunk; size: 32 offset: 16
-// CHECK80-LEFT: is located 50 bytes to the right of 30-byte region
-// CHECK80-RIGHT: allocated heap chunk; size: 32 offset:
-// CHECK80-RIGHT: is located 50 bytes to the right of 30-byte region
+// CHECK80: allocated heap chunk; size: 32 offset: 16
+// CHECK80: is located 50 bytes to the right of 30-byte region
 //
 // CHECKm30: is located 30 bytes to the left of 30-byte region
 //
@@ -51,10 +39,13 @@
 // CHECKM: is a large allocated heap chunk; size: 1003520 offset: 1000000
 // CHECKM: is located 0 bytes to the right of 1000000-byte region
 //
+// CHECK31: tags: [[TAG:..]]/0e (ptr/mem)
 // CHECK31: is located 1 bytes to the right of 30-byte region
+// CHECK31: Memory tags around the buggy address
+// CHECK31: [0e]
+// CHECK31: Tags for short granules around the buggy address
+// CHECK31: {{\[}}[[TAG]]]
 //
-// CHECK20-RIGHT8: is located 10 bytes to the right of 20-byte region [0x{{.*}}8,0x{{.*}}c)
-//
-// CHECK-WRONG-FLAG: ERROR: unsupported value of malloc_align_right flag: 42
+// CHECK20: is located 10 bytes to the right of 20-byte region [0x{{.*}}0,0x{{.*}}4)
   free(x);
 }
Index: compiler-rt/lib/hwasan/hwasan_report.h
===================================================================
--- compiler-rt/lib/hwasan/hwasan_report.h
+++ compiler-rt/lib/hwasan/hwasan_report.h
@@ -25,7 +25,7 @@
                        bool is_store, bool fatal, uptr *registers_frame);
 void ReportInvalidFree(StackTrace *stack, uptr addr);
 void ReportTailOverwritten(StackTrace *stack, uptr addr, uptr orig_size,
-                           uptr tail_size, const u8 *expected);
+                           const u8 *expected);
 void ReportRegisters(uptr *registers_frame, uptr pc);
 void ReportAtExitStatistics();
 
Index: compiler-rt/lib/hwasan/hwasan_report.cpp
===================================================================
--- compiler-rt/lib/hwasan/hwasan_report.cpp
+++ compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -208,6 +208,19 @@
   }
 }
 
+// Returns true if tag == *tag_ptr, reading tags from short granules if
+// necessary. This may return a false positive if tags 1-15 are used as a
+// regular tag rather than a short granule marker.
+static bool TagsEqual(tag_t tag, tag_t *tag_ptr) {
+  if (tag == *tag_ptr)
+    return true;
+  if (*tag_ptr == 0 || *tag_ptr > kShadowAlignment - 1)
+    return false;
+  uptr mem = ShadowToMem(reinterpret_cast<uptr>(tag_ptr));
+  tag_t inline_tag = *reinterpret_cast<tag_t *>(mem + kShadowAlignment - 1);
+  return tag == inline_tag;
+}
+
 void PrintAddressDescription(
     uptr tagged_addr, uptr access_size,
     StackAllocationsRingBuffer *current_stack_allocations) {
@@ -235,39 +248,36 @@
   // check the allocator if it has a live chunk there.
   tag_t addr_tag = GetTagFromPointer(tagged_addr);
   tag_t *tag_ptr = reinterpret_cast<tag_t*>(MemToShadow(untagged_addr));
-  if (*tag_ptr != addr_tag) { // should be true usually.
-    tag_t *left = tag_ptr, *right = tag_ptr;
-    // scan left.
-    for (int i = 0; i < 1000 && *left == *tag_ptr; i++, left--){}
-    // scan right.
-    for (int i = 0; i < 1000 && *right == *tag_ptr; i++, right++){}
-    // Chose the object that has addr_tag and that is closer to addr.
-    tag_t *candidate = nullptr;
-    if (*right == addr_tag && *left == addr_tag)
-      candidate = right - tag_ptr < tag_ptr - left ? right : left;
-    else if (*right == addr_tag)
-      candidate = right;
-    else if (*left == addr_tag)
+  tag_t *candidate = nullptr, *left = tag_ptr, *right = tag_ptr;
+  for (int i = 0; i < 1000; i++) {
+    if (TagsEqual(addr_tag, left)) {
       candidate = left;
+      break;
+    }
+    --left;
+    if (TagsEqual(addr_tag, right)) {
+      candidate = right;
+      break;
+    }
+    ++right;
+  }
 
-    if (candidate) {
-      uptr mem = ShadowToMem(reinterpret_cast<uptr>(candidate));
-      HwasanChunkView chunk = FindHeapChunkByAddress(mem);
-      if (chunk.IsAllocated()) {
-        Printf("%s", d.Location());
-        Printf(
-            "%p is located %zd bytes to the %s of %zd-byte region [%p,%p)\n",
-            untagged_addr,
-            candidate == left ? untagged_addr - chunk.End()
-            : chunk.Beg() - untagged_addr,
-            candidate == right ? "left" : "right", chunk.UsedSize(),
-            chunk.Beg(), chunk.End());
-        Printf("%s", d.Allocation());
-        Printf("allocated here:\n");
-        Printf("%s", d.Default());
-        GetStackTraceFromId(chunk.GetAllocStackId()).Print();
-        num_descriptions_printed++;
-      }
+  if (candidate) {
+    uptr mem = ShadowToMem(reinterpret_cast<uptr>(candidate));
+    HwasanChunkView chunk = FindHeapChunkByAddress(mem);
+    if (chunk.IsAllocated()) {
+      Printf("%s", d.Location());
+      Printf("%p is located %zd bytes to the %s of %zd-byte region [%p,%p)\n",
+             untagged_addr,
+             candidate == left ? untagged_addr - chunk.End()
+                               : chunk.Beg() - untagged_addr,
+             candidate == left ? "right" : "left", chunk.UsedSize(),
+             chunk.Beg(), chunk.End());
+      Printf("%s", d.Allocation());
+      Printf("allocated here:\n");
+      Printf("%s", d.Default());
+      GetStackTraceFromId(chunk.GetAllocStackId()).Print();
+      num_descriptions_printed++;
     }
   }
 
@@ -325,13 +335,10 @@
 
 void ReportStats() {}
 
-static void PrintTagsAroundAddr(tag_t *tag_ptr) {
-  Printf(
-      "Memory tags around the buggy address (one tag corresponds to %zd "
-      "bytes):\n", kShadowAlignment);
-
+static void PrintTagInfoAroundAddr(tag_t *tag_ptr, uptr num_rows,
+                                   void (*print_tag)(InternalScopedString &s,
+                                                     tag_t *tag)) {
   const uptr row_len = 16;  // better be power of two.
-  const uptr num_rows = 17;
   tag_t *center_row_beg = reinterpret_cast<tag_t *>(
       RoundDownTo(reinterpret_cast<uptr>(tag_ptr), row_len));
   tag_t *beg_row = center_row_beg - row_len * (num_rows / 2);
@@ -341,7 +348,7 @@
     s.append("%s", row == center_row_beg ? "=>" : "  ");
     for (uptr i = 0; i < row_len; i++) {
       s.append("%s", row + i == tag_ptr ? "[" : " ");
-      s.append("%02x", row[i]);
+      print_tag(s, &row[i]);
       s.append("%s", row + i == tag_ptr ? "]" : " ");
     }
     s.append("%s\n", row == center_row_beg ? "<=" : "  ");
@@ -349,6 +356,34 @@
   Printf("%s", s.data());
 }
 
+static void PrintTagsAroundAddr(tag_t *tag_ptr) {
+  Printf(
+      "Memory tags around the buggy address (one tag corresponds to %zd "
+      "bytes):\n", kShadowAlignment);
+  PrintTagInfoAroundAddr(tag_ptr, 17, [](InternalScopedString &s, tag_t *tag) {
+    s.append("%02x", *tag);
+  });
+
+  Printf(
+      "Tags for short granules around the buggy address (one tag corresponds "
+      "to %zd bytes):\n",
+      kShadowAlignment);
+  PrintTagInfoAroundAddr(tag_ptr, 3, [](InternalScopedString &s, tag_t *tag) {
+    if (*tag >= 1 && *tag <= kShadowAlignment) {
+      uptr granule_addr = ShadowToMem(reinterpret_cast<uptr>(tag));
+      s.append("%02x",
+               *reinterpret_cast<u8 *>(granule_addr + kShadowAlignment - 1));
+    } else {
+      s.append("..");
+    }
+  });
+  Printf(
+      "See "
+      "https://clang.llvm.org/docs/";
+      "HardwareAssistedAddressSanitizerDesign.html#short-granules for a "
+      "description of short granule tags\n");
+}
+
 void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) {
   ScopedReport R(flags()->halt_on_error);
 
@@ -376,7 +411,8 @@
 }
 
 void ReportTailOverwritten(StackTrace *stack, uptr tagged_addr, uptr orig_size,
-                           uptr tail_size, const u8 *expected) {
+                           const u8 *expected) {
+  uptr tail_size = kShadowAlignment - (orig_size % kShadowAlignment);
   ScopedReport R(flags()->halt_on_error);
   Decorator d;
   uptr untagged_addr = UntagAddr(tagged_addr);
@@ -420,11 +456,9 @@
     "to the right of a heap object, but within the %zd-byte granule, e.g.\n"
     "   char *x = new char[20];\n"
     "   x[25] = 42;\n"
-    "By default %s does not detect such bugs at the time of write,\n"
-    "but can detect them at the time of free/delete.\n"
-    "To disable this feature set HWASAN_OPTIONS=free_checks_tail_magic=0;\n"
-    "To enable checking at the time of access, set "
-    "HWASAN_OPTIONS=malloc_align_right to non-zero\n\n",
+    "%s does not detect such bugs in uninstrumented code at the time of write,"
+    "\nbut can detect them at the time of free/delete.\n"
+    "To disable this feature set HWASAN_OPTIONS=free_checks_tail_magic=0\n",
     kShadowAlignment, SanitizerToolName);
   Printf("%s", s.data());
   GetCurrentThread()->Announce();
Index: compiler-rt/lib/hwasan/hwasan_flags.inc
===================================================================
--- compiler-rt/lib/hwasan/hwasan_flags.inc
+++ compiler-rt/lib/hwasan/hwasan_flags.inc
@@ -37,32 +37,6 @@
     "HWASan allocator flag. max_malloc_fill_size is the maximal amount of "
     "bytes that will be filled with malloc_fill_byte on malloc.")
 
-// Rules for malloc alignment on aarch64:
-//   * If the size is 16-aligned, then malloc should return 16-aligned memory.
-//   * Otherwise, malloc should return 8-alignment memory.
-// So,
-//   * If the size is 16-aligned, we don't need to do anything.
-//   * Otherwise we don't have to obey 16-alignment, just the 8-alignment.
-//   * We may want to break the 8-alignment rule to catch more buffer overflows
-//     but this will break valid code in some rare cases, like this:
-//     struct Foo {
-//       // accessed via atomic instructions that require 8-alignment.
-//       std::atomic<int64_t> atomic_stuff;
-//       ...
-//       char vla[1];  // the actual size of vla could be anything.
-//     }
-// Which means that the safe values for malloc_align_right are 0, 8, 9,
-// and the values 1 and 2 may require changes in otherwise valid code.
-
-HWASAN_FLAG(
-    int, malloc_align_right, 0,  // off by default
-    "HWASan allocator flag. "
-    "0 (default): allocations are always aligned left to 16-byte boundary; "
-    "1: allocations are sometimes aligned right to 1-byte boundary (risky); "
-    "2: allocations are always aligned right to 1-byte boundary (risky); "
-    "8: allocations are sometimes aligned right to 8-byte boundary; "
-    "9: allocations are always aligned right to 8-byte boundary."
-  )
 HWASAN_FLAG(bool, free_checks_tail_magic, 1,
     "If set, free() will check the magic values "
     "to the right of the allocated object "
Index: compiler-rt/lib/hwasan/hwasan_checks.h
===================================================================
--- compiler-rt/lib/hwasan/hwasan_checks.h
+++ compiler-rt/lib/hwasan/hwasan_checks.h
@@ -61,15 +61,29 @@
   // __builtin_unreachable();
 }
 
+__attribute__((always_inline, nodebug)) static bool PossiblyShortTagMatches(
+    tag_t mem_tag, uptr ptr, uptr sz) {
+  tag_t ptr_tag = GetTagFromPointer(ptr);
+  if (ptr_tag == mem_tag)
+    return true;
+  if (mem_tag > 15)
+    return false;
+  if ((ptr & 15) + sz > mem_tag)
+    return false;
+#ifndef __aarch64__
+  ptr = UntagAddr(ptr);
+#endif
+  return *(u8 *)(ptr | 15) == ptr_tag;
+}
+
 enum class ErrorAction { Abort, Recover };
 enum class AccessType { Load, Store };
 
 template <ErrorAction EA, AccessType AT, unsigned LogSize>
 __attribute__((always_inline, nodebug)) static void CheckAddress(uptr p) {
-  tag_t ptr_tag = GetTagFromPointer(p);
   uptr ptr_raw = p & ~kAddressTagMask;
   tag_t mem_tag = *(tag_t *)MemToShadow(ptr_raw);
-  if (UNLIKELY(ptr_tag != mem_tag)) {
+  if (UNLIKELY(!PossiblyShortTagMatches(mem_tag, p, 1 << LogSize))) {
     SigTrap<0x20 * (EA == ErrorAction::Recover) +
             0x10 * (AT == AccessType::Store) + LogSize>(p);
     if (EA == ErrorAction::Abort)
@@ -85,15 +99,25 @@
   tag_t ptr_tag = GetTagFromPointer(p);
   uptr ptr_raw = p & ~kAddressTagMask;
   tag_t *shadow_first = (tag_t *)MemToShadow(ptr_raw);
-  tag_t *shadow_last = (tag_t *)MemToShadow(ptr_raw + sz - 1);
-  for (tag_t *t = shadow_first; t <= shadow_last; ++t)
+  tag_t *shadow_last = (tag_t *)MemToShadow(ptr_raw + sz);
+  for (tag_t *t = shadow_first; t < shadow_last; ++t)
     if (UNLIKELY(ptr_tag != *t)) {
       SigTrap<0x20 * (EA == ErrorAction::Recover) +
               0x10 * (AT == AccessType::Store) + 0xf>(p, sz);
       if (EA == ErrorAction::Abort)
         __builtin_unreachable();
     }
+  uptr end = p + sz;
+  uptr tail_sz = end & 0xf;
+  if (UNLIKELY(tail_sz != 0 && !PossiblyShortTagMatches(
+                                   *shadow_last, end & ~0xfull, tail_sz))) {
+    SigTrap<0x20 * (EA == ErrorAction::Recover) +
+            0x10 * (AT == AccessType::Store) + 0xf>(p, sz);
+    if (EA == ErrorAction::Abort)
+      __builtin_unreachable();
+  }
 }
+
 }  // end namespace __hwasan
 
 #endif  // HWASAN_CHECKS_H
Index: compiler-rt/lib/hwasan/hwasan_allocator.cpp
===================================================================
--- compiler-rt/lib/hwasan/hwasan_allocator.cpp
+++ compiler-rt/lib/hwasan/hwasan_allocator.cpp
@@ -16,6 +16,7 @@
 #include "sanitizer_common/sanitizer_stackdepot.h"
 #include "hwasan.h"
 #include "hwasan_allocator.h"
+#include "hwasan_checks.h"
 #include "hwasan_mapping.h"
 #include "hwasan_malloc_bisect.h"
 #include "hwasan_thread.h"
@@ -42,13 +43,8 @@
   kRightAlignAlways
 };
 
-// These two variables are initialized from flags()->malloc_align_right
-// in HwasanAllocatorInit and are never changed afterwards.
-static RightAlignMode right_align_mode = kRightAlignNever;
-static bool right_align_8 = false;
-
 // Initialized in HwasanAllocatorInit, an never changed.
-static ALIGNED(16) u8 tail_magic[kShadowAlignment];
+static ALIGNED(16) u8 tail_magic[kShadowAlignment - 1];
 
 bool HwasanChunkView::IsAllocated() const {
   return metadata_ && metadata_->alloc_context_id && metadata_->requested_size;
@@ -58,8 +54,6 @@
 static uptr AlignRight(uptr addr, uptr requested_size) {
   uptr tail_size = requested_size % kShadowAlignment;
   if (!tail_size) return addr;
-  if (right_align_8)
-    return tail_size > 8 ? addr : addr + 8;
   return addr + kShadowAlignment - tail_size;
 }
 
@@ -95,30 +89,7 @@
                        !flags()->disable_allocator_tagging);
   SetAllocatorMayReturnNull(common_flags()->allocator_may_return_null);
   allocator.Init(common_flags()->allocator_release_to_os_interval_ms);
-  switch (flags()->malloc_align_right) {
-    case 0: break;
-    case 1:
-      right_align_mode = kRightAlignSometimes;
-      right_align_8 = false;
-      break;
-    case 2:
-      right_align_mode = kRightAlignAlways;
-      right_align_8 = false;
-      break;
-    case 8:
-      right_align_mode = kRightAlignSometimes;
-      right_align_8 = true;
-      break;
-    case 9:
-      right_align_mode = kRightAlignAlways;
-      right_align_8 = true;
-      break;
-    default:
-      Report("ERROR: unsupported value of malloc_align_right flag: %d\n",
-             flags()->malloc_align_right);
-      Die();
-  }
-  for (uptr i = 0; i < kShadowAlignment; i++)
+  for (uptr i = 0; i < sizeof(tail_magic); i++)
     tail_magic[i] = GetCurrentThread()->GenerateRandomTag();
 }
 
@@ -172,9 +143,10 @@
     uptr fill_size = Min(size, (uptr)flags()->max_malloc_fill_size);
     internal_memset(allocated, flags()->malloc_fill_byte, fill_size);
   }
-  if (!right_align_mode)
+  if (size != orig_size) {
     internal_memcpy(reinterpret_cast<u8 *>(allocated) + orig_size, tail_magic,
-                    size - orig_size);
+                    size - orig_size - 1);
+  }
 
   void *user_ptr = allocated;
   // Tagging can only be skipped when both tag_in_malloc and tag_in_free are
@@ -185,16 +157,14 @@
     tag_t tag = flags()->tag_in_malloc && malloc_bisect(stack, orig_size)
                     ? (t ? t->GenerateRandomTag() : kFallbackAllocTag)
                     : 0;
-    user_ptr = (void *)TagMemoryAligned((uptr)user_ptr, size, tag);
-  }
-
-  if ((orig_size % kShadowAlignment) && (alignment <= kShadowAlignment) &&
-      right_align_mode) {
-    uptr as_uptr = reinterpret_cast<uptr>(user_ptr);
-    if (right_align_mode == kRightAlignAlways ||
-        GetTagFromPointer(as_uptr) & 1) {  // use a tag bit as a random bit.
-      user_ptr = reinterpret_cast<void *>(AlignRight(as_uptr, orig_size));
-      meta->right_aligned = 1;
+    uptr tag_size = orig_size ? orig_size : 1;
+    uptr full_granule_size = RoundDownTo(tag_size, kShadowAlignment);
+    user_ptr = (void *)TagMemoryAligned((uptr)user_ptr, full_granule_size, tag);
+    if (full_granule_size != tag_size) {
+      u8 *short_granule = reinterpret_cast<u8 *>(allocated) + full_granule_size;
+      TagMemoryAligned((uptr)short_granule, kShadowAlignment,
+                       tag_size % kShadowAlignment);
+      short_granule[kShadowAlignment - 1] = tag;
     }
   }
 
@@ -204,10 +174,10 @@
 
 static bool PointerAndMemoryTagsMatch(void *tagged_ptr) {
   CHECK(tagged_ptr);
-  tag_t ptr_tag = GetTagFromPointer(reinterpret_cast<uptr>(tagged_ptr));
+  uptr tagged_uptr = reinterpret_cast<uptr>(tagged_ptr);
   tag_t mem_tag = *reinterpret_cast<tag_t *>(
       MemToShadow(reinterpret_cast<uptr>(UntagPtr(tagged_ptr))));
-  return ptr_tag == mem_tag;
+  return PossiblyShortTagMatches(mem_tag, tagged_uptr, 1);
 }
 
 static void HwasanDeallocate(StackTrace *stack, void *tagged_ptr) {
@@ -228,14 +198,15 @@
 
   // Check tail magic.
   uptr tagged_size = TaggedSize(orig_size);
-  if (flags()->free_checks_tail_magic && !right_align_mode && orig_size) {
-    uptr tail_size = tagged_size - orig_size;
+  if (flags()->free_checks_tail_magic && orig_size &&
+      tagged_size != orig_size) {
+    uptr tail_size = tagged_size - orig_size - 1;
     CHECK_LT(tail_size, kShadowAlignment);
     void *tail_beg = reinterpret_cast<void *>(
         reinterpret_cast<uptr>(aligned_ptr) + orig_size);
     if (tail_size && internal_memcmp(tail_beg, tail_magic, tail_size))
       ReportTailOverwritten(stack, reinterpret_cast<uptr>(tagged_ptr),
-                            orig_size, tail_size, tail_magic);
+                            orig_size, tail_magic);
   }
 
   meta->requested_size = 0;
Index: clang/docs/HardwareAssistedAddressSanitizerDesign.rst
===================================================================
--- clang/docs/HardwareAssistedAddressSanitizerDesign.rst
+++ clang/docs/HardwareAssistedAddressSanitizerDesign.rst
@@ -38,6 +38,30 @@
 
 For a more detailed discussion of this approach see https://arxiv.org/pdf/1802.09517.pdf
 
+Short granules
+--------------
+
+A short granule is a granule of size between 1 and `TG-1` bytes. The size
+of a short granule is stored at the location in shadow memory where the
+granule's tag is normally stored, while the granule's actual tag is stored
+in the last byte of the granule. This means that in order to verify that a
+pointer tag matches a memory tag, HWASAN must check for two possibilities:
+
+* the pointer tag is equal to the memory tag in shadow memory, or
+* the shadow memory tag is actually a short granule size, the value being loaded
+  is in bounds of the granule and the pointer tag is equal to the last byte of
+  the granule.
+
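+For example, with `TG` = 16, a 20-byte heap allocation that is given tag
+`0xab` (an arbitrary value chosen for illustration) covers two granules:
+
+.. code-block:: none
+
+  granule 0 (bytes  0..15): shadow byte = 0xab  (full granule)
+  granule 1 (bytes 16..31): shadow byte = 0x04  (short granule, 4 bytes used)
+                            byte 15 of the granule holds the real tag 0xab
+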
+Pointer tags between 1 and `TG-1` are possible and are as likely as any other
+tag. This means that these tags in memory have two interpretations: the full
+tag interpretation (where the pointer tag is between 1 and `TG-1` and the
+last byte of the granule is ordinary data) and the short tag interpretation
+(where the pointer tag is stored in the last byte of the granule).
+
+When HWASAN detects an error near a memory tag between 1 and `TG-1`, it
+will show both the memory tag and the last byte of the granule. Currently,
+it is up to the user to disambiguate the two possibilities.
+
 Instrumentation
 ===============
 
@@ -46,24 +70,40 @@
 All memory accesses are prefixed with an inline instruction sequence that
 verifies the tags. Currently, the following sequence is used:
 
-
 .. code-block:: none
 
   // int foo(int *a) { return *a; }
-  // clang -O2 --target=aarch64-linux -fsanitize=hwaddress -c load.c
+  // clang -O2 --target=aarch64-linux -fsanitize=hwaddress -fsanitize-recover=hwaddress -c load.c
   foo:
-       0:	08 00 00 90 	adrp	x8, 0 <__hwasan_shadow>
-       4:	08 01 40 f9 	ldr	x8, [x8]          // shadow base (to be resolved by the loader)
-       8:	09 dc 44 d3 	ubfx	x9, x0, #4, #52 // shadow offset
-       c:	28 69 68 38 	ldrb	w8, [x9, x8]    // load shadow tag
-      10:	09 fc 78 d3 	lsr	x9, x0, #56       // extract address tag
-      14:	3f 01 08 6b 	cmp	w9, w8            // compare tags
-      18:	61 00 00 54 	b.ne	24              // jump on mismatch
-      1c:	00 00 40 b9 	ldr	w0, [x0]          // original load
-      20:	c0 03 5f d6 	ret
-      24:	40 20 21 d4 	brk	#0x902            // trap
+       0:	90000008 	adrp	x8, 0 <__hwasan_shadow>
+       4:	f9400108 	ldr	x8, [x8]         // shadow base (to be resolved by the loader)
+       8:	d344dc09 	ubfx	x9, x0, #4, #52  // shadow offset
+       c:	38696909 	ldrb	w9, [x8, x9]     // load shadow tag
+      10:	d378fc08 	lsr	x8, x0, #56      // extract address tag
+      14:	6b09011f 	cmp	w8, w9           // compare tags
+      18:	54000061 	b.ne	24 <foo+0x24>    // jump to short tag handler on mismatch
+      1c:	b9400000 	ldr	w0, [x0]         // original load
+      20:	d65f03c0 	ret
+      24:	7100413f 	cmp	w9, #0x10        // is this a short tag?
+      28:	54000142 	b.cs	50 <foo+0x50>    // if not, trap
+      2c:	12000c0a 	and	w10, w0, #0xf    // find the address's position in the short granule
+      30:	11000d4a 	add	w10, w10, #0x3   // adjust to the position of the last byte loaded
+      34:	6b09015f 	cmp	w10, w9          // check that position is in bounds
+      38:	540000c2 	b.cs	50 <foo+0x50>    // if not, trap
+      3c:	9240dc09 	and	x9, x0, #0xffffffffffffff
+      40:	b2400d29 	orr	x9, x9, #0xf     // compute address of last byte of granule
+      44:	39400129 	ldrb	w9, [x9]         // load tag from it
+      48:	6b09011f 	cmp	w8, w9           // compare with pointer tag
+      4c:	54fffe80 	b.eq	1c <foo+0x1c>    // if so, continue
+      50:	d4212440 	brk	#0x922           // otherwise trap
+      54:	b9400000 	ldr	w0, [x0]         // tail duplicated original load (to handle recovery)
+      58:	d65f03c0 	ret
 
 Alternatively, memory accesses are prefixed with a function call.
+On AArch64, a function call is used by default in trapping mode. The code size
+and performance overhead of the call is reduced by using a custom calling
+convention that preserves most registers and is specialized to the register
+containing the address and the type and size of the memory access.
 
 Heap
 ----