pcc created this revision.
pcc added a reviewer: eugenis.
Herald added subscribers: Sanitizers, cfe-commits, jfb, hiraditya, javed.absar, 
kubamracek.
Herald added projects: clang, Sanitizers, LLVM.

A short granule is a granule of size between 1 and `TG-1` bytes. The size
of a short granule is stored at the location in shadow memory where the
granule's tag is normally stored, while the granule's actual tag is stored
in the last byte of the granule. This means that in order to verify that a
pointer tag matches a memory tag, HWASAN must check for two possibilities:

- the pointer tag is equal to the memory tag in shadow memory, or
- the shadow memory tag is actually a short granule size, the value being 
loaded is in bounds of the granule and the pointer tag is equal to the last 
byte of the granule.

Pointer tags between 1 to `TG-1` are possible and are as likely as any other
tag. This means that these tags in memory have two interpretations: the full
tag interpretation (where the pointer tag is between 1 and `TG-1` and the
last byte of the granule is ordinary data) and the short tag interpretation
(where the pointer tag is stored in the granule).

When HWASAN detects an error near a memory tag between 1 and `TG-1`, it
will show both the memory tag and the last byte of the granule. Currently,
it is up to the user to disambiguate the two possibilities.

Because this functionality obsoletes the right aligned heap feature of
the HWASAN memory allocator (and because we can no longer easily test
it), the feature is removed.

Also update the documentation to cover both short granule tags and
outlined checks.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D63908

Files:
  clang/docs/HardwareAssistedAddressSanitizerDesign.rst
  compiler-rt/lib/hwasan/hwasan_allocator.cpp
  compiler-rt/lib/hwasan/hwasan_checks.h
  compiler-rt/lib/hwasan/hwasan_flags.inc
  compiler-rt/lib/hwasan/hwasan_report.cpp
  compiler-rt/lib/hwasan/hwasan_report.h
  compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
  compiler-rt/test/hwasan/TestCases/random-align-right.c
  compiler-rt/test/hwasan/TestCases/stack-oob.c
  compiler-rt/test/hwasan/TestCases/tail-magic.c
  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
  llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
  llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
  llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
  llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
  llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll

Index: llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/kernel-alloca.ll
@@ -14,9 +14,10 @@
 ; CHECK: %[[B:[^ ]*]] = lshr i64 %[[A]], 20
 ; CHECK: %[[BASE_TAG:[^ ]*]] = xor i64 %[[A]], %[[B]]
 
-; CHECK: %[[X:[^ ]*]] = alloca i32, align 16
+; CHECK: %[[X:[^ ]*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK: %[[X_GEP:[^ ]*]] = getelementptr { i32, [12 x i8] }, { i32, [12 x i8] }* %[[X]], i32 0, i32 0
 ; CHECK: %[[X_TAG:[^ ]*]] = xor i64 %[[BASE_TAG]], 0
-; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[C:[^ ]*]] = shl i64 %[[X_TAG]], 56
 ; CHECK: %[[D:[^ ]*]] = or i64 %[[C]], 72057594037927935
 ; CHECK: %[[E:[^ ]*]] = and i64 %[[X1]], %[[D]]
Index: llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -28,11 +28,35 @@
 ; RECOVER-ZERO-BASED-SHADOW: %[[E:[^ ]*]] = inttoptr i64 %[[D]] to i8*
 ; RECOVER: %[[MEMTAG:[^ ]*]] = load i8, i8* %[[E]]
 ; RECOVER: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
-; RECOVER: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
+; RECOVER: br i1 %[[F]], label %[[MISMATCH:[0-9]*]], label %[[CONT:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[MISMATCH]]:
+; RECOVER: %[[NOTSHORT:[^ ]*]] = icmp ugt i8 %[[MEMTAG]], 15
+; RECOVER: br i1 %[[NOTSHORT]], label %[[FAIL:[0-9]*]], label %[[SHORT:[0-9]*]], !prof {{.*}}
 
+; RECOVER: [[FAIL]]:
 ; RECOVER: call void asm sideeffect "brk #2336", "{x0}"(i64 %[[A]])
 ; RECOVER: br label
 
+; RECOVER: [[SHORT]]:
+; RECOVER: %[[LOWBITS:[^ ]*]] = and i64 %[[A]], 15
+; RECOVER: %[[LOWBITS_I8:[^ ]*]] = trunc i64 %[[LOWBITS]] to i8
+; RECOVER: %[[LAST:[^ ]*]] = add i8 %[[LOWBITS_I8]], 0
+; RECOVER: %[[OOB:[^ ]*]] = icmp uge i8 %[[LAST]], %[[MEMTAG]]
+; RECOVER: br i1 %[[OOB]], label %[[FAIL]], label %[[INBOUNDS:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[INBOUNDS]]:
+; RECOVER: %[[EOG_ADDR:[^ ]*]] = or i64 %[[C]], 15
+; RECOVER: %[[EOG_PTR:[^ ]*]] = inttoptr i64 %[[EOG_ADDR]] to i8*
+; RECOVER: %[[EOGTAG:[^ ]*]] = load i8, i8* %[[EOG_PTR]]
+; RECOVER: %[[EOG_MISMATCH:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[EOGTAG]]
+; RECOVER: br i1 %[[EOG_MISMATCH]], label %[[FAIL]], label %[[CONT1:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[CONT1]]:
+; RECOVER: br label %[[CONT]]
+
+; RECOVER: [[CONT]]:
+
 ; ABORT-DYNAMIC-SHADOW: call void @llvm.hwasan.check.memaccess(i8* %.hwasan.shadow, i8* %a, i32 0)
 ; ABORT-ZERO-BASED-SHADOW: call void @llvm.hwasan.check.memaccess(i8* null, i8* %a, i32 0)
 
@@ -55,11 +79,35 @@
 ; RECOVER-ZERO-BASED-SHADOW: %[[E:[^ ]*]] = inttoptr i64 %[[D]] to i8*
 ; RECOVER: %[[MEMTAG:[^ ]*]] = load i8, i8* %[[E]]
 ; RECOVER: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]]
-; RECOVER: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}}
+; RECOVER: br i1 %[[F]], label %[[MISMATCH:[0-9]*]], label %[[CONT:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[MISMATCH]]:
+; RECOVER: %[[NOTSHORT:[^ ]*]] = icmp ugt i8 %[[MEMTAG]], 15
+; RECOVER: br i1 %[[NOTSHORT]], label %[[FAIL:[0-9]*]], label %[[SHORT:[0-9]*]], !prof {{.*}}
 
+; RECOVER: [[FAIL]]:
 ; RECOVER: call void asm sideeffect "brk #2337", "{x0}"(i64 %[[A]])
 ; RECOVER: br label
 
+; RECOVER: [[SHORT]]:
+; RECOVER: %[[LOWBITS:[^ ]*]] = and i64 %[[A]], 15
+; RECOVER: %[[LOWBITS_I8:[^ ]*]] = trunc i64 %[[LOWBITS]] to i8
+; RECOVER: %[[LAST:[^ ]*]] = add i8 %[[LOWBITS_I8]], 1
+; RECOVER: %[[OOB:[^ ]*]] = icmp uge i8 %[[LAST]], %[[MEMTAG]]
+; RECOVER: br i1 %[[OOB]], label %[[FAIL]], label %[[INBOUNDS:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[INBOUNDS]]:
+; RECOVER: %[[EOG_ADDR:[^ ]*]] = or i64 %[[C]], 15
+; RECOVER: %[[EOG_PTR:[^ ]*]] = inttoptr i64 %[[EOG_ADDR]] to i8*
+; RECOVER: %[[EOGTAG:[^ ]*]] = load i8, i8* %[[EOG_PTR]]
+; RECOVER: %[[EOG_MISMATCH:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[EOGTAG]]
+; RECOVER: br i1 %[[EOG_MISMATCH]], label %[[FAIL]], label %[[CONT1:[0-9]*]], !prof {{.*}}
+
+; RECOVER: [[CONT1]]:
+; RECOVER: br label %[[CONT]]
+
+; RECOVER: [[CONT]]:
+
 ; ABORT: %[[A:[^ ]*]] = bitcast i16* %a to i8*
 ; ABORT-DYNAMIC-SHADOW: call void @llvm.hwasan.check.memaccess(i8* %.hwasan.shadow, i8* %[[A]], i32 1)
 ; ABORT-ZERO-BASED-SHADOW: call void @llvm.hwasan.check.memaccess(i8* null, i8* %[[A]], i32 1)
Index: llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll
@@ -16,24 +16,29 @@
 ; CHECK: %[[B:[^ ]*]] = lshr i64 %[[A]], 20
 ; CHECK: %[[BASE_TAG:[^ ]*]] = xor i64 %[[A]], %[[B]]
 
-; CHECK: %[[X:[^ ]*]] = alloca i32, align 16
+; CHECK: %[[X:[^ ]*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK: %[[X_GEP:[^ ]*]] = getelementptr { i32, [12 x i8] }, { i32, [12 x i8] }* %[[X]], i32 0, i32 0
 ; CHECK: %[[X_TAG:[^ ]*]] = xor i64 %[[BASE_TAG]], 0
-; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[X1:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[C:[^ ]*]] = shl i64 %[[X_TAG]], 56
 ; CHECK: %[[D:[^ ]*]] = or i64 %[[X1]], %[[C]]
 ; CHECK: %[[X_HWASAN:[^ ]*]] = inttoptr i64 %[[D]] to i32*
 
 ; CHECK: %[[X_TAG2:[^ ]*]] = trunc i64 %[[X_TAG]] to i8
-; CHECK: %[[E:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[E:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[F:[^ ]*]] = lshr i64 %[[E]], 4
 ; DYNAMIC-SHADOW: %[[X_SHADOW:[^ ]*]] = getelementptr i8, i8* %.hwasan.shadow, i64 %[[F]]
 ; ZERO-BASED-SHADOW: %[[X_SHADOW:[^ ]*]] = inttoptr i64 %[[F]] to i8*
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %[[X_SHADOW]], i8 %[[X_TAG2]], i64 1, i1 false)
+; CHECK: %[[X_SHADOW_GEP:[^ ]*]] = getelementptr i8, i8* %[[X_SHADOW]], i32 0
+; CHECK: store i8 4, i8* %[[X_SHADOW_GEP]]
+; CHECK: %[[X_I8:[^ ]*]] = bitcast i32* %[[X_GEP]] to i8*
+; CHECK: %[[X_I8_GEP:[^ ]*]] = getelementptr i8, i8* %[[X_I8]], i32 15
+; CHECK: store i8 %[[X_TAG2]], i8* %[[X_I8_GEP]]
 ; CHECK: call void @use32(i32* nonnull %[[X_HWASAN]])
 
 ; UAR-TAGS: %[[BASE_TAG_COMPL:[^ ]*]] = xor i64 %[[BASE_TAG]], 255
 ; UAR-TAGS: %[[X_TAG_UAR:[^ ]*]] = trunc i64 %[[BASE_TAG_COMPL]] to i8
-; CHECK: %[[E2:[^ ]*]] = ptrtoint i32* %[[X]] to i64
+; CHECK: %[[E2:[^ ]*]] = ptrtoint i32* %[[X_GEP]] to i64
 ; CHECK: %[[F2:[^ ]*]] = lshr i64 %[[E2]], 4
 ; DYNAMIC-SHADOW: %[[X_SHADOW2:[^ ]*]] = getelementptr i8, i8* %.hwasan.shadow, i64 %[[F2]]
 ; ZERO-BASED-SHADOW: %[[X_SHADOW2:[^ ]*]] = inttoptr i64 %[[F2]] to i8*
Index: llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
===================================================================
--- llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
+++ llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll
@@ -9,9 +9,10 @@
 
 define void @test_alloca() sanitize_hwaddress {
 ; CHECK-LABEL: @test_alloca(
+; CHECK: %[[GEP:[^ ]*]] = getelementptr { i32, [12 x i8] }, { i32, [12 x i8] }* %x, i32 0, i32 0
 ; CHECK: %[[T1:[^ ]*]] = call i8 @__hwasan_generate_tag()
 ; CHECK: %[[A:[^ ]*]] = zext i8 %[[T1]] to i64
-; CHECK: %[[B:[^ ]*]] = ptrtoint i32* %x to i64
+; CHECK: %[[B:[^ ]*]] = ptrtoint i32* %[[GEP]] to i64
 ; CHECK: %[[C:[^ ]*]] = shl i64 %[[A]], 56
 ; CHECK: or i64 %[[B]], %[[C]]
 
Index: llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
===================================================================
--- llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
+++ llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
@@ -40,8 +40,20 @@
 ; CHECK-NEXT: ldrb w16, [x9, x16]
 ; CHECK-NEXT: cmp x16, x0, lsr #56
 ; CHECK-NEXT: b.ne .Ltmp0
+; CHECK-NEXT: .Ltmp1:
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: cmp w16, #15
+; CHECK-NEXT: b.hi .Ltmp2
+; CHECK-NEXT: and x17, x0, #0xf
+; CHECK-NEXT: add x17, x17, #255
+; CHECK-NEXT: cmp w16, w17
+; CHECK-NEXT: b.ls .Ltmp2
+; CHECK-NEXT: orr x16, x0, #0xf
+; CHECK-NEXT: ldrb w16, [x16]
+; CHECK-NEXT: cmp x16, x0, lsr #56
+; CHECK-NEXT: b.eq .Ltmp1
+; CHECK-NEXT: .Ltmp2:
 ; CHECK-NEXT: stp x0, x1, [sp, #-256]!
 ; CHECK-NEXT: stp x29, x30, [sp, #232]
 ; CHECK-NEXT: mov x1, #456
@@ -58,9 +70,21 @@
 ; CHECK-NEXT: ubfx x16, x1, #4, #52
 ; CHECK-NEXT: ldrb w16, [x9, x16]
 ; CHECK-NEXT: cmp x16, x1, lsr #56
-; CHECK-NEXT: b.ne .Ltmp1
+; CHECK-NEXT: b.ne .Ltmp3
+; CHECK-NEXT: .Ltmp4:
 ; CHECK-NEXT: ret
-; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: cmp w16, #15
+; CHECK-NEXT: b.hi .Ltmp5
+; CHECK-NEXT: and x17, x1, #0xf
+; CHECK-NEXT: add x17, x17, #2047
+; CHECK-NEXT: cmp w16, w17
+; CHECK-NEXT: b.ls .Ltmp5
+; CHECK-NEXT: orr x16, x1, #0xf
+; CHECK-NEXT: ldrb w16, [x16]
+; CHECK-NEXT: cmp x16, x1, lsr #56
+; CHECK-NEXT: b.eq .Ltmp4
+; CHECK-NEXT: .Ltmp5:
 ; CHECK-NEXT: stp x0, x1, [sp, #-256]!
 ; CHECK-NEXT: stp x29, x30, [sp, #232]
 ; CHECK-NEXT: mov x0, x1
Index: llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
===================================================================
--- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -203,7 +203,7 @@
                                    Value **MaybeMask);
 
   bool isInterestingAlloca(const AllocaInst &AI);
-  bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag);
+  bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, bool Zero);
   Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
   Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
   bool instrumentStack(
@@ -621,10 +621,35 @@
   }
 
   Instruction *CheckTerm =
-      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
+      SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
                                 MDBuilder(*C).createBranchWeights(1, 100000));
 
   IRB.SetInsertPoint(CheckTerm);
+  Value *OutOfShortGranuleTagRange =
+      IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
+  Instruction *CheckFailTerm =
+      SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
+                                MDBuilder(*C).createBranchWeights(1, 100000));
+
+  IRB.SetInsertPoint(CheckTerm);
+  Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
+  PtrLowBits = IRB.CreateAdd(
+      PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
+  Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
+  SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
+                            MDBuilder(*C).createBranchWeights(1, 100000),
+                            nullptr, nullptr, CheckFailTerm->getParent());
+
+  IRB.SetInsertPoint(CheckTerm);
+  Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
+  InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
+  Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
+  Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
+  SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
+                            MDBuilder(*C).createBranchWeights(1, 100000),
+                            nullptr, nullptr, CheckFailTerm->getParent());
+
+  IRB.SetInsertPoint(CheckFailTerm);
   InlineAsm *Asm;
   switch (TargetTriple.getArch()) {
     case Triple::x86_64:
@@ -648,6 +673,8 @@
       report_fatal_error("unsupported architecture");
   }
   IRB.CreateCall(Asm, PtrLong);
+  if (Recover)
+    cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
 }
 
 void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
@@ -724,15 +751,17 @@
 }
 
 bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
-                                   Value *Tag) {
-  size_t Size = (getAllocaSizeInBytes(*AI) + Mapping.getAllocaAlignment() - 1) &
-                ~(Mapping.getAllocaAlignment() - 1);
+                                   Value *Tag, bool Zero) {
+  size_t Size = getAllocaSizeInBytes(*AI);
+  size_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment());
+  if (Zero)
+    Size = AlignedSize;
 
   Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
   if (ClInstrumentWithCalls) {
     IRB.CreateCall(HwasanTagMemoryFunc,
                    {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
-                    ConstantInt::get(IntptrTy, Size)});
+                    ConstantInt::get(IntptrTy, AlignedSize)});
   } else {
     size_t ShadowSize = Size >> Mapping.Scale;
     Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
@@ -742,7 +771,16 @@
     // FIXME: the interceptor is not as fast as real memset. Consider lowering
     // llvm.memset right here into either a sequence of stores, or a call to
     // hwasan_tag_memory.
-    IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+    if (ShadowSize)
+      IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+    if (Size != AlignedSize) {
+      IRB.CreateStore(
+          ConstantInt::get(Int8Ty, Size % Mapping.getAllocaAlignment()),
+          IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
+      IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
+                                   Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
+                                   AlignedSize - 1));
+    }
   }
   return true;
 }
@@ -1041,14 +1079,14 @@
       DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr));
     }
 
-    tagAlloca(IRB, AI, Tag);
+    tagAlloca(IRB, AI, Tag, false);
 
     for (auto RI : RetVec) {
       IRB.SetInsertPoint(RI);
 
       // Re-tag alloca memory with the special UAR tag.
       Value *Tag = getUARTag(IRB, StackTag);
-      tagAlloca(IRB, AI, Tag);
+      tagAlloca(IRB, AI, Tag, true);
     }
   }
 
@@ -1089,11 +1127,6 @@
     for (auto &Inst : BB) {
       if (ClInstrumentStack)
         if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
-          // Realign all allocas. We don't want small uninteresting allocas to
-          // hide in instrumented alloca's padding.
-          if (AI->getAlignment() < Mapping.getAllocaAlignment())
-            AI->setAlignment(Mapping.getAllocaAlignment());
-          // Instrument some of them.
           if (isInterestingAlloca(*AI))
             AllocasToInstrument.push_back(AI);
           continue;
@@ -1149,6 +1182,45 @@
                                StackTag);
   }
 
+  // Pad and align each of the allocas that we instrumented to stop small
+  // uninteresting allocas from hiding in instrumented alloca's padding and so
+  // that we have enough space to store real tags for short granules.
+  DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
+  for (AllocaInst *AI : AllocasToInstrument) {
+    uint64_t Size = getAllocaSizeInBytes(*AI);
+    uint64_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment());
+    AI->setAlignment(std::max(AI->getAlignment(), 16u));
+    if (Size != AlignedSize) {
+      Type *TypeWithPadding = StructType::get(
+          AI->getAllocatedType(), ArrayType::get(Int8Ty, AlignedSize - Size));
+      auto *NewAI = new AllocaInst(
+          TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
+      NewAI->takeName(AI);
+      NewAI->setAlignment(AI->getAlignment());
+      NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
+      NewAI->setSwiftError(AI->isSwiftError());
+      NewAI->copyMetadata(*AI);
+      Value *Zero = ConstantInt::get(Int32Ty, 0);
+      auto *GEP = GetElementPtrInst::Create(TypeWithPadding, NewAI,
+                                            {Zero, Zero}, "", AI);
+      AI->replaceAllUsesWith(GEP);
+      AllocaToPaddedAllocaMap[AI] = NewAI;
+    }
+  }
+
+  if (!AllocaToPaddedAllocaMap.empty()) {
+    for (auto &BB : F)
+      for (auto &Inst : BB)
+        if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
+          if (auto *AI =
+                  dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
+            if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
+              DVI->setArgOperand(
+                  0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
+    for (auto &P : AllocaToPaddedAllocaMap)
+      P.first->eraseFromParent();
+  }
+
   // If we split the entry block, move any allocas that were originally in the
   // entry block back into the entry block so that they aren't treated as
   // dynamic allocas.
Index: llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -304,17 +304,82 @@
             .addReg(Reg)
             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
         *STI);
-    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+    MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::Bcc)
             .addImm(AArch64CC::NE)
-            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+            .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
         *STI);
+    MCSymbol *ReturnSym = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(ReturnSym);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
 
-    OutStreamer->EmitLabel(HandleMismatchSym);
+    OutStreamer->EmitLabel(HandlePartialSym);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+                                     .addReg(AArch64::WZR)
+                                     .addReg(AArch64::W16)
+                                     .addImm(15)
+                                     .addImm(0),
+                                 *STI);
+    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::HI)
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ANDXri)
+            .addReg(AArch64::X17)
+            .addReg(Reg)
+            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+        *STI);
+    size_t Size = 1 << (AccessInfo & 0xf);
+    if (Size != 1)
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+                                       .addReg(AArch64::X17)
+                                       .addReg(AArch64::X17)
+                                       .addImm(Size - 1)
+                                       .addImm(0),
+                                   *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+                                     .addReg(AArch64::WZR)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::W17)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::LS)
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+        *STI);
+
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::ORRXri)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+        *STI);
+    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+                                     .addReg(AArch64::W16)
+                                     .addReg(AArch64::X16)
+                                     .addImm(0),
+                                 *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::SUBSXrs)
+            .addReg(AArch64::XZR)
+            .addReg(AArch64::X16)
+            .addReg(Reg)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+        *STI);
+    OutStreamer->EmitInstruction(
+        MCInstBuilder(AArch64::Bcc)
+            .addImm(AArch64CC::EQ)
+            .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+        *STI);
 
+    OutStreamer->EmitLabel(HandleMismatchSym);
     OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
                                      .addReg(AArch64::SP)
                                      .addReg(AArch64::X0)
Index: compiler-rt/test/hwasan/TestCases/tail-magic.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/tail-magic.c
+++ compiler-rt/test/hwasan/TestCases/tail-magic.c
@@ -10,19 +10,25 @@
 #include <stdio.h>
 #include <sanitizer/hwasan_interface.h>
 
-static volatile void *sink;
+static volatile char *sink;
+
+// Overwrite the tail in a non-hwasan function so that we don't detect the
+// stores as OOB.
+__attribute__((no_sanitize("hwaddress"))) void overwrite_tail() {
+  sink[20] = 0x42;
+  sink[24] = 0x66;
+}
 
 int main(int argc, char **argv) {
   __hwasan_enable_allocator_tagging();
 
   char *p = (char*)malloc(20);
-  sink = p;
-  p[20] = 0x42;
-  p[24] = 0x66;
+  sink = (char *)((uintptr_t)(p) & 0xffffffffffffff);
+  overwrite_tail();
   free(p);
 // CHECK: ERROR: HWAddressSanitizer: alocation-tail-overwritten; heap object [{{.*}}) of size 20
 // CHECK: in main {{.*}}tail-magic.c:[[@LINE-2]]
 // CHECK: allocated here:
-// CHECK: in main {{.*}}tail-magic.c:[[@LINE-8]]
+// CHECK: in main {{.*}}tail-magic.c:[[@LINE-7]]
 // CHECK: Tail contains: .. .. .. .. 42 {{.. .. ..}} 66
 }
Index: compiler-rt/test/hwasan/TestCases/stack-oob.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/stack-oob.c
+++ compiler-rt/test/hwasan/TestCases/stack-oob.c
@@ -1,3 +1,4 @@
+// RUN: %clang_hwasan -DSIZE=15 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clang_hwasan -DSIZE=16 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clang_hwasan -DSIZE=64 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clang_hwasan -DSIZE=0x1000 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
@@ -17,7 +18,7 @@
 int main() {
   return f();
   // CHECK: READ of size 1 at
-  // CHECK: #0 {{.*}} in f{{.*}}stack-oob.c:14
+  // CHECK: #0 {{.*}} in f{{.*}}stack-oob.c:15
 
   // CHECK: is located in stack of threa
 
Index: compiler-rt/test/hwasan/TestCases/random-align-right.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/random-align-right.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// Tests malloc_align_right=1 and 8 (randomly aligning right).
-// RUN: %clang_hwasan  %s -o %t
-//
-// RUN: %run %t 20
-// RUN: %run %t 30
-// RUN: %env_hwasan_opts=malloc_align_right=1 not %run %t 20 2>&1 | FileCheck %s --check-prefix=CHECK20
-// RUN: %env_hwasan_opts=malloc_align_right=1 not %run %t 30 2>&1 | FileCheck %s --check-prefix=CHECK30
-// RUN: %env_hwasan_opts=malloc_align_right=8 not %run %t 30 2>&1 | FileCheck %s --check-prefix=CHECK30
-
-// REQUIRES: stable-runtime
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <sanitizer/hwasan_interface.h>
-
-static volatile void *sink;
-
-int main(int argc, char **argv) {
-  __hwasan_enable_allocator_tagging();
-  int index = atoi(argv[1]);
-
-  // Perform 1000 buffer overflows within the 16-byte granule,
-  // so that random right-alignment has a very high chance of
-  // catching at least one of them.
-  for (int i = 0; i < 1000; i++) {
-    char *p = (char*)malloc(20);
-    sink = p;
-    p[index] = 0;
-// index=20 requires malloc_align_right=1 to catch
-// CHECK20: HWAddressSanitizer: tag-mismatch
-// index=30 requires malloc_align_right={1,8} to catch
-// CHECK30: HWAddressSanitizer: tag-mismatch
-  }
-}
-
Index: compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
===================================================================
--- compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
+++ compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
@@ -1,21 +1,13 @@
 // RUN: %clang_hwasan  %s -o %t
-// RUN:                                       not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40-LEFT
-// RUN: %env_hwasan_opts=malloc_align_right=2 not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40-RIGHT
-// RUN:                                       not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80-LEFT
-// RUN: %env_hwasan_opts=malloc_align_right=2 not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80-RIGHT
+// RUN: not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40
+// RUN: not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80
 // RUN: not %run %t -30 2>&1 | FileCheck %s --check-prefix=CHECKm30
 // RUN: not %run %t -30 1000000 2>&1 | FileCheck %s --check-prefix=CHECKMm30
 // RUN: not %run %t 1000000 1000000 2>&1 | FileCheck %s --check-prefix=CHECKM
 
 // Test OOB within the granule.
-// Misses the bug when malloc is left-aligned, catches it otherwise.
-// RUN:                                           %run %t 31
-// RUN: %env_hwasan_opts=malloc_align_right=2 not %run %t 31 2>&1 | FileCheck %s --check-prefix=CHECK31
-
-// RUN:                                           %run %t 30 20
-// RUN: %env_hwasan_opts=malloc_align_right=9 not %run %t 30 20 2>&1 | FileCheck %s --check-prefix=CHECK20-RIGHT8
-
-// RUN: %env_hwasan_opts=malloc_align_right=42 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-WRONG-FLAG
+// RUN: not %run %t 31 2>&1 | FileCheck %s --check-prefix=CHECK31
+// RUN: not %run %t 30 20 2>&1 | FileCheck %s --check-prefix=CHECK20
 
 // REQUIRES: stable-runtime
 
@@ -33,15 +25,11 @@
   fprintf(stderr, "base: %p access: %p\n", x, &x[offset]);
   sink = x[offset];
 
-// CHECK40-LEFT: allocated heap chunk; size: 32 offset: 8
-// CHECK40-LEFT: is located 10 bytes to the right of 30-byte region
-// CHECK40-RIGHT: allocated heap chunk; size: 32 offset:
-// CHECK40-RIGHT: is located 10 bytes to the right of 30-byte region
+// CHECK40: allocated heap chunk; size: 32 offset: 8
+// CHECK40: is located 10 bytes to the right of 30-byte region
 //
-// CHECK80-LEFT: allocated heap chunk; size: 32 offset: 16
-// CHECK80-LEFT: is located 50 bytes to the right of 30-byte region
-// CHECK80-RIGHT: allocated heap chunk; size: 32 offset:
-// CHECK80-RIGHT: is located 50 bytes to the right of 30-byte region
+// CHECK80: allocated heap chunk; size: 32 offset: 16
+// CHECK80: is located 50 bytes to the right of 30-byte region
 //
 // CHECKm30: is located 30 bytes to the left of 30-byte region
 //
@@ -51,10 +39,13 @@
 // CHECKM: is a large allocated heap chunk; size: 1003520 offset: 1000000
 // CHECKM: is located 0 bytes to the right of 1000000-byte region
 //
+// CHECK31: tags: [[TAG:..]]/0e (ptr/mem)
 // CHECK31: is located 1 bytes to the right of 30-byte region
+// CHECK31: Memory tags around the buggy address
+// CHECK31: [0e]
+// CHECK31: Tags for short granules around the buggy address
+// CHECK31: {{\[}}[[TAG]]]
 //
-// CHECK20-RIGHT8: is located 10 bytes to the right of 20-byte region [0x{{.*}}8,0x{{.*}}c)
-//
-// CHECK-WRONG-FLAG: ERROR: unsupported value of malloc_align_right flag: 42
+// CHECK20: is located 10 bytes to the right of 20-byte region [0x{{.*}}0,0x{{.*}}4)
   free(x);
 }
Index: compiler-rt/lib/hwasan/hwasan_report.h
===================================================================
--- compiler-rt/lib/hwasan/hwasan_report.h
+++ compiler-rt/lib/hwasan/hwasan_report.h
@@ -25,7 +25,7 @@
                        bool is_store, bool fatal, uptr *registers_frame);
 void ReportInvalidFree(StackTrace *stack, uptr addr);
 void ReportTailOverwritten(StackTrace *stack, uptr addr, uptr orig_size,
-                           uptr tail_size, const u8 *expected);
+                           const u8 *expected);
 void ReportRegisters(uptr *registers_frame, uptr pc);
 void ReportAtExitStatistics();
 
Index: compiler-rt/lib/hwasan/hwasan_report.cpp
===================================================================
--- compiler-rt/lib/hwasan/hwasan_report.cpp
+++ compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -208,6 +208,19 @@
   }
 }
 
+// Returns true if tag == *tag_ptr, reading tags from short granules if
+// necessary. This may return a false positive if tags 1-15 are used as a
+// regular tag rather than a short granule marker.
+static bool TagsEqual(tag_t tag, tag_t *tag_ptr) {
+  if (tag == *tag_ptr)
+    return true;
+  if (*tag_ptr == 0 || *tag_ptr > kShadowAlignment - 1)
+    return false;
+  uptr mem = ShadowToMem(reinterpret_cast<uptr>(tag_ptr));
+  tag_t inline_tag = *reinterpret_cast<tag_t *>(mem + kShadowAlignment - 1);
+  return tag == inline_tag;
+}
+
 void PrintAddressDescription(
     uptr tagged_addr, uptr access_size,
     StackAllocationsRingBuffer *current_stack_allocations) {
@@ -235,39 +248,36 @@
   // check the allocator if it has a live chunk there.
   tag_t addr_tag = GetTagFromPointer(tagged_addr);
   tag_t *tag_ptr = reinterpret_cast<tag_t*>(MemToShadow(untagged_addr));
-  if (*tag_ptr != addr_tag) { // should be true usually.
-    tag_t *left = tag_ptr, *right = tag_ptr;
-    // scan left.
-    for (int i = 0; i < 1000 && *left == *tag_ptr; i++, left--){}
-    // scan right.
-    for (int i = 0; i < 1000 && *right == *tag_ptr; i++, right++){}
-    // Chose the object that has addr_tag and that is closer to addr.
-    tag_t *candidate = nullptr;
-    if (*right == addr_tag && *left == addr_tag)
-      candidate = right - tag_ptr < tag_ptr - left ? right : left;
-    else if (*right == addr_tag)
-      candidate = right;
-    else if (*left == addr_tag)
+  tag_t *candidate = nullptr, *left = tag_ptr, *right = tag_ptr;
+  for (int i = 0; i < 1000; i++) {
+    if (TagsEqual(addr_tag, left)) {
       candidate = left;
+      break;
+    }
+    --left;
+    if (TagsEqual(addr_tag, right)) {
+      candidate = right;
+      break;
+    }
+    ++right;
+  }
 
-    if (candidate) {
-      uptr mem = ShadowToMem(reinterpret_cast<uptr>(candidate));
-      HwasanChunkView chunk = FindHeapChunkByAddress(mem);
-      if (chunk.IsAllocated()) {
-        Printf("%s", d.Location());
-        Printf(
-            "%p is located %zd bytes to the %s of %zd-byte region [%p,%p)\n",
-            untagged_addr,
-            candidate == left ? untagged_addr - chunk.End()
-            : chunk.Beg() - untagged_addr,
-            candidate == right ? "left" : "right", chunk.UsedSize(),
-            chunk.Beg(), chunk.End());
-        Printf("%s", d.Allocation());
-        Printf("allocated here:\n");
-        Printf("%s", d.Default());
-        GetStackTraceFromId(chunk.GetAllocStackId()).Print();
-        num_descriptions_printed++;
-      }
+  if (candidate) {
+    uptr mem = ShadowToMem(reinterpret_cast<uptr>(candidate));
+    HwasanChunkView chunk = FindHeapChunkByAddress(mem);
+    if (chunk.IsAllocated()) {
+      Printf("%s", d.Location());
+      Printf("%p is located %zd bytes to the %s of %zd-byte region [%p,%p)\n",
+             untagged_addr,
+             candidate == left ? untagged_addr - chunk.End()
+                               : chunk.Beg() - untagged_addr,
+             candidate == left ? "right" : "left", chunk.UsedSize(),
+             chunk.Beg(), chunk.End());
+      Printf("%s", d.Allocation());
+      Printf("allocated here:\n");
+      Printf("%s", d.Default());
+      GetStackTraceFromId(chunk.GetAllocStackId()).Print();
+      num_descriptions_printed++;
     }
   }
 
@@ -325,13 +335,10 @@
 
 void ReportStats() {}
 
-static void PrintTagsAroundAddr(tag_t *tag_ptr) {
-  Printf(
-      "Memory tags around the buggy address (one tag corresponds to %zd "
-      "bytes):\n", kShadowAlignment);
-
+static void PrintTagInfoAroundAddr(tag_t *tag_ptr, uptr num_rows,
+                                   void (*print_tag)(InternalScopedString &s,
+                                                     tag_t *tag)) {
   const uptr row_len = 16;  // better be power of two.
-  const uptr num_rows = 17;
   tag_t *center_row_beg = reinterpret_cast<tag_t *>(
       RoundDownTo(reinterpret_cast<uptr>(tag_ptr), row_len));
   tag_t *beg_row = center_row_beg - row_len * (num_rows / 2);
@@ -341,7 +348,7 @@
     s.append("%s", row == center_row_beg ? "=>" : "  ");
     for (uptr i = 0; i < row_len; i++) {
       s.append("%s", row + i == tag_ptr ? "[" : " ");
-      s.append("%02x", row[i]);
+      print_tag(s, &row[i]);
       s.append("%s", row + i == tag_ptr ? "]" : " ");
     }
     s.append("%s\n", row == center_row_beg ? "<=" : "  ");
@@ -349,6 +356,34 @@
   Printf("%s", s.data());
 }
 
+static void PrintTagsAroundAddr(tag_t *tag_ptr) {
+  Printf(
+      "Memory tags around the buggy address (one tag corresponds to %zd "
+      "bytes):\n", kShadowAlignment);
+  PrintTagInfoAroundAddr(tag_ptr, 17, [](InternalScopedString &s, tag_t *tag) {
+    s.append("%02x", *tag);
+  });
+
+  Printf(
+      "Tags for short granules around the buggy address (one tag corresponds "
+      "to %zd bytes):\n",
+      kShadowAlignment);
+  PrintTagInfoAroundAddr(tag_ptr, 3, [](InternalScopedString &s, tag_t *tag) {
+    if (*tag >= 1 && *tag <= kShadowAlignment) {
+      uptr granule_addr = ShadowToMem(reinterpret_cast<uptr>(tag));
+      s.append("%02x",
+               *reinterpret_cast<u8 *>(granule_addr + kShadowAlignment - 1));
+    } else {
+      s.append("..");
+    }
+  });
+  Printf(
+      "See "
+      "https://clang.llvm.org/docs/";
+      "HardwareAssistedAddressSanitizerDesign.html#short-granules for a "
+      "description of short granule tags\n");
+}
+
 void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) {
   ScopedReport R(flags()->halt_on_error);
 
@@ -376,7 +411,8 @@
 }
 
 void ReportTailOverwritten(StackTrace *stack, uptr tagged_addr, uptr orig_size,
-                           uptr tail_size, const u8 *expected) {
+                           const u8 *expected) {
+  uptr tail_size = kShadowAlignment - (orig_size % kShadowAlignment);
   ScopedReport R(flags()->halt_on_error);
   Decorator d;
   uptr untagged_addr = UntagAddr(tagged_addr);
@@ -420,11 +456,9 @@
     "to the right of a heap object, but within the %zd-byte granule, e.g.\n"
     "   char *x = new char[20];\n"
     "   x[25] = 42;\n"
-    "By default %s does not detect such bugs at the time of write,\n"
-    "but can detect them at the time of free/delete.\n"
-    "To disable this feature set HWASAN_OPTIONS=free_checks_tail_magic=0;\n"
-    "To enable checking at the time of access, set "
-    "HWASAN_OPTIONS=malloc_align_right to non-zero\n\n",
+    "%s does not detect such bugs in uninstrumented code at the time of write,"
+    "\nbut can detect them at the time of free/delete.\n"
+    "To disable this feature set HWASAN_OPTIONS=free_checks_tail_magic=0\n",
     kShadowAlignment, SanitizerToolName);
   Printf("%s", s.data());
   GetCurrentThread()->Announce();
Index: compiler-rt/lib/hwasan/hwasan_flags.inc
===================================================================
--- compiler-rt/lib/hwasan/hwasan_flags.inc
+++ compiler-rt/lib/hwasan/hwasan_flags.inc
@@ -37,32 +37,6 @@
     "HWASan allocator flag. max_malloc_fill_size is the maximal amount of "
     "bytes that will be filled with malloc_fill_byte on malloc.")
 
-// Rules for malloc alignment on aarch64:
-//   * If the size is 16-aligned, then malloc should return 16-aligned memory.
-//   * Otherwise, malloc should return 8-alignment memory.
-// So,
-//   * If the size is 16-aligned, we don't need to do anything.
-//   * Otherwise we don't have to obey 16-alignment, just the 8-alignment.
-//   * We may want to break the 8-alignment rule to catch more buffer overflows
-//     but this will break valid code in some rare cases, like this:
-//     struct Foo {
-//       // accessed via atomic instructions that require 8-alignment.
-//       std::atomic<int64_t> atomic_stuff;
-//       ...
-//       char vla[1];  // the actual size of vla could be anything.
-//     }
-// Which means that the safe values for malloc_align_right are 0, 8, 9,
-// and the values 1 and 2 may require changes in otherwise valid code.
-
-HWASAN_FLAG(
-    int, malloc_align_right, 0,  // off by default
-    "HWASan allocator flag. "
-    "0 (default): allocations are always aligned left to 16-byte boundary; "
-    "1: allocations are sometimes aligned right to 1-byte boundary (risky); "
-    "2: allocations are always aligned right to 1-byte boundary (risky); "
-    "8: allocations are sometimes aligned right to 8-byte boundary; "
-    "9: allocations are always aligned right to 8-byte boundary."
-  )
 HWASAN_FLAG(bool, free_checks_tail_magic, 1,
     "If set, free() will check the magic values "
     "to the right of the allocated object "
Index: compiler-rt/lib/hwasan/hwasan_checks.h
===================================================================
--- compiler-rt/lib/hwasan/hwasan_checks.h
+++ compiler-rt/lib/hwasan/hwasan_checks.h
@@ -61,15 +61,29 @@
   // __builtin_unreachable();
 }
 
+__attribute__((always_inline, nodebug)) static bool PossiblyShortTagMatches(
+    tag_t mem_tag, uptr ptr, uptr sz) {
+  tag_t ptr_tag = GetTagFromPointer(ptr);
+  if (ptr_tag == mem_tag)
+    return true;
+  if (mem_tag > 15)
+    return false;
+  if ((ptr & 15) + sz > mem_tag)
+    return false;
+#ifndef __aarch64__
+  ptr = UntagAddr(ptr);
+#endif
+  return *(u8 *)(ptr | 15) == ptr_tag;
+}
+
 enum class ErrorAction { Abort, Recover };
 enum class AccessType { Load, Store };
 
 template <ErrorAction EA, AccessType AT, unsigned LogSize>
 __attribute__((always_inline, nodebug)) static void CheckAddress(uptr p) {
-  tag_t ptr_tag = GetTagFromPointer(p);
   uptr ptr_raw = p & ~kAddressTagMask;
   tag_t mem_tag = *(tag_t *)MemToShadow(ptr_raw);
-  if (UNLIKELY(ptr_tag != mem_tag)) {
+  if (UNLIKELY(!PossiblyShortTagMatches(mem_tag, p, 1 << LogSize))) {
     SigTrap<0x20 * (EA == ErrorAction::Recover) +
             0x10 * (AT == AccessType::Store) + LogSize>(p);
     if (EA == ErrorAction::Abort)
@@ -85,15 +99,25 @@
   tag_t ptr_tag = GetTagFromPointer(p);
   uptr ptr_raw = p & ~kAddressTagMask;
   tag_t *shadow_first = (tag_t *)MemToShadow(ptr_raw);
-  tag_t *shadow_last = (tag_t *)MemToShadow(ptr_raw + sz - 1);
-  for (tag_t *t = shadow_first; t <= shadow_last; ++t)
+  tag_t *shadow_last = (tag_t *)MemToShadow(ptr_raw + sz);
+  for (tag_t *t = shadow_first; t < shadow_last; ++t)
     if (UNLIKELY(ptr_tag != *t)) {
       SigTrap<0x20 * (EA == ErrorAction::Recover) +
               0x10 * (AT == AccessType::Store) + 0xf>(p, sz);
       if (EA == ErrorAction::Abort)
         __builtin_unreachable();
     }
+  uptr end = p + sz;
+  uptr tail_sz = end & 0xf;
+  if (UNLIKELY(tail_sz != 0 && !PossiblyShortTagMatches(
+                                   *shadow_last, end & ~0xfull, tail_sz))) {
+    SigTrap<0x20 * (EA == ErrorAction::Recover) +
+            0x10 * (AT == AccessType::Store) + 0xf>(p, sz);
+    if (EA == ErrorAction::Abort)
+      __builtin_unreachable();
+  }
 }
+
 }  // end namespace __hwasan
 
 #endif  // HWASAN_CHECKS_H
Index: compiler-rt/lib/hwasan/hwasan_allocator.cpp
===================================================================
--- compiler-rt/lib/hwasan/hwasan_allocator.cpp
+++ compiler-rt/lib/hwasan/hwasan_allocator.cpp
@@ -16,6 +16,7 @@
 #include "sanitizer_common/sanitizer_stackdepot.h"
 #include "hwasan.h"
 #include "hwasan_allocator.h"
+#include "hwasan_checks.h"
 #include "hwasan_mapping.h"
 #include "hwasan_malloc_bisect.h"
 #include "hwasan_thread.h"
@@ -42,13 +43,8 @@
   kRightAlignAlways
 };
 
-// These two variables are initialized from flags()->malloc_align_right
-// in HwasanAllocatorInit and are never changed afterwards.
-static RightAlignMode right_align_mode = kRightAlignNever;
-static bool right_align_8 = false;
-
 // Initialized in HwasanAllocatorInit, an never changed.
-static ALIGNED(16) u8 tail_magic[kShadowAlignment];
+static ALIGNED(16) u8 tail_magic[kShadowAlignment - 1];
 
 bool HwasanChunkView::IsAllocated() const {
   return metadata_ && metadata_->alloc_context_id && metadata_->requested_size;
@@ -58,8 +54,6 @@
 static uptr AlignRight(uptr addr, uptr requested_size) {
   uptr tail_size = requested_size % kShadowAlignment;
   if (!tail_size) return addr;
-  if (right_align_8)
-    return tail_size > 8 ? addr : addr + 8;
   return addr + kShadowAlignment - tail_size;
 }
 
@@ -95,30 +89,7 @@
                        !flags()->disable_allocator_tagging);
   SetAllocatorMayReturnNull(common_flags()->allocator_may_return_null);
   allocator.Init(common_flags()->allocator_release_to_os_interval_ms);
-  switch (flags()->malloc_align_right) {
-    case 0: break;
-    case 1:
-      right_align_mode = kRightAlignSometimes;
-      right_align_8 = false;
-      break;
-    case 2:
-      right_align_mode = kRightAlignAlways;
-      right_align_8 = false;
-      break;
-    case 8:
-      right_align_mode = kRightAlignSometimes;
-      right_align_8 = true;
-      break;
-    case 9:
-      right_align_mode = kRightAlignAlways;
-      right_align_8 = true;
-      break;
-    default:
-      Report("ERROR: unsupported value of malloc_align_right flag: %d\n",
-             flags()->malloc_align_right);
-      Die();
-  }
-  for (uptr i = 0; i < kShadowAlignment; i++)
+  for (uptr i = 0; i < sizeof(tail_magic); i++)
     tail_magic[i] = GetCurrentThread()->GenerateRandomTag();
 }
 
@@ -172,9 +143,10 @@
     uptr fill_size = Min(size, (uptr)flags()->max_malloc_fill_size);
     internal_memset(allocated, flags()->malloc_fill_byte, fill_size);
   }
-  if (!right_align_mode)
+  if (size != orig_size) {
     internal_memcpy(reinterpret_cast<u8 *>(allocated) + orig_size, tail_magic,
-                    size - orig_size);
+                    size - orig_size - 1);
+  }
 
   void *user_ptr = allocated;
   // Tagging can only be skipped when both tag_in_malloc and tag_in_free are
@@ -185,16 +157,14 @@
     tag_t tag = flags()->tag_in_malloc && malloc_bisect(stack, orig_size)
                     ? (t ? t->GenerateRandomTag() : kFallbackAllocTag)
                     : 0;
-    user_ptr = (void *)TagMemoryAligned((uptr)user_ptr, size, tag);
-  }
-
-  if ((orig_size % kShadowAlignment) && (alignment <= kShadowAlignment) &&
-      right_align_mode) {
-    uptr as_uptr = reinterpret_cast<uptr>(user_ptr);
-    if (right_align_mode == kRightAlignAlways ||
-        GetTagFromPointer(as_uptr) & 1) {  // use a tag bit as a random bit.
-      user_ptr = reinterpret_cast<void *>(AlignRight(as_uptr, orig_size));
-      meta->right_aligned = 1;
+    uptr tag_size = orig_size ? orig_size : 1;
+    uptr full_granule_size = RoundDownTo(tag_size, kShadowAlignment);
+    user_ptr = (void *)TagMemoryAligned((uptr)user_ptr, full_granule_size, tag);
+    if (full_granule_size != tag_size) {
+      u8 *short_granule = reinterpret_cast<u8 *>(allocated) + full_granule_size;
+      TagMemoryAligned((uptr)short_granule, kShadowAlignment,
+                       tag_size % kShadowAlignment);
+      short_granule[kShadowAlignment - 1] = tag;
     }
   }
 
@@ -204,10 +174,10 @@
 
 static bool PointerAndMemoryTagsMatch(void *tagged_ptr) {
   CHECK(tagged_ptr);
-  tag_t ptr_tag = GetTagFromPointer(reinterpret_cast<uptr>(tagged_ptr));
+  uptr tagged_uptr = reinterpret_cast<uptr>(tagged_ptr);
   tag_t mem_tag = *reinterpret_cast<tag_t *>(
       MemToShadow(reinterpret_cast<uptr>(UntagPtr(tagged_ptr))));
-  return ptr_tag == mem_tag;
+  return PossiblyShortTagMatches(mem_tag, tagged_uptr, 1);
 }
 
 static void HwasanDeallocate(StackTrace *stack, void *tagged_ptr) {
@@ -228,14 +198,15 @@
 
   // Check tail magic.
   uptr tagged_size = TaggedSize(orig_size);
-  if (flags()->free_checks_tail_magic && !right_align_mode && orig_size) {
-    uptr tail_size = tagged_size - orig_size;
+  if (flags()->free_checks_tail_magic && orig_size &&
+      tagged_size != orig_size) {
+    uptr tail_size = tagged_size - orig_size - 1;
     CHECK_LT(tail_size, kShadowAlignment);
     void *tail_beg = reinterpret_cast<void *>(
         reinterpret_cast<uptr>(aligned_ptr) + orig_size);
     if (tail_size && internal_memcmp(tail_beg, tail_magic, tail_size))
       ReportTailOverwritten(stack, reinterpret_cast<uptr>(tagged_ptr),
-                            orig_size, tail_size, tail_magic);
+                            orig_size, tail_magic);
   }
 
   meta->requested_size = 0;
Index: clang/docs/HardwareAssistedAddressSanitizerDesign.rst
===================================================================
--- clang/docs/HardwareAssistedAddressSanitizerDesign.rst
+++ clang/docs/HardwareAssistedAddressSanitizerDesign.rst
@@ -38,6 +38,30 @@
 
 For a more detailed discussion of this approach see https://arxiv.org/pdf/1802.09517.pdf
 
+Short granules
+--------------
+
+A short granule is a granule of size between 1 and `TG-1` bytes. The size
+of a short granule is stored at the location in shadow memory where the
+granule's tag is normally stored, while the granule's actual tag is stored
+in the last byte of the granule. This means that in order to verify that a
+pointer tag matches a memory tag, HWASAN must check for two possibilities:
+
+* the pointer tag is equal to the memory tag in shadow memory, or
+* the shadow memory tag is actually a short granule size, the value being loaded
+  is in bounds of the granule and the pointer tag is equal to the last byte of
+  the granule.
+
+Pointer tags between 1 to `TG-1` are possible and are as likely as any other
+tag. This means that these tags in memory have two interpretations: the full
+tag interpretation (where the pointer tag is between 1 and `TG-1` and the
+last byte of the granule is ordinary data) and the short tag interpretation
+(where the pointer tag is stored in the granule).
+
+When HWASAN detects an error near a memory tag between 1 and `TG-1`, it
+will show both the memory tag and the last byte of the granule. Currently,
+it is up to the user to disambiguate the two possibilities.
+
 Instrumentation
 ===============
 
@@ -46,24 +70,40 @@
 All memory accesses are prefixed with an inline instruction sequence that
 verifies the tags. Currently, the following sequence is used:
 
-
 .. code-block:: none
 
   // int foo(int *a) { return *a; }
-  // clang -O2 --target=aarch64-linux -fsanitize=hwaddress -c load.c
+  // clang -O2 --target=aarch64-linux -fsanitize=hwaddress -fsanitize-recover=hwaddress -c load.c
   foo:
-       0:	08 00 00 90 	adrp	x8, 0 <__hwasan_shadow>
-       4:	08 01 40 f9 	ldr	x8, [x8]          // shadow base (to be resolved by the loader)
-       8:	09 dc 44 d3 	ubfx	x9, x0, #4, #52 // shadow offset
-       c:	28 69 68 38 	ldrb	w8, [x9, x8]    // load shadow tag
-      10:	09 fc 78 d3 	lsr	x9, x0, #56       // extract address tag
-      14:	3f 01 08 6b 	cmp	w9, w8            // compare tags
-      18:	61 00 00 54 	b.ne	24              // jump on mismatch
-      1c:	00 00 40 b9 	ldr	w0, [x0]          // original load
-      20:	c0 03 5f d6 	ret
-      24:	40 20 21 d4 	brk	#0x902            // trap
+       0:	90000008 	adrp	x8, 0 <__hwasan_shadow>
+       4:	f9400108 	ldr	x8, [x8]         // shadow base (to be resolved by the loader)
+       8:	d344dc09 	ubfx	x9, x0, #4, #52  // shadow offset
+       c:	38696909 	ldrb	w9, [x8, x9]     // load shadow tag
+      10:	d378fc08 	lsr	x8, x0, #56      // extract address tag
+      14:	6b09011f 	cmp	w8, w9           // compare tags
+      18:	54000061 	b.ne	24 <foo+0x24>    // jump to short tag handler on mismatch
+      1c:	b9400000 	ldr	w0, [x0]         // original load
+      20:	d65f03c0 	ret
+      24:	7100413f 	cmp	w9, #0x10        // is this a short tag?
+      28:	54000142 	b.cs	50 <foo+0x50>    // if not, trap
+      2c:	12000c0a 	and	w10, w0, #0xf    // find the address's position in the short granule
+      30:	11000d4a 	add	w10, w10, #0x3   // adjust to the position of the last byte loaded
+      34:	6b09015f 	cmp	w10, w9          // check that position is in bounds
+      38:	540000c2 	b.cs	50 <foo+0x50>    // if not, trap
+      3c:	9240dc09 	and	x9, x0, #0xffffffffffffff
+      40:	b2400d29 	orr	x9, x9, #0xf     // compute address of last byte of granule
+      44:	39400129 	ldrb	w9, [x9]         // load tag from it
+      48:	6b09011f 	cmp	w8, w9           // compare with pointer tag
+      4c:	54fffe80 	b.eq	1c <foo+0x1c>    // if so, continue
+      50:	d4212440 	brk	#0x922           // otherwise trap
+      54:	b9400000 	ldr	w0, [x0]         // tail duplicated original load (to handle recovery)
+      58:	d65f03c0 	ret
 
 Alternatively, memory accesses are prefixed with a function call.
+On AArch64, a function call is used by default in trapping mode. The code size
+and performance overhead of the call is reduced by using a custom calling
+convention that preserves most registers and is specialized to the register
+containing the address and the type and size of the memory access.
 
 Heap
 ----
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to