================ @@ -2150,58 +2179,94 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } -// We can init constant f16x2 with a single .b32 move. Normally it +// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it // would get lowered as two constant loads and vector-packing move. -// mov.b16 %h1, 0x4000; -// mov.b16 %h2, 0x3C00; -// mov.b32 %hh2, {%h2, %h1}; // Instead we want just a constant move: -// mov.b32 %hh2, 0x40003C00 -// -// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 -// generates good SASS in both cases. +// mov.b32 %r2, 0x40003C00 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op->getValueType(0); - if (!(Isv2x16VT(VT))) + if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) return Op; - APInt E0; - APInt E1; - if (VT == MVT::v2f16 || VT == MVT::v2bf16) { - if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) && - isa<ConstantFPSDNode>(Op->getOperand(1)))) - return Op; - - E0 = cast<ConstantFPSDNode>(Op->getOperand(0)) - ->getValueAPF() - .bitcastToAPInt(); - E1 = cast<ConstantFPSDNode>(Op->getOperand(1)) - ->getValueAPF() - .bitcastToAPInt(); - } else { - assert(VT == MVT::v2i16); - if (!(isa<ConstantSDNode>(Op->getOperand(0)) && - isa<ConstantSDNode>(Op->getOperand(1)))) - return Op; - E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue(); - E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue(); + SDLoc DL(Op); + + if (!llvm::all_of(Op->ops(), [](SDValue Operand) { + return Operand->isUndef() || isa<ConstantSDNode>(Operand) || + isa<ConstantFPSDNode>(Operand); + })) { + // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us + // to optimize calculation of constant parts. + if (VT == MVT::v4i8) { + SDValue C8 = DAG.getConstant(8, DL, MVT::i32); + SDValue E01 = DAG.getNode( + NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); + SDValue E012 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + E01, DAG.getConstant(16, DL, MVT::i32), C8); + SDValue E0123 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + E012, DAG.getConstant(24, DL, MVT::i32), C8); + return DAG.getNode(ISD::BITCAST, DL, VT, E0123); + } + return Op; } - SDValue Const = - DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); + + // Get value or the Nth operand as an APInt(32). Undef values treated as 0. + auto GetOperand = [](SDValue Op, int N) -> APInt { + const SDValue &Operand = Op->getOperand(N); + EVT VT = Op->getValueType(0); + if (Operand->isUndef()) + return APInt(32, 0); + APInt Value; + if (VT == MVT::v2f16 || VT == MVT::v2bf16) + Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); + else if (VT == MVT::v2i16 || VT == MVT::v4i8) + Value = cast<ConstantSDNode>(Operand)->getAPIntValue(); + else + llvm_unreachable("Unsupported type"); + return Value.zext(32); + }; + APInt Value; + if (Isv2x16VT(VT)) { + Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); + } else if (VT == MVT::v4i8) { + Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | + GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); ---------------- Artem-B wrote:
This misses zext(32). Will fix shortly. https://github.com/llvm/llvm-project/pull/67866 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits