From c93bdcfef59e9d2cb9d2e4fca119b77cf11824dc Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 8 Mar 2024 22:58:14 +0000
Subject: [PATCH 01/19] add TensorDescType, TensorDescAttr, and createNdDescOp.

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |   4 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  78 ++++
 .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td     |   4 +-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 135 +++++++
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       | 103 ++++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  72 +++-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 380 +++++++++++++++++-
 7 files changed, 769 insertions(+), 7 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee77a..8dc3ff78d25ede 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,11 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H

+#include
+#include
 #include
+#include
+#include

 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e3324..6e4c1bce6d0d59 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD

 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"

 class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
                 string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,81 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
   let mnemonic = attrMnemonic;
 }

+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let parameters = (ins
+    OptionalParameter<"MemoryScopeKindAttr">: $memory_scope,
+    OptionalParameter<"IntegerAttr", "1">: $array_length,
+    OptionalParameter<"BoolAttr", "true">: $boundary_check
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+      CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::Global">:$memory_scope,
+      CArg<"int", "1">:$array_length,
+      CArg<"bool", "true">: $boundary_check
+    )>
+  ];
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind",
+      "The address space of the memory the tensor descriptor is created for",
+      [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr:
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScopeKind, "memory_scope"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Operator Mode Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_OpModeSIMT : I32EnumAttrCase<"SIMT", 0, "simt">;
+def XeGPU_OpModeVectorCompute : I32EnumAttrCase<"VectorCompute", 1, "vc">;
+def XeGPU_ModeKind : I32EnumAttr<"ModeKind",
+      "The Mode an operator runs on",
+      [XeGPU_OpModeSIMT, XeGPU_OpModeVectorCompute]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ModeAttr:
+  EnumAttr<XeGPU_Dialect, XeGPU_ModeKind, "mode"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_CacheKindCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
+def XeGPU_CacheKindUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
+def XeGPU_CacheKindStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
+def XeGPU_CacheKindInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
+def XeGPU_CacheKindWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
+def XeGPU_CacheKindWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
+
+def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind",
+  [XeGPU_CacheKindCached, XeGPU_CacheKindUncached,
+   XeGPU_CacheKindStreaming, XeGPU_CacheKindInvalid,
+   XeGPU_CacheKindWriteBack, XeGPU_CacheKindWriteThrough]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_CacheAttr
+  : EnumAttr<XeGPU_Dialect, XeGPU_CacheKind, "cache_kind"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a0a..c2f09319c790e0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
       the lower-level GPU compiler.
     }];

-  // let useDefaultTypePrinterParser = true;
-  // let useDefaultAttributePrinterParser = true;
+  let useDefaultTypePrinterParser = true;
+  let useDefaultAttributePrinterParser = true;
 }

 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..a321d36f2ae271 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -12,6 +12,22 @@
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/IR/BuiltinTypes.td"
+include "mlir/IR/OpAsmInterface.td"
+include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"


 // Base class for dialect operations. This operation inherits from the base
@@ -23,4 +39,123 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;

+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+  let summary = "create nd tensor descriptor operation";
+  let description = [{
+    The "create_nd_tdesc" operation creates a TensorDescType which represents
+    a sub-view of a 2D memory region (it can be extended to support n-D memory
+    regions if needed in the future). Elements in the subview are contiguous in
+    each dimension. It encodes the following important information for supporting
+    Intel hardware features:
+
+    * source: an object representing (the starting address/pointer of) a 2D memory region.
+      It can be either a 2D memref object, or simply a pointer represented by a uint64_t type.
+      In the latter case, the shape and layout information of the 2D memory region should
+      be explicitly passed via the `dynamic_shape` and `dynamic_strides` parameters.
+    * offsets: two index values that represent offsets from the "source" in each dimension,
+      at which the subview of the target memory will be created. It is encoded via two
+      variables, "dynamic_offsets" and "static_offsets", such that it can
+      accept various forms, such as operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+    * shape: the shape information of the memory region pointed to by the "source". It is
+      typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+      But if "source" is simply a pointer represented as a uint64_t type, or a memref
+      type without static shape information, the shape information has
+      to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
+      only accepts operands (e.g., [%c4096, %c4096]), not attributes (e.g., [4096, 4096]).
+    * strides: the strides of the memory region pointed to by the "source". Similar to shape,
+      they are typically encoded via the MemRefType of the source. But if "source" is
+      simply a pointer represented as a uint64_t type, or a memref type without static
+      shape information, the strides information has to be explicitly
+      passed via the "dynamic_strides" argument, which currently also only accepts operands.
+
+    Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc() : memref<1024x1024xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+
+    Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+    Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = ... : ui64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
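+
+    Example 4 (a hypothetical variant of Example 1, shown for illustration only:
+    offsets may mix SSA values and integer constants, since they are split into
+    the "dynamic_offsets" and "static_offsets" parameters):
+    %0 = memref.alloc() : memref<1024x1024xf32>
+    %c64 = arith.constant 64 : index
+    %1 = xegpu.create_nd_tdesc %0[%c64, 0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>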
+  }];
+
+  let arguments = (ins XeGPU_BaseAddrType: $source,
+                   Variadic<Index>: $dynamic_offsets,
+                   Variadic<Index>: $dynamic_shape,
+                   Variadic<Index>: $dynamic_strides,
+                   DenseI64ArrayAttr: $static_offsets);
+  let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+  let hasCustomAssemblyFormat = 1;
+  let skipDefaultBuilders = 1;
+  let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets,
+                   "ValueRange": $shape, "ValueRange": $strides,
+                   "llvm::ArrayRef<int64_t>": $static_offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "Value": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "ValueRange": $shape, "ValueRange": $stride)>
+  ];
+
+  let extraClassDeclaration = [{
+    /// Returns the type of the source memref operand.
+    Type getSourceType() {
+      return getSource().getType();
+    }
+
+    /// Returns the type of the result TensorDesc.
+    xegpu::TensorDescType getType() {
+      return getTensorDesc().getType();
+    }
+
+    /// Returns the offsets info to the source. It consolidates
+    /// information from both the dynamic_offsets and static_offsets
+    /// parameters. The static_offsets parameter always has the expected
+    /// rank; some of its dims may hold the ShapedType::kDynamic value,
+    /// indicating that the corresponding value should be taken from
+    /// dynamic_offsets.
+    llvm::SmallVector<OpFoldResult> getOffsets();
+
+    /// Returns the shape info of the source. It is either from the
+    /// memref type, if the source is a memref with static shape
+    /// information, or from the dynamic_shape parameter. If both
+    /// exist, the dynamic_shape parameter will be used and the
+    /// shape information from the memref type will be ignored.
+    llvm::SmallVector<OpFoldResult> getShape();
+
+    /// Returns the strides info of the source. It is either from the
+    /// memref type, if the source is a memref with static shape
+    /// information, or from the dynamic_strides parameter. If both
+    /// exist, the dynamic_strides parameter will be used and the
+    /// strides information from the memref type will be ignored.
+    llvm::SmallVector<OpFoldResult> getStrides();
+
+    /// Return the element type of the TensorDesc.
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+    /// Return the shape of the TensorDesc.
+    llvm::ArrayRef<int64_t> getTensorDescShape() {
+      return getType().getShape();
+    }
+  }];
+
+}
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..319e16b3ae326b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD

-include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"

 def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,105 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }

+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+        [ShapedTypeInterface], "::mlir::TensorType"> {
+  let summary = "TensorDesc describing regions of interest in the data.";
+  let description = [{
+    TensorDesc is a type designed to describe regions of interest in the data, as well as
+    some features unique to Intel hardware. Unlike the builtin tensor type in MLIR,
+    it essentially only contains the metadata; it does not hold the data itself. It is
+    designed mainly to support 2D block load/store and DPAS (matrix multiplication
+    instruction) on Intel GPUs. It encodes the following information:
+
+    * shape: the size/shape of the data block of interest, e.g., 8x16 means 8 rows
+             and each row contains 16 contiguous data elements. The rows can be
+             either contiguous or not, depending on whether the encoding attribute
+             is set or not.
+    * element_type: the data type of the data elements, e.g., f16, f32.
+
+    Similar to the builtin tensor, it also provides an optional attribute to encode
+    the following information via the TensorDescAttr object:
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+                  global memory or shared memory. It defaults to Global.
+    * array_length (int): [optional] The number of contiguous blocks with size as `shape`
+                  that will be loaded by a block load at a time. It defaults to 1.
+    * boundary_check (bool): [optional] indicates whether the operation detects the
+                  boundary and pads with zero for out-of-boundary accesses. It defaults
+                  to true (boundary check enabled).
+
+
+    Syntax:
+
+    ```
+    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+    element-type ::= float-type | integer-type | index-type
+    dim-list := (static-dim-list `x`)?
+    static-dim-list ::= decimal-literal `x` decimal-literal
+    attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)?
+    ```
+
+    Examples:
+
+    ```mlir
+    // A block TensorDesc with 8x16 i32 elements
+    xegpu.tensor_desc<8x16xi32>
+
+    // A block TensorDesc with 8x16 f32 elements
+    xegpu.tensor_desc<8x16xf32>
+
+    // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
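+
+    // A (hypothetical) TensorDesc requesting two contiguous 8x16 blocks per
+    // block load; array_length is an IntegerAttr, so it prints with its type.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<array_length = 2 : i64>>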
+    ```
+  }];
+
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $encoding);
+
+  let extraClassDeclaration = [{
+    using TensorType::clone;
+    using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+    using mlir::ShapedType::Trait<TensorDescType>::getRank;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+    using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+    using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+    using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+    using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+    TensorDescType clone(::mlir::Type elementType) {
+      return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+    }
+
+    TensorDescAttr getEncodingAsTensorDescAttr() const {
+      return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+    }
+
+    xegpu::MemoryScopeKind getMemoryScope() const {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getMemoryScope())
+        return attr.getMemoryScope().getValue();
+      // return default value
+      return MemoryScopeKind::Global;
+    }
+
+    int getArrayLength() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getArrayLength())
+        return attr.getArrayLength().getInt();
+      // return default value
+      return 1;
+    }
+
+    bool getBoundaryCheck() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getBoundaryCheck())
+        return attr.getBoundaryCheck().getValue();
+      // return default value
+      return true;
+    }
+  }];
+
+  let hasCustomAssemblyFormat = true;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..bd72d5c17b6ea1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//

+#include
 #include
+#include
+#include

 namespace mlir {
 namespace xegpu {
@@ -26,8 +29,73 @@ void XeGPUDialect::initialize() {
   >();
 }

-// this file is for position occupation,
-// we will add functions in following PRs.
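+
+// For illustration (a hypothetical example), the custom form handled by the
+// TensorDescType parser/printer below looks like:
+//   !xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm>>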
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> encoding;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+    if (mlir::failed(encoding)) {
+      parser.emitError(parser.getCurrentLocation(),
+          "Failed to parse the attribute field for TensorDescType.\n");
+      return {};
+    }
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  return TensorDescType::get(parser.getContext(), shape, elementType,
+                             encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  auto shape = getShape();
+  for (int64_t dim : shape) {
+    if (mlir::ShapedType::isDynamic(dim))
+      printer << '?';
+    else
+      printer << dim;
+    printer << 'x';
+  }
+
+  printer << getElementType();
+
+  if (auto encoding = getEncoding())
+    printer << ", " << encoding;
+
+  printer << ">";
+}
+
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef28..74557eaca0869c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,14 +6,390 @@
 //
 //===----------------------------------------------------------------------===//

+#include
 #include
+#include
+#include

 #define DEBUG_TYPE "xegpu"

 namespace mlir {
 namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+bool printDefaultValues() {return false;}
+
+static size_t getRankOf(Value value) {
+  if (value.getType().isIntOrIndexOrFloat())
+    return 0;
+  if (auto ty = llvm::dyn_cast_if_present<MemRefType>(value.getType()))
+    return ty.getRank();
+  if (auto ty = llvm::dyn_cast_if_present<VectorType>(value.getType()))
+    return ty.getRank();
+  llvm_unreachable("Unsupported value for getRankOf");
+}
+
+static ParseResult
+parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser,
+                                     OperationState &result) {
+  // no optional attributes, return success
+  if (failed(parser.parseOptionalLBrace()))
+    return success();
+
+  llvm::SmallDenseSet<StringRef> seenKeys;
+  auto parseElt = [&]() -> ParseResult {
+    // The name of an attribute can either be a keyword, or a string.
+ // as compared to mlir::parseOptionalAttrList, the cases of using + // TOken::bare_identifier and Token::inttype as key maybe not handlered + std::string nameId; + auto loc = parser.getCurrentLocation(); + if (parser.parseOptionalKeywordOrString(&nameId)) + return parser.emitError(loc, "invalid attribute name: ") + << nameId << ".\n"; + + if (nameId.empty()) + return parser.emitError(loc, "expected valid attribute name"); + + if (!seenKeys.insert(nameId).second) + return parser.emitError(loc, "duplicate key '") + << nameId << "' in dictionary attribute."; + + // Lazy load a dialect in the context if there is a possible namespace. + auto splitName = StringRef(nameId).split('.'); + if (!splitName.second.empty()) + parser.getContext()->getOrLoadDialect(splitName.first); + + // Try to parse the '=' for the attribute value. + if (parser.parseEqual()) { + // If there is no '=', it is treated as a unit attribute. + result.addAttribute(nameId, parser.getBuilder().getUnitAttr()); + return success(); + } + + // for xegpu specific attributes + if (nameId == "mode") { + ModeKindAttr attr; + return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, + result.attributes); + } else if (nameId == "l1_hint" || nameId == "l2_hint" || + nameId == "l3_hint") { + CacheKindAttr attr; + return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, + result.attributes); + } else if (nameId == "transpose") { + // in form of [4, 5], acctually it is a copy of DenseI63ArrayAttr::parse() + if (succeeded(parser.parseOptionalLSquare())) { + Attribute attr; + // handle empty list case + if (succeeded(parser.parseOptionalRSquare())) { + attr = DenseI64ArrayAttr::get(parser.getContext(), {}); + } else { + attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{}); + if (failed(parser.parseRSquare())) + return failure(); + } + if (!attr) + return failure(); + result.addAttribute(nameId, attr); + return success(); + } else { + // in form of array + DenseI64ArrayAttr attr; + return parser.parseAttribute(attr, nameId, result.attributes); + } + } else { + Attribute attr; + return parser.parseAttribute(attr, nameId, result.attributes); + } + }; + + if (parser.parseCommaSeparatedList(parseElt)) + return failure(); + + return parser.parseRBrace(); +} + + +//===----------------------------------------------------------------------===// +// XeGPU_CreateNdDescOp +//===----------------------------------------------------------------------===// +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type TensorDesc, Value source, ValueRange offsets, + ValueRange shape, ValueRange strides, + llvm::ArrayRef static_offsets) { + auto offsetRank = static_offsets.size(); + auto shapeRank = shape.size() ? 
shape.size() : getRankOf(source); + + size_t dynOffsetRank = + std::count_if(static_offsets.begin(), static_offsets.end(), + [](int64_t d) { return ShapedType::isDynamic(d); }); + + // shape and strides should exists at the same time + // and the final rank for shape and offset (dynamic + static) + // should be the same + assert(shape.size() == strides.size() && shapeRank == offsetRank && + offsets.size() == dynOffsetRank); + + state.addOperands(source); + state.addOperands(offsets); + state.addOperands(shape); + state.addOperands(strides); + state.addAttribute( + getOperandSegmentSizesAttrName(state.name), + builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), + static_cast(shape.size()), + static_cast(strides.size())})); + state.addAttribute(getStaticOffsetsAttrName(state.name), + builder.getDenseI64ArrayAttr(static_offsets)); + state.addTypes(TensorDesc); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, Value source, + llvm::ArrayRef offsets) { + auto ty = llvm::dyn_cast_if_present(source.getType()); + assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); + + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, + ValueRange({}) /* empty dynamic shape */, + ValueRange({}) /* empty dynamic strides */, + staticOffsets /* static offsets */); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, Value source, + llvm::ArrayRef offsets, + ValueRange shape, ValueRange stride) { + assert(shape.size() && offsets.size() && stride.size() && + shape.size() == stride.size() && shape.size() == offsets.size()); + + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; + + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, + /* dynamic shape = */ shape , /* dynamic strides = */ stride, + /* static offsets = */ staticOffsets); +} + +ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { + // parse the source operand + llvm::SmallVector sourceOperands(1); + llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(sourceOperands[0])) + return failure(); + + // parse the offset operand, in format of [x, y] + llvm::SmallVector offsetsOperands; + DenseI64ArrayAttr static_offsetsAttr; + llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation(); + if (parseDynamicIndexList(parser, offsetsOperands, static_offsetsAttr)) + return failure(); + result.addAttribute("static_offsets", static_offsetsAttr); + + llvm::SmallVector shapeOperands; + llvm::SMLoc shapeOperandsLoc; + + llvm::SmallVector stridesOperands; + llvm::SMLoc stridesOperandsLoc; + // parse optional shape and strides, shape and strides should always come + // together + if (succeeded(parser.parseOptionalComma())) { + // parse shape part, in form of [x, y] + if (parser.parseLSquare()) + return failure(); + shapeOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(shapeOperands)) + return failure(); + if (parser.parseRSquare()) + return failure(); + + if (parser.parseComma()) + return failure(); + + // parse stride part, in form of [x, y] + if (parser.parseLSquare()) + return failure(); + stridesOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperandList(stridesOperands)) + return 
failure(); + if (parser.parseRSquare()) + return failure(); + } + + auto loc = parser.getCurrentLocation(); + if (parseOptionalAttrDictWithCustomAttrs(parser, result)) + return failure(); + + if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { + return parser.emitError(loc) + << "'" << result.name.getStringRef() << "' op "; + }))) + return failure(); + + if (parser.parseColon()) + return failure(); + + llvm::SmallVector sourceTypes(1); + if (parser.parseType(sourceTypes[0])) + return failure(); + + if (parser.parseArrow()) + return failure(); + + llvm::SmallVector TensorDescTypes(1); + if (parser.parseType(TensorDescTypes[0])) + return failure(); + result.addAttribute("operandSegmentSizes", + parser.getBuilder().getDenseI32ArrayAttr( + {1, static_cast(offsetsOperands.size()), + static_cast(shapeOperands.size()), + static_cast(stridesOperands.size())})); + + result.addTypes(TensorDescTypes); + if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc, + result.operands)) + return failure(); + + Type indexType = parser.getBuilder().getIndexType(); + if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc, + result.operands)) + return failure(); + if (parser.resolveOperands(shapeOperands, indexType, shapeOperandsLoc, + result.operands)) + return failure(); + if (parser.resolveOperands(stridesOperands, indexType, stridesOperandsLoc, + result.operands)) + return failure(); + return success(); +} + +void CreateNdDescOp::print(OpAsmPrinter &printer) { + printer << ' '; + printer << getSource(); + printDynamicIndexList(printer, *this, getDynamicOffsets(), + getStaticOffsetsAttr()); + if (!getDynamicShape().empty()) { + printer << ","; + printer << ' ' << "["; + printer << getDynamicShape(); + printer << "]"; + } + + if (!getDynamicStrides().empty()) { + printer << ","; + printer << ' ' << "["; + printer << getDynamicStrides(); + printer << "]"; + } + + llvm::SmallVector elidedAttrs; + elidedAttrs.push_back("static_offsets"); + elidedAttrs.push_back("operandSegmentSizes"); + printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); + printer << ' ' << ":"; + printer << ' '; + printer << getSourceType(); + printer << ' ' << "->"; + printer << ' '; + printer << getType(); +} + +LogicalResult CreateNdDescOp::verify() { + auto offsetRank = getOffsets().size(); + auto shapeRank = getShape().size(); + auto stridesRank = getStrides().size(); + auto baseRank = getRankOf(getSource()) ? 
getRankOf(getSource()) : 2;

  if (offsetRank != shapeRank || shapeRank != stridesRank ||
      shapeRank != baseRank)
    return emitOpError(
        "Expecting the ranks of shape, strides, offsets and the memref type "
        "to match with each other (they currently should be 2D).");
  return success();
}

// compute consolidated offsets from the dynamic_offsets and static_offsets parameters
llvm::SmallVector<OpFoldResult> CreateNdDescOp::getOffsets() {
  llvm::SmallVector<OpFoldResult> offsets;
  auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable
  auto staticOffsets = getStaticOffsets();   // static_offsets attribute

  // in case static_offsets is missing, dynamic_offsets will be used
  if (staticOffsets.size() == 0) {
    offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end());
    return offsets;
  }

  // use the static offset for each dim if it has a valid value,
  // otherwise use the value from dynamic_offsets
  for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) {
    if (ShapedType::isDynamic(staticOffsets[i])) {
      assert(j < dynamicOffsets.size());
      offsets.push_back(dynamicOffsets[j++]);
    } else {
      auto ty = IndexType::get(getContext());
      auto attr = IntegerAttr::get(ty, staticOffsets[i]);
      offsets.push_back(attr);
    }
  }
  return offsets;
}

// get the consolidated shape of the 2D memory region.
// It prefers dynamic_shape over the static shape of the
// memref type.
llvm::SmallVector<OpFoldResult> CreateNdDescOp::getShape() {
  llvm::SmallVector<OpFoldResult> shape;
  auto dynShape = getDynamicShape();
  if (dynShape.size()) {
    shape.append(dynShape.begin(), dynShape.end());
    return shape;
  }

  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
  if (ty && ty.hasStaticShape()) {
    for (auto dim : ty.getShape()) {
      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
      shape.push_back(attr);
    }
    return shape;
  }

  this->emitError("The shape information of the memory is missing.\n");
  return {};
}
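
// For illustration (hypothetical IR, mirroring the tests in this series): for
//   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> ...
// getOffsets() returns {%x, %y}, getShape() returns {%h, %w}, and
// getStrides() returns {%w, %c1}.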
// get the consolidated strides of the 2D memory region.
// It prefers dynamic_strides over the static strides of the
// memref type.
llvm::SmallVector<OpFoldResult> CreateNdDescOp::getStrides() {
  llvm::SmallVector<OpFoldResult> strides;

  auto dynStrides = getDynamicStrides();
  if (dynStrides.size()) {
    strides.append(dynStrides.begin(), dynStrides.end());
    return strides;
  }

  auto ty = llvm::dyn_cast_if_present<MemRefType>(getSourceType());
  if (ty && ty.hasStaticShape()) {
    auto [staticStrides, offset] = getStridesAndOffset(ty);
    for (auto dim : staticStrides) {
      auto attr = IntegerAttr::get(IndexType::get(getContext()), dim);
      strides.push_back(attr);
    }
    return strides;
  }

  this->emitError("The strides information of the memory is missing.\n");
  return {};
}
 } // namespace xegpu
 } // namespace mlir

From facb3b40613319915d52a7c6e5f539b686085535 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Sun, 10 Mar 2024 12:57:06 -0500
Subject: [PATCH 02/19] add prefetch_nd, load_nd, and store_nd

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  53 +--
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  92 +++++-
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td      |   5 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 310 +++++++-----------
 4 files changed, 215 insertions(+), 245 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 6e4c1bce6d0d59..cd38549f1ccf43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -20,14 +20,14 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],

 def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
   let parameters = (ins
-    OptionalParameter<"MemoryScopeKindAttr">: $memory_scope,
+    OptionalParameter<"MemoryScopeAttr">: $memory_scope,
     OptionalParameter<"IntegerAttr", "1">: $array_length,
     OptionalParameter<"BoolAttr", "true">: $boundary_check
   );

   let builders = [
     AttrBuilder<(ins
-      CArg<"xegpu::MemoryScopeKind", "xegpu::MemoryScopeKind::Global">:$memory_scope,
+      CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
       CArg<"int", "1">:$array_length,
       CArg<"bool", "true">: $boundary_check
     )>
@@ -41,7 +41,7 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
 //===----------------------------------------------------------------------===//
 def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
 def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
-def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind",
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
       "The address space of the memory the tensor descriptor is created for",
       [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
   let genSpecializedAttr = 0;
@@ -49,47 +49,30 @@ def XeGPU_MemoryScopeKind: I32EnumAttr<"MemoryScopeKind",
 }

 def XeGPU_MemoryScopeAttr:
-  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScopeKind, "memory_scope"> {
-    let assemblyFormat = "`<` $value `>`";
-}
-
-//===----------------------------------------------------------------------===//
-// XeGPU Operator Mode Enums.
-//===----------------------------------------------------------------------===//
-def XeGPU_OpModeSIMT : I32EnumAttrCase<"SIMT", 0, "simt">;
-def XeGPU_OpModeVectorCompute : I32EnumAttrCase<"VectorCompute", 1, "vc">;
-def XeGPU_ModeKind : I32EnumAttr<"ModeKind",
-      "The Mode an operator runs on",
-      [XeGPU_OpModeSIMT, XeGPU_OpModeVectorCompute]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::xegpu";
-}
-
-def XeGPU_ModeAttr:
-  EnumAttr<XeGPU_Dialect, XeGPU_ModeKind, "mode"> {
-    let assemblyFormat = "`<` $value `>`";
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+    let assemblyFormat = "$value";
 }

 //===----------------------------------------------------------------------===//
 // XeGPU Cache Enums.
 //===----------------------------------------------------------------------===//
-def XeGPU_CacheKindCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
-def XeGPU_CacheKindUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
-def XeGPU_CacheKindStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
-def XeGPU_CacheKindInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
-def XeGPU_CacheKindWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
-def XeGPU_CacheKindWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
-
-def XeGPU_CacheKind : I32EnumAttr<"CacheKind", "Cache kind",
-  [XeGPU_CacheKindCached, XeGPU_CacheKindUncached,
-   XeGPU_CacheKindStreaming, XeGPU_CacheKindInvalid,
-   XeGPU_CacheKindWriteBack, XeGPU_CacheKindWriteThrough]> {
+def XeGPU_CachePolicyCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
+def XeGPU_CachePolicyUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
+def XeGPU_CachePolicyStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
+def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
+def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
+def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
+
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+  [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
+   XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+   XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }

-def XeGPU_CacheAttr
-  : EnumAttr<XeGPU_Dialect, XeGPU_CacheKind, "cache_kind"> {
+def XeGPU_CacheHintAttr
+  : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
     let assemblyFormat = "`<` $value `>`";
 }
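+
+// For illustration, a cache hint attribute is written as, e.g.,
+// #xegpu.cache_hint<cached> or #xegpu.cache_hint<write_back>
+// (these exact forms appear in the tests added later in this series).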

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index a321d36f2ae271..dd3719f101e8c8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -88,14 +88,21 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
   }];

-  let arguments = (ins XeGPU_BaseAddrType: $source,
-                   Variadic<Index>: $dynamic_offsets,
-                   Variadic<Index>: $dynamic_shape,
-                   Variadic<Index>: $dynamic_strides,
-                   DenseI64ArrayAttr: $static_offsets);
-  let results = (outs XeGPU_TensorDesc:$TensorDesc);
-
-  let hasCustomAssemblyFormat = 1;
+  let arguments = (ins
+    XeGPU_BaseAddrType: $source,
+    Variadic<Index>: $dynamic_offsets,
+    Variadic<Index>: $dynamic_shape,
+    Variadic<Index>: $dynamic_strides,
+    DenseI64ArrayAttr: $static_offsets
+  );
+  let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+  let assemblyFormat = [{
+    $source ``
+    custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
+    (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
+    attr-dict `:` type($source) `->` type($TensorDesc)
+  }];
   let skipDefaultBuilders = 1;
   let hasVerifier = 1;

@@ -154,8 +161,77 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
       return getType().getShape();
     }
   }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+  let summary = "prefetches an n-D block to cache";
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+  //                                   l2_hint = #xegpu.cache_hint<cached>,
+  //                                   l3_hint = #xegpu.cache_hint<cached>}
+  //         : !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc)";
+}
+
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc)"
+                "to registers (represented by vector)";
+  let description = [{
+    LoadNDOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to register. It takes a set of cache hints,
+    one for each level of cache, L1, L2 and L3. If the hardware does not have
+    a corresponding cache, the corresponding cache hint attribute will be
+    masked. If both transpose and vnni_axis are present, the transpose is
+    assumed to be applied first, followed by the VNNI transform.
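+
+    Example (a rough sketch with hypothetical shapes, mirroring the tests
+    added later in this series; vnni_axis = 0 folds pairs of rows into the
+    innermost dimension):
+    %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>}
+         : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>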
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<I64Attr>: $vnni_axis,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];

+  // Format: xegpu.load_nd %1 {transpose = [1, 0],
+  //                           l1_hint = #xegpu.cache_hint<cached>,
+  //                           l2_hint = #xegpu.cache_hint<uncached>,
+  //                           l3_hint = #xegpu.cache_hint<streaming>}
+  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc) `->` type($value)";
+  let hasVerifier = 1;
+}

+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+  let summary = "stores an n-D block register region back to memory, currently only supports 2D";
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+  //                                l2_hint = #xegpu.cache_hint<write_back>,
+  //                                l3_hint = #xegpu.cache_hint<write_through>}
+  //         : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` type($TensorDesc)";
+  let hasVerifier = 1;
+}

 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 319e16b3ae326b..36b04ea12bcad0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -103,12 +103,12 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
       return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
     }

-    xegpu::MemoryScopeKind getMemoryScope() const {
+    xegpu::MemoryScope getMemoryScope() const {
       auto attr = getEncodingAsTensorDescAttr();
       if (attr && attr.getMemoryScope())
         return attr.getMemoryScope().getValue();
       // return default value
-      return MemoryScopeKind::Global;
+      return MemoryScope::Global;
     }

     int getArrayLength() {
@@ -129,6 +129,7 @@ def XeGPU_TensorDesc:
XeGPUTypeDef<"TensorDesc", "tensor_desc", }]; let hasCustomAssemblyFormat = true; + } #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 74557eaca0869c..727c241a027f77 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -28,86 +28,28 @@ static size_t getRankOf(Value value) { llvm_unreachable("Unsupported value for getRankOf"); } -static ParseResult -parseOptionalAttrDictWithCustomAttrs(OpAsmParser &parser, - OperationState &result) { - // no optional attributes, return success - if (failed(parser.parseOptionalLBrace())) - return success(); - - llvm::SmallDenseSet seenKeys; - auto parseElt = [&]() -> ParseResult { - // The name of an attribute can either be a keyword, or a string. - // as compared to mlir::parseOptionalAttrList, the cases of using - // TOken::bare_identifier and Token::inttype as key maybe not handlered - std::string nameId; - auto loc = parser.getCurrentLocation(); - if (parser.parseOptionalKeywordOrString(&nameId)) - return parser.emitError(loc, "invalid attribute name: ") - << nameId << ".\n"; - - if (nameId.empty()) - return parser.emitError(loc, "expected valid attribute name"); - - if (!seenKeys.insert(nameId).second) - return parser.emitError(loc, "duplicate key '") - << nameId << "' in dictionary attribute."; - - // Lazy load a dialect in the context if there is a possible namespace. - auto splitName = StringRef(nameId).split('.'); - if (!splitName.second.empty()) - parser.getContext()->getOrLoadDialect(splitName.first); - - // Try to parse the '=' for the attribute value. - if (parser.parseEqual()) { - // If there is no '=', it is treated as a unit attribute. - result.addAttribute(nameId, parser.getBuilder().getUnitAttr()); - return success(); - } - - // for xegpu specific attributes - if (nameId == "mode") { - ModeKindAttr attr; - return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, - result.attributes); - } else if (nameId == "l1_hint" || nameId == "l2_hint" || - nameId == "l3_hint") { - CacheKindAttr attr; - return parser.parseCustomAttributeWithFallback(attr, Type{}, nameId, - result.attributes); - } else if (nameId == "transpose") { - // in form of [4, 5], acctually it is a copy of DenseI63ArrayAttr::parse() - if (succeeded(parser.parseOptionalLSquare())) { - Attribute attr; - // handle empty list case - if (succeeded(parser.parseOptionalRSquare())) { - attr = DenseI64ArrayAttr::get(parser.getContext(), {}); - } else { - attr = DenseI64ArrayAttr::parseWithoutBraces(parser, Type{}); - if (failed(parser.parseRSquare())) - return failure(); - } - if (!attr) - return failure(); - result.addAttribute(nameId, attr); - return success(); - } else { - // in form of array - DenseI64ArrayAttr attr; - return parser.parseAttribute(attr, nameId, result.attributes); - } - } else { - Attribute attr; - return parser.parseAttribute(attr, nameId, result.attributes); - } - }; - - if (parser.parseCommaSeparatedList(parseElt)) - return failure(); - - return parser.parseRBrace(); +static void transpose(llvm::ArrayRef trans, + std::vector &shape) { + std::vector old = shape; + for (size_t i = 0; i < trans.size(); i++) + shape[i] = old[trans[i]]; } +template +static std::string makeString(T array, bool breakline = false) { + std::string buf; + buf.clear(); + llvm::raw_string_ostream os(buf); + os << "["; + for (size_t i = 1; i < array.size(); i++) { + os << array[i - 1] << ", "; + if (breakline) + os << "\n\t\t"; + } + os << 
array.back() << "]"; + os.flush(); + return buf; +} //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp @@ -176,128 +118,6 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, /* static offsets = */ staticOffsets); } -ParseResult CreateNdDescOp::parse(OpAsmParser &parser, OperationState &result) { - // parse the source operand - llvm::SmallVector sourceOperands(1); - llvm::SMLoc sourceOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperand(sourceOperands[0])) - return failure(); - - // parse the offset operand, in format of [x, y] - llvm::SmallVector offsetsOperands; - DenseI64ArrayAttr static_offsetsAttr; - llvm::SMLoc offsetsOperandsLoc = parser.getCurrentLocation(); - if (parseDynamicIndexList(parser, offsetsOperands, static_offsetsAttr)) - return failure(); - result.addAttribute("static_offsets", static_offsetsAttr); - - llvm::SmallVector shapeOperands; - llvm::SMLoc shapeOperandsLoc; - - llvm::SmallVector stridesOperands; - llvm::SMLoc stridesOperandsLoc; - // parse optional shape and strides, shape and strides should always come - // together - if (succeeded(parser.parseOptionalComma())) { - // parse shape part, in form of [x, y] - if (parser.parseLSquare()) - return failure(); - shapeOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperandList(shapeOperands)) - return failure(); - if (parser.parseRSquare()) - return failure(); - - if (parser.parseComma()) - return failure(); - - // parse stride part, in form of [x, y] - if (parser.parseLSquare()) - return failure(); - stridesOperandsLoc = parser.getCurrentLocation(); - if (parser.parseOperandList(stridesOperands)) - return failure(); - if (parser.parseRSquare()) - return failure(); - } - - auto loc = parser.getCurrentLocation(); - if (parseOptionalAttrDictWithCustomAttrs(parser, result)) - return failure(); - - if (failed(verifyInherentAttrs(result.name, result.attributes, [&]() { - return parser.emitError(loc) - << "'" << result.name.getStringRef() << "' op "; - }))) - return failure(); - - if (parser.parseColon()) - return failure(); - - llvm::SmallVector sourceTypes(1); - if (parser.parseType(sourceTypes[0])) - return failure(); - - if (parser.parseArrow()) - return failure(); - - llvm::SmallVector TensorDescTypes(1); - if (parser.parseType(TensorDescTypes[0])) - return failure(); - result.addAttribute("operandSegmentSizes", - parser.getBuilder().getDenseI32ArrayAttr( - {1, static_cast(offsetsOperands.size()), - static_cast(shapeOperands.size()), - static_cast(stridesOperands.size())})); - - result.addTypes(TensorDescTypes); - if (parser.resolveOperands(sourceOperands, sourceTypes, sourceOperandsLoc, - result.operands)) - return failure(); - - Type indexType = parser.getBuilder().getIndexType(); - if (parser.resolveOperands(offsetsOperands, indexType, offsetsOperandsLoc, - result.operands)) - return failure(); - if (parser.resolveOperands(shapeOperands, indexType, shapeOperandsLoc, - result.operands)) - return failure(); - if (parser.resolveOperands(stridesOperands, indexType, stridesOperandsLoc, - result.operands)) - return failure(); - return success(); -} - -void CreateNdDescOp::print(OpAsmPrinter &printer) { - printer << ' '; - printer << getSource(); - printDynamicIndexList(printer, *this, getDynamicOffsets(), - getStaticOffsetsAttr()); - if (!getDynamicShape().empty()) { - printer << ","; - printer << ' ' << "["; - printer << getDynamicShape(); - printer << "]"; - } - - if (!getDynamicStrides().empty()) { - printer << ","; - 
printer << ' ' << "["; - printer << getDynamicStrides(); - printer << "]"; - } - - llvm::SmallVector elidedAttrs; - elidedAttrs.push_back("static_offsets"); - elidedAttrs.push_back("operandSegmentSizes"); - printer.printOptionalAttrDict((*this)->getAttrs(), elidedAttrs); - printer << ' ' << ":"; - printer << ' '; - printer << getSourceType(); - printer << ' ' << "->"; - printer << ' '; - printer << getType(); -} LogicalResult CreateNdDescOp::verify() { auto offsetRank = getOffsets().size(); @@ -391,6 +211,96 @@ llvm::SmallVector CreateNdDescOp::getStrides() { return {}; } +//===----------------------------------------------------------------------===// +// XeGPU_LoadNDOp +//===----------------------------------------------------------------------===// +LogicalResult LoadNDOp::verify() { + auto tdescTy = getTensorDescType(); + auto valueTy = getType(); + + if (tdescTy.getRank() != 2) + return emitOpError( + "The TensorDesc for LoadNDOp should be a 2D TensorDesc."); + + if (!valueTy) + return emitOpError("Invalid result, it should be a VectorType.\n"); + + auto tdescElemTy = tdescTy.getElementType(); + auto valueElemTy = valueTy.getElementType(); + + if (tdescElemTy != valueElemTy) + return emitOpError( + "Value should have the same element type as TensorDesc."); + + auto array_len = tdescTy.getArrayLength(); + auto tdescShape = tdescTy.getShape().vec(); + auto valueShape = valueTy.getShape().vec(); + + if (getTranspose()) { + auto trans = getTranspose().value(); + if (tdescShape.size() >= trans.size()) + transpose(trans, tdescShape); + else + emitWarning("Invalid transpose attr. It is ignored."); + } + + if (getVnniAxis()) { + auto axis = getVnniAxis().value(); + auto vnni_factor = valueShape.back(); + tdescShape[axis] /= vnni_factor; + tdescShape.push_back(vnni_factor); + } + + if (array_len > 1) { + auto it = tdescShape.begin(); + tdescShape.insert(it, array_len); + } + + if (tdescShape != valueShape) + return emitOpError("Result shape doesn't match TensorDesc shape.") + << "\nThe expected shape is " << makeString(tdescShape) << "." + << "\nBut the given shape is " << makeString(valueShape) << "." + << "\nIn VC mode, when VNNI is not enabled, the result should have " + << "the same shape (or transposed shape if transpose is enabled) " + << "as TensorDesc; \nwhen VNNI is enabled, the result should have " + << "one more dimention than the TensorDesc, with last dimention " + << "having vnni factor, \nbut having same number of total data " + << "elements. The vnni factor are typically calculated as " + << "simd_lane_width / elementTypeBitWidth. \nFor element type " + << "having more than 32 bits, vnni shouldn't be used. 
\nIn SIMT " + << "mode, the shape is derived from the mapping attributes.\n"; + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_StoreNDOp +//===----------------------------------------------------------------------===// +LogicalResult StoreNDOp::verify() { + auto dstTy = getTensorDesc().getType(); // Tile + // auto valTy = llvm::dyn_cast(getValue().getType()); // Vector + auto valTy = getValue().getType().cast(); // Vector + + if (dstTy.getRank() != 2) + return emitOpError( + "The TensorDesc for StoreNdOp should be a 2D TensorDesc."); + + if (!valTy) + return emitOpError("Invalid value operand, it should be a VectorType.\n"); + + auto dstElemTy = dstTy.getElementType(); + auto valElemTy = valTy.getElementType(); + + if (dstElemTy != valElemTy) { + return emitOpError("The elem type of the value doesn't " + "match the elem type of the TensorDesc.\n"); + } + + if (dstTy.getShape() != valTy.getShape()) + return emitOpError("The value shape doesn't match " + "the TensorDesc shape.\n"); + return success(); +} + } // namespace xegpu } // namespace mlir From 9ea71f80deff3af28fd473a481fd12c5a5ad9781 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Sun, 10 Mar 2024 13:15:50 -0500 Subject: [PATCH 03/19] add test cases --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 - mlir/test/Dialect/XeGPU/XeGPUOps.mlir | 57 ++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Dialect/XeGPU/XeGPUOps.mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 727c241a027f77..cabcf0bf071046 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -16,8 +16,6 @@ namespace mlir { namespace xegpu { -bool printDefaultValues() {return false;} - static size_t getRankOf(Value value) { if (value.getType().isIntOrIndexOrFloat()) return 0; diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir new file mode 100644 index 00000000000000..cfb22ce2b8942f --- /dev/null +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -0,0 +1,57 @@ +// RUN: mlir-opt %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// Verify the generic form can be parsed. 
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+  //CHECK: %[[C:.*]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> <8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <24x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> <24x32xf16>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[REG]] {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>} : <8x16xf16>
+  xegpu.prefetch_nd %1 {l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}: !xegpu.tensor_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: func @test_load_nd_vc({{.*}}) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
+       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: func @test_store_nd_vc({{.*}}) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}
+        : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  gpu.return
+}
+
+}
\ No newline at end of file

From fdd2253802801bcda61747e5e828ee40d5960508 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Sun, 10 Mar 2024 20:12:36 +0000
Subject: [PATCH 04/19] fix printformat issue and update testcases

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |  8 ++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 30 ++++++-------------
 mlir/test/Dialect/XeGPU/XeGPUOps.mlir         | 31 +++++++++++--------
 3 files changed, 31 insertions(+), 38 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index dd3719f101e8c8..9d37d77e03a0c5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -101,7 +101,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg
     $source ``
     custom<DynamicIndexList>($dynamic_offsets, $static_offsets)
     (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)?
-    attr-dict `:` type($source) `->` type($TensorDesc)
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
   }];
   let skipDefaultBuilders = 1;
   let hasVerifier = 1;
@@ -174,7 +174,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
   //                                   l2_hint = #xegpu.cache_hint<cached>,
   //                                   l3_hint = #xegpu.cache_hint<cached>}
   //         : !xegpu.tensor_desc<8x16xf16>
-  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc)";
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
 }

@@ -214,7 +214,7 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
   //                           l2_hint = #xegpu.cache_hint<uncached>,
   //                           l3_hint = #xegpu.cache_hint<streaming>}
   //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
-  let assemblyFormat = "$TensorDesc attr-dict `:` type($TensorDesc) `->` type($value)";
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
   let hasVerifier = 1;
 }

@@ -230,7 +230,7 @@ def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
   //                                l2_hint = #xegpu.cache_hint<write_back>,
   //                                l3_hint = #xegpu.cache_hint<write_through>}
   //         : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
-  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` type($TensorDesc)";
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
   let hasVerifier = 1;
 }

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cabcf0bf071046..a388db4f5c2dc6 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -255,18 +255,9 @@ LogicalResult LoadNDOp::verify() {
   }

   if (tdescShape != valueShape)
-    return emitOpError("Result shape doesn't match TensorDesc shape.")
-           << "\nThe expected shape is " << makeString(tdescShape) << "."
-           << "\nBut the given shape is " << makeString(valueShape) << "."
-           << "\nIn VC mode, when VNNI is not enabled, the result should have "
-           << "the same shape (or transposed shape if transpose is enabled) "
-           << "as TensorDesc; \nwhen VNNI is enabled, the result should have "
-           << "one more dimention than the TensorDesc, with last dimention "
-           << "having vnni factor, \nbut having same number of total data "
-           << "elements. The vnni factor are typically calculated as "
-           << "simd_lane_width / elementTypeBitWidth. \nFor element type "
-           << "having more than 32 bits, vnni shouldn't be used. \nIn SIMT "
-           << "mode, the shape is derived from the mapping attributes.\n";
+    return emitOpError() << "Result shape doesn't match TensorDesc shape. "
+                         << "The expected shape is " << makeString(tdescShape) << ". "
" + << "But the given shape is " << makeString(valueShape) << ".\n"; return success(); } @@ -274,28 +265,25 @@ LogicalResult LoadNDOp::verify() { // XeGPU_StoreNDOp //===----------------------------------------------------------------------===// LogicalResult StoreNDOp::verify() { - auto dstTy = getTensorDesc().getType(); // Tile - // auto valTy = llvm::dyn_cast(getValue().getType()); // Vector + auto dstTy = getTensorDesc().getType(); // Tile auto valTy = getValue().getType().cast(); // Vector if (dstTy.getRank() != 2) - return emitOpError( - "The TensorDesc for StoreNdOp should be a 2D TensorDesc."); + return emitOpError("Expecting a 2D TensorDesc shape.\n"); if (!valTy) - return emitOpError("Invalid value operand, it should be a VectorType.\n"); + return emitOpError("Exepcting a VectorType result.\n"); auto dstElemTy = dstTy.getElementType(); auto valElemTy = valTy.getElementType(); if (dstElemTy != valElemTy) { - return emitOpError("The elem type of the value doesn't " - "match the elem type of the TensorDesc.\n"); + return emitOpError() << "The element type of the value should " + "match the elementtype of the TensorDesc.\n"; } if (dstTy.getShape() != valTy.getShape()) - return emitOpError("The value shape doesn't match " - "the TensorDesc shape.\n"); + return emitOpError() << "The result shape should match the TensorDesc shape.\n"; return success(); } diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir index cfb22ce2b8942f..f9b3510beb4335 100644 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -8,7 +8,7 @@ gpu.module @test { // CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <8x16xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } @@ -17,40 +17,45 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) { gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> <8x16xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> gpu.return } +// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> <24x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr> gpu.return } -// CHECK-LABEL: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) { - // 
CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> <24x32xf16> - %1 = xegpu.create_nd_tdesc %src[%x, %y] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd %[[REG]] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : <8x16xf16> +// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.prefetch_nd %[[R0]] {l2_hint = #xegpu.cache_hint, li_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16> xegpu.prefetch_nd %1 {li_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}: !xegpu.tensor_desc<8x16xf16> gpu.return } -// CHECK-LABEL: func @test_load_nd_vc({{.*}}) { +// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, vnni_axis = 0 : i64} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> gpu.return } -// CHECK-LABEL: func @test_store_nd_vc({{.*}}) { +// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { + // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> %1 = arith.constant dense<1.0>: vector<24x32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} - : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> gpu.return } From ad27a81fd0fddbffb7e5b3529017f3c532b0db7d Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Sun, 10 Mar 2024 20:35:47 +0000 Subject: [PATCH 05/19] add XeGPU 2D block operators --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 4 + .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 61 ++++ .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 4 +- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 211 +++++++++++++ .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 104 ++++++- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 72 ++++- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 276 +++++++++++++++++- mlir/test/Dialect/XeGPU/XeGPUOps.mlir | 62 ++++ 8 files changed, 787 insertions(+), 7 deletions(-) create mode 100644 mlir/test/Dialect/XeGPU/XeGPUOps.mlir diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 7aaa4ecc7ee77a..8dc3ff78d25ede 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -9,7 +9,11 @@ #ifndef 
MLIR_DIALECT_XEGPU_IR_XEGPU_H #define MLIR_DIALECT_XEGPU_IR_XEGPU_H +#include +#include #include +#include +#include namespace mlir { namespace xegpu { diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index bb325c272e3324..cd38549f1ccf43 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -10,6 +10,7 @@ #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" +include "mlir/IR/EnumAttr.td" class XeGPUAttr traits = [], string baseCppClass = "::mlir::Attribute"> @@ -17,4 +18,64 @@ class XeGPUAttr traits = [], let mnemonic = attrMnemonic; } +def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { + let parameters = (ins + OptionalParameter<"MemoryScopeAttr">: $memory_scope, + OptionalParameter<"IntegerAttr", "1">: $array_length, + OptionalParameter<"BoolAttr", "true">: $boundary_check + ); + + let builders = [ + AttrBuilder<(ins + CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"int", "1">:$array_length, + CArg<"bool", "true">: $boundary_check + )> + ]; + + let assemblyFormat = "`<` struct(params) `>`"; +} + +//===----------------------------------------------------------------------===// +// XeGPU Memory Scope Enums. +//===----------------------------------------------------------------------===// +def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">; +def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">; +def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", + "The address space of the memory the tensor descritor is created for", + [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; +} + +def XeGPU_MemoryScopeAttr: + EnumAttr { + let assemblyFormat = "$value"; +} + +//===----------------------------------------------------------------------===// +// XeGPU Cache Enums. +//===----------------------------------------------------------------------===// +def XeGPU_CachePolicyCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write +def XeGPU_CachePolicyUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write +def XeGPU_CachePolicyStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only +def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only +def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only +def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only + +def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", + [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, + XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid, + XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::xegpu"; +} + +def XeGPU_CacheHintAttr + : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + + + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td index 3851275ad30a0a..c2f09319c790e0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td @@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect { the lower-level GPU compiler. 
}];
 
-  // let useDefaultTypePrinterParser = true;
-  // let useDefaultAttributePrinterParser = true;
+  let useDefaultTypePrinterParser = true;
+  let useDefaultAttributePrinterParser = true;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..9d37d77e03a0c5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -12,6 +12,22 @@ include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/OpAsmInterface.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/BuiltinTypes.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
+include "mlir/Interfaces/CastInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/CopyOpInterface.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ShapedOpInterfaces.td"
 
 
 // Base class for dialect operations. This operation inherits from the base
@@ -23,4 +39,199 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
           Op<XeGPU_Dialect, mnemonic, traits>;
 
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> {
+
+  let summary = "create nd tensor descriptor operation";
+  let description = [{
+    The "create_nd_tdesc" operation creates a TensorDescType which represents
+    a sub-view of a 2D memory region (it can be extended to support N-D memory
+    regions if needed in the future). Elements in the subview are contiguous in
+    each dimension. It encodes the following important information for supporting
+    Intel hardware features:
+
+    * source: an object representing (the starting address/pointer of) a 2D memory region.
+      It can be either a 2D memref object, or simply a pointer represented by a uint64_t type.
+      For the latter case, the shape and layout information of the 2D memory region must
+      be explicitly passed via the `dynamic_shape` and `dynamic_strides` parameters.
+    * offsets: two index values representing offsets from the "source" in each dimension,
+      at which the subview of the target memory will be created. It is encoded via two
+      variables, "dynamic_offsets" and "static_offsets", such that it can
+      accept various forms, such as operands (e.g., [%c0, %c1]) and attributes (e.g., [2, 4]).
+    * shape: the shape information of the memory region pointed to by the "source". It is
+      typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+      But if "source" is simply a pointer represented as a uint64_t type, or a memref
+      type without shape information, e.g., memref<?x?xf16>, the shape information has
+      to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
+      only accepts operands (e.g., [%c4096, %c4096]), not attributes (e.g., [4096, 4096]).
+    * strides: the strides of the memory region pointed to by the "source". Similar to shape,
+      they are typically encoded via the MemRefType of the source too. But if "source" is
+      simply a pointer represented as a uint64_t type, or a memref type without shape
+      information, e.g., memref<?x?xf16>, the strides information has to be explicitly
+      passed via the "dynamic_strides" argument, and it likewise currently accepts only operands.
+ + Example 1 (suppose the tensor shape inferred by the compiler is 8x16): + %0 = memref.alloc() : memref<1024x1024xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32> + + Example 2 (suppose the tensor shape inferred by the compiler is 8x16): + %0 = memref.alloc(%h, %w) : memref + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref -> TensorDesc<8x16xf32> + + Example 3 (suppose the tensor shape inferred by the compiler is 8x16): + %0 = ... : ui64 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32> + }]; + + let arguments = (ins + XeGPU_BaseAddrType: $source, + Variadic: $dynamic_offsets, + Variadic: $dynamic_shape, + Variadic: $dynamic_strides, + DenseI64ArrayAttr: $static_offsets + ); + let results = (outs XeGPU_TensorDesc: $TensorDesc); + + let assemblyFormat = [{ + $source `` + custom($dynamic_offsets, $static_offsets) + (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)? + attr-dict `:` type($source) `->` qualified(type($TensorDesc)) + }]; + let skipDefaultBuilders = 1; + let hasVerifier = 1; + + let builders = [ + OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, + "ValueRange": $shape, "ValueRange": $strides, + "llvm::ArrayRef": $static_offsets)>, + + OpBuilder<(ins "Type": $tdesc, "Value": $source, + "llvm::ArrayRef": $offsets)>, + + OpBuilder<(ins "Type": $tdesc, "Value": $source, + "llvm::ArrayRef": $offsets, + "ValueRange": $shape, "ValueRange": $stride)> + ]; + + let extraClassDeclaration = [{ + /// Returns the type of the source memref operand. + Type getSourceType() { + return getSource().getType(); + } + + /// Returns the type of the result TensorDesc. + xegpu::TensorDescType getType() { + return getTensorDesc().getType(); + } + + /// Returns the offsets info to the source. It consolidates + /// information from both dynamic_offsets and static_offsets + /// parameters. static_offsets parameter always has the expected + /// ranks with some dim could have ShapeType::kDynamic value + /// indicating the corresponding value should be from dynamic_offsets. + llvm::SmallVector getOffsets(); + + /// returns the shape info of the source. It is either from the + /// memref type, if source is a memref with static shape + /// information or from the dynamic_shape parameter. If both + /// exists, the dynamic_shape parameter will be used and the + /// shape information from memref type will be ignored. + llvm::SmallVector getShape(); + + /// returns the strides info of the source. It is either from the + /// memref type, if source is a memref with static shape + /// information or from the dynamic_stride parameter. If both + /// exists, the dynamic_strides parameter will be used and the + /// strides information from memref type will be ignored. 
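+    /// For example (an illustrative sketch, not a normative contract): for
+    /// a source of type memref<1024x1024xf32> with no dynamic strides
+    /// supplied, the consolidated result would be the memref's canonical
+    /// strides [1024, 1]; if dynamic strides such as [%w, %c1] are present,
+    /// those operands are returned instead.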
+    llvm::SmallVector getStrides();
+
+    /// Return the element type of the TensorDesc
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+    /// Return the shape of the TensorDesc
+    llvm::ArrayRef getTensorDescShape() {
+      return getType().getShape();
+    }
+  }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+  let summary = "prefetches an n-D block into the cache";
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint,
+  //                                   l2_hint = #xegpu.cache_hint,
+  //                                   l3_hint = #xegpu.cache_hint}
+  //         : !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
+}
+
+
+def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc) "
+                "to registers (represented by vector)";
+  let description = [{
+    LoadNDOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to registers. It takes a set of cache hints,
+    one for each level of cache, L1, L2 and L3. If the hardware does not have
+    a corresponding cache, the corresponding cache hint attribute will be
+    masked. If both transpose and vnni_axis are present at the same time, the
+    transpose is assumed to be performed first, followed by the vnni transform.
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<I64Attr>: $vnni_axis,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  // Format: xegpu.load_nd %1 {transpose = [1, 0],
+  //                           l1_hint = #xegpu.cache_hint,
+  //                           l2_hint = #xegpu.cache_hint,
+  //                           l3_hint = #xegpu.cache_hint}
+  //         : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+  let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+  let hasVerifier = 1;
+}
+
+def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> {
+  let summary = "stores an n-D block from registers back to memory; currently only 2D is supported";
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint,
+  //                                l2_hint = #xegpu.cache_hint,
+  //                                l3_hint = #xegpu.cache_hint}
+  //         : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
+  let hasVerifier = 1;
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..36b04ea12bcad0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 
-include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
 
 def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
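As a concrete illustration of the vnni_axis rule in the LoadNDOp definition above, the following sketch (illustrative only; it mirrors the verifier logic and the testcases in this series) shows how an 8x16xf16 block loads as a 4x16x2 vector:

```mlir
// With vnni_axis = 0, the verifier divides tdescShape[0] by the vnni
// factor taken from the result's trailing dimension (2 here) and then
// appends that factor, so 8x16xf16 yields vector<4x16x2xf16>.
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.load_nd %0 {vnni_axis = 0} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
```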
@@ -30,4 +30,106 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }
 
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+        [ShapedTypeInterface], "::mlir::TensorType"> {
+  let summary = "TensorDesc describing regions of interest in the data.";
+  let description = [{
+    TensorDesc is a type designed to describe a region of data of interest, as well as
+    some features that are unique to Intel hardware. Unlike the builtin tensor type in
+    MLIR, it essentially contains only metadata and does not hold the data itself. It is
+    designed mainly to support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPUs.
+    It encodes the following information:
+
+    * shape:  the sizes/shape of the data block of interest, e.g., 8x16 means 8 rows
+              and each row contains 16 contiguous data elements. The rows may be
+              either contiguous or not, depending on whether the encoding attribute
+              is set.
+    * element_type: the data type of the data elements, e.g., f16, f32.
+
+    Similar to the builtin tensor, it also provides an optional attribute encoding
+    the following information via the TensorDescAttr object:
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+                global memory or shared memory. It defaults to Global.
+    * array_length (int): [optional] the number of contiguous blocks of size `shape`
+               that will be loaded by a block load at a time. It defaults to 1.
+    * boundary_check (bool): [optional] indicates whether the operation detects the boundary
+                and pads with zero for out-of-boundary access. Boundary checking is on by default.
+
+
+    Syntax:
+
+    ```
+    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+    element-type ::= float-type | integer-type | index-type
+    dim-list := (static-dim-list `x`)?
+    static-dim-list ::= decimal-literal `x` decimal-literal
+    attr-list = (, memory_scope = value)? (, array_length = value)? (, boundary_check = value)?
+    ```
+
+    Examples:
+
+    ```mlir
+    // A block TensorDesc with 8x16 i32 elements
+    xegpu.tensor_desc<8x16xi32>
+
+    // A block TensorDesc with 8x16 f32 elements
+    xegpu.tensor_desc<8x16xf32>
+
+    // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+    ```
+  }];
+
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $encoding);
+
+  let extraClassDeclaration = [{
+    using TensorType::clone;
+    using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+    using mlir::ShapedType::Trait<TensorDescType>::getRank;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+    using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+    using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+    using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+    using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+    TensorDescType clone(::mlir::Type elementType) {
+      return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+    }
+
+    TensorDescAttr getEncodingAsTensorDescAttr() const {
+      return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+    }
+
+    xegpu::MemoryScope getMemoryScope() const {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getMemoryScope())
+        return attr.getMemoryScope().getValue();
+      // return default value
+      return MemoryScope::Global;
+    }
+
+    int getArrayLength() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getArrayLength())
+        return attr.getArrayLength().getInt();
+      // return default value
+      return 1;
+    }
+
+    bool getBoundaryCheck() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getBoundaryCheck())
+        return attr.getBoundaryCheck().getValue();
+      // return default value
+      return true;
+    }
+  }];
+
+  let hasCustomAssemblyFormat = true;
+
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..bd72d5c17b6ea1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include 
 #include 
+#include 
+#include 
 
 namespace mlir {
 namespace xegpu {
@@ -26,8 +29,73 @@ void XeGPUDialect::initialize() {
   >();
 }
 
-// this file is for position occupation,
-// we will add functions in following PRs.
+ +//===----------------------------------------------------------------------===// +// XeGPU_TensorDescAttr +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// XeGPU_TensorDescType +//===----------------------------------------------------------------------===// +mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { + llvm::SmallVector shape; + mlir::Type elementType; + mlir::FailureOr encoding; + + // Parse literal '<' + if (parser.parseLess()) + return {}; + + auto shapeLoc = parser.getCurrentLocation(); + if (mlir::failed(parser.parseDimensionList(shape))) { + parser.emitError(shapeLoc, "failed to parse parameter 'shape'"); + return {}; + } + + auto elemTypeLoc = parser.getCurrentLocation(); + if (mlir::failed(parser.parseType(elementType))) { + parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'"); + return {}; + } + + // parse optional attributes + if (mlir::succeeded(parser.parseOptionalComma())) { + encoding = mlir::FieldParser::parse(parser); + if (mlir::failed(encoding)) { + parser.emitError(parser.getCurrentLocation(), + "Failed to parse the attribute field for TensorDescType.\n"); + return {}; + } + } + + // Parse literal '>' + if (parser.parseGreater()) + return {}; + + return TensorDescType::get(parser.getContext(), shape, elementType, + encoding.value_or(mlir::Attribute())); +} + +void TensorDescType::print(::mlir::AsmPrinter &printer) const { + printer << "<"; + + auto shape = getShape(); + for (int64_t dim : shape) { + if (mlir::ShapedType::isDynamic(dim)) + printer << '?'; + else + printer << dim; + printer << 'x'; + } + + printer << getElementType(); + + if (auto encoding = getEncoding()) + printer << ", " << encoding; + + printer << ">"; +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 0e89ac4df6ef28..a388db4f5c2dc6 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -6,14 +6,286 @@ // //===----------------------------------------------------------------------===// +#include #include +#include +#include #define DEBUG_TYPE "xegpu" namespace mlir { namespace xegpu { -// this file is for position occupation, -// we will add functions in following PRs. 
+ +static size_t getRankOf(Value value) { + if (value.getType().isIntOrIndexOrFloat()) + return 0; + if (auto ty = llvm::dyn_cast_if_present(value.getType())) + return ty.getRank(); + if (auto ty = llvm::dyn_cast_if_present(value.getType())) + return ty.getRank(); + llvm_unreachable("Unsupported value for getRankOf"); +} + +static void transpose(llvm::ArrayRef trans, + std::vector &shape) { + std::vector old = shape; + for (size_t i = 0; i < trans.size(); i++) + shape[i] = old[trans[i]]; +} + +template +static std::string makeString(T array, bool breakline = false) { + std::string buf; + buf.clear(); + llvm::raw_string_ostream os(buf); + os << "["; + for (size_t i = 1; i < array.size(); i++) { + os << array[i - 1] << ", "; + if (breakline) + os << "\n\t\t"; + } + os << array.back() << "]"; + os.flush(); + return buf; +} + +//===----------------------------------------------------------------------===// +// XeGPU_CreateNdDescOp +//===----------------------------------------------------------------------===// +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type TensorDesc, Value source, ValueRange offsets, + ValueRange shape, ValueRange strides, + llvm::ArrayRef static_offsets) { + auto offsetRank = static_offsets.size(); + auto shapeRank = shape.size() ? shape.size() : getRankOf(source); + + size_t dynOffsetRank = + std::count_if(static_offsets.begin(), static_offsets.end(), + [](int64_t d) { return ShapedType::isDynamic(d); }); + + // shape and strides should exists at the same time + // and the final rank for shape and offset (dynamic + static) + // should be the same + assert(shape.size() == strides.size() && shapeRank == offsetRank && + offsets.size() == dynOffsetRank); + + state.addOperands(source); + state.addOperands(offsets); + state.addOperands(shape); + state.addOperands(strides); + state.addAttribute( + getOperandSegmentSizesAttrName(state.name), + builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), + static_cast(shape.size()), + static_cast(strides.size())})); + state.addAttribute(getStaticOffsetsAttrName(state.name), + builder.getDenseI64ArrayAttr(static_offsets)); + state.addTypes(TensorDesc); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, Value source, + llvm::ArrayRef offsets) { + auto ty = llvm::dyn_cast_if_present(source.getType()); + assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); + + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, + ValueRange({}) /* empty dynamic shape */, + ValueRange({}) /* empty dynamic strides */, + staticOffsets /* static offsets */); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, Value source, + llvm::ArrayRef offsets, + ValueRange shape, ValueRange stride) { + assert(shape.size() && offsets.size() && stride.size() && + shape.size() == stride.size() && shape.size() == offsets.size()); + + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; + + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, + /* dynamic shape = */ shape , /* dynamic strides = */ stride, + /* static offsets = */ staticOffsets); +} + + +LogicalResult CreateNdDescOp::verify() { + auto offsetRank = getOffsets().size(); + auto shapeRank = getShape().size(); + 
auto stridesRank = getStrides().size(); + auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2; + + if (offsetRank != shapeRank || shapeRank != stridesRank || + shapeRank != baseRank) + + return emitOpError( + "Expecting the rank of shape, strides, offsets and memref type " + "should match with each other (they currently should be 2D)."); + return success(); +} + +// compute consolidated offsets from dynamic_offsets and static_offsets parameters +llvm::SmallVector CreateNdDescOp::getOffsets() { + llvm::SmallVector offsets; + auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable + auto staticOffsets = getStaticOffsets(); // static_offsets attribute + + // in case static_offsets is missing, dynamic_offsets will be used + if (staticOffsets.size() == 0) { + offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end()); + return offsets; + } + + // use static offsets for each dim if it has valid value, + // othwise use the value from dynamic_offsets + for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) { + if (ShapedType::isDynamic(staticOffsets[i])) { + assert(j < dynamicOffsets.size()); + offsets.push_back(dynamicOffsets[j++]); + } else { + auto ty = IndexType::get(getContext()); + auto attr = IntegerAttr::get(ty, staticOffsets[i]); + offsets.push_back(attr); + } + } + return offsets; +} + +// get the consolidated shape of the 2D memory region. +// It prefer dynamic_shape than the static shape of +// memref type. +llvm::SmallVector CreateNdDescOp::getShape() { + llvm::SmallVector shape; + auto dynShape = getDynamicShape(); + if (dynShape.size()) { + shape.append(dynShape.begin(), dynShape.end()); + return shape; + } + + auto ty = llvm::dyn_cast_if_present(getSourceType()); + if (ty && ty.hasStaticShape()) { + for (auto dim : ty.getShape()) { + auto attr = IntegerAttr::get(IndexType::get(getContext()), dim); + shape.push_back(attr); + } + return shape; + } + + this->emitError("The shape information of the memory is missing.\n"); + return {}; +} + +// get the consolidated strides of the 2D memory region. +// It prefer dynamic_stride than the static strides of +// memref type. 
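+// For example (a sketch of the intended behavior): for a memref<24x32xf32>
+// source with no dynamic strides, this returns the canonical strides [32, 1]
+// computed via getStridesAndOffset; for a ui64 source, dynamic strides must
+// be supplied, otherwise the error below is emitted.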
+llvm::SmallVector CreateNdDescOp::getStrides() { + llvm::SmallVector strides; + + auto dynStrides = getDynamicStrides(); + if (dynStrides.size()) { + strides.append(dynStrides.begin(), dynStrides.end()); + return strides; + } + + auto ty = llvm::dyn_cast_if_present(getSourceType()); + if (ty && ty.hasStaticShape()) { + auto [staticStrides, offset] = getStridesAndOffset(ty); + for (auto dim : staticStrides) { + auto attr = IntegerAttr::get(IndexType::get(getContext()), dim); + strides.push_back(attr); + } + return strides; + } + + this->emitError("The strides information of the memory is missing.\n"); + return {}; +} + +//===----------------------------------------------------------------------===// +// XeGPU_LoadNDOp +//===----------------------------------------------------------------------===// +LogicalResult LoadNDOp::verify() { + auto tdescTy = getTensorDescType(); + auto valueTy = getType(); + + if (tdescTy.getRank() != 2) + return emitOpError( + "The TensorDesc for LoadNDOp should be a 2D TensorDesc."); + + if (!valueTy) + return emitOpError("Invalid result, it should be a VectorType.\n"); + + auto tdescElemTy = tdescTy.getElementType(); + auto valueElemTy = valueTy.getElementType(); + + if (tdescElemTy != valueElemTy) + return emitOpError( + "Value should have the same element type as TensorDesc."); + + auto array_len = tdescTy.getArrayLength(); + auto tdescShape = tdescTy.getShape().vec(); + auto valueShape = valueTy.getShape().vec(); + + if (getTranspose()) { + auto trans = getTranspose().value(); + if (tdescShape.size() >= trans.size()) + transpose(trans, tdescShape); + else + emitWarning("Invalid transpose attr. It is ignored."); + } + + if (getVnniAxis()) { + auto axis = getVnniAxis().value(); + auto vnni_factor = valueShape.back(); + tdescShape[axis] /= vnni_factor; + tdescShape.push_back(vnni_factor); + } + + if (array_len > 1) { + auto it = tdescShape.begin(); + tdescShape.insert(it, array_len); + } + + if (tdescShape != valueShape) + return emitOpError() <<"Result shape doesn't match TensorDesc shape." + << "The expected shape is " << makeString(tdescShape) << ". " + << "But the given shape is " << makeString(valueShape) << ".\n"; + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_StoreNDOp +//===----------------------------------------------------------------------===// +LogicalResult StoreNDOp::verify() { + auto dstTy = getTensorDesc().getType(); // Tile + auto valTy = getValue().getType().cast(); // Vector + + if (dstTy.getRank() != 2) + return emitOpError("Expecting a 2D TensorDesc shape.\n"); + + if (!valTy) + return emitOpError("Exepcting a VectorType result.\n"); + + auto dstElemTy = dstTy.getElementType(); + auto valElemTy = valTy.getElementType(); + + if (dstElemTy != valElemTy) { + return emitOpError() << "The element type of the value should " + "match the elementtype of the TensorDesc.\n"; + } + + if (dstTy.getShape() != valTy.getShape()) + return emitOpError() << "The result shape should match the TensorDesc shape.\n"; + return success(); +} } // namespace xegpu } // namespace mlir diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir new file mode 100644 index 00000000000000..f9b3510beb4335 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -0,0 +1,62 @@ +// RUN: mlir-opt %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// Verify the generic form can be parsed. 
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+  //CHECK: %[[C:.*]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: xegpu.prefetch_nd %[[R0]] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16>
+  xegpu.prefetch_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}: !xegpu.tensor_desc<8x16xf16>
+  gpu.return
+}
+
+// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, vnni_axis = 0 : i64} : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  %2 = xegpu.load_nd %1 {vnni_axis = 0, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}
+       : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  gpu.return
+}
+
+// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+  // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // CHECK: xegpu.store_nd %[[C]], %[[R0]] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  xegpu.store_nd %1, %2 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ 
gpu.return +} + +} \ No newline at end of file From 74bd038f61985874694c01023c16f04e070e1419 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Sun, 10 Mar 2024 20:38:46 +0000 Subject: [PATCH 06/19] run clang-format --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 5 ++-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 33 ++++++++++++---------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index bd72d5c17b6ea1..43337a6ab43dcd 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -29,7 +29,6 @@ void XeGPUDialect::initialize() { >(); } - //===----------------------------------------------------------------------===// // XeGPU_TensorDescAttr //===----------------------------------------------------------------------===// @@ -62,7 +61,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { if (mlir::succeeded(parser.parseOptionalComma())) { encoding = mlir::FieldParser::parse(parser); if (mlir::failed(encoding)) { - parser.emitError(parser.getCurrentLocation(), + parser.emitError( + parser.getCurrentLocation(), "Failed to parse the attribute field for TensorDescType.\n"); return {}; } @@ -96,7 +96,6 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { printer << ">"; } - } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index a388db4f5c2dc6..be631c4678eacb 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -8,8 +8,8 @@ #include #include -#include #include +#include #define DEBUG_TYPE "xegpu" @@ -112,11 +112,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, - /* dynamic shape = */ shape , /* dynamic strides = */ stride, + /* dynamic shape = */ shape, /* dynamic strides = */ stride, /* static offsets = */ staticOffsets); } - LogicalResult CreateNdDescOp::verify() { auto offsetRank = getOffsets().size(); auto shapeRank = getShape().size(); @@ -132,7 +131,8 @@ LogicalResult CreateNdDescOp::verify() { return success(); } -// compute consolidated offsets from dynamic_offsets and static_offsets parameters +// compute consolidated offsets from dynamic_offsets and static_offsets +// parameters llvm::SmallVector CreateNdDescOp::getOffsets() { llvm::SmallVector offsets; auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable @@ -144,7 +144,7 @@ llvm::SmallVector CreateNdDescOp::getOffsets() { return offsets; } - // use static offsets for each dim if it has valid value, + // use static offsets for each dim if it has valid value, // othwise use the value from dynamic_offsets for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) { if (ShapedType::isDynamic(staticOffsets[i])) { @@ -159,8 +159,8 @@ llvm::SmallVector CreateNdDescOp::getOffsets() { return offsets; } -// get the consolidated shape of the 2D memory region. -// It prefer dynamic_shape than the static shape of +// get the consolidated shape of the 2D memory region. +// It prefer dynamic_shape than the static shape of // memref type. 
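+// For example (an illustrative note): for a memref<24x32xf32> source with no
+// dynamic shape operands this returns [24, 32] as index attributes, while for
+// a ui64 source the dynamic shape operands (e.g., [%h, %w]) are required.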
llvm::SmallVector CreateNdDescOp::getShape() { llvm::SmallVector shape; @@ -178,13 +178,13 @@ llvm::SmallVector CreateNdDescOp::getShape() { } return shape; } - + this->emitError("The shape information of the memory is missing.\n"); return {}; } -// get the consolidated strides of the 2D memory region. -// It prefer dynamic_stride than the static strides of +// get the consolidated strides of the 2D memory region. +// It prefer dynamic_stride than the static strides of // memref type. llvm::SmallVector CreateNdDescOp::getStrides() { llvm::SmallVector strides; @@ -255,9 +255,11 @@ LogicalResult LoadNDOp::verify() { } if (tdescShape != valueShape) - return emitOpError() <<"Result shape doesn't match TensorDesc shape." - << "The expected shape is " << makeString(tdescShape) << ". " - << "But the given shape is " << makeString(valueShape) << ".\n"; + return emitOpError() << "Result shape doesn't match TensorDesc shape." + << "The expected shape is " << makeString(tdescShape) + << ". " + << "But the given shape is " << makeString(valueShape) + << ".\n"; return success(); } @@ -279,11 +281,12 @@ LogicalResult StoreNDOp::verify() { if (dstElemTy != valElemTy) { return emitOpError() << "The element type of the value should " - "match the elementtype of the TensorDesc.\n"; + "match the elementtype of the TensorDesc.\n"; } if (dstTy.getShape() != valTy.getShape()) - return emitOpError() << "The result shape should match the TensorDesc shape.\n"; + return emitOpError() + << "The result shape should match the TensorDesc shape.\n"; return success(); } From 778d4d2c09eed97231db300614387e6bd3fb1608 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 11 Mar 2024 23:18:35 +0000 Subject: [PATCH 07/19] sync for OffsetSizeAndStrideOpInterface --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 12 ++-- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 59 +++++++++++-------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 45 ++++++++------ 3 files changed, 67 insertions(+), 49 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 8dc3ff78d25ede..662fd7ef197414 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -9,11 +9,13 @@ #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H #define MLIR_DIALECT_XEGPU_IR_XEGPU_H -#include -#include -#include -#include -#include +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/Interfaces/ShapedOpInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" + namespace mlir { namespace xegpu { diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 9d37d77e03a0c5..d8eba0588c7c86 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -9,26 +9,13 @@ #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD +include "mlir/IR/AttrTypeBase.td" include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td" include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td" -include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/IR/AttrTypeBase.td" - - -include "mlir/IR/OpBase.td" -include "mlir/IR/OpAsmInterface.td" -include "mlir/IR/AttrTypeBase.td" -include "mlir/IR/BuiltinTypes.td" -include "mlir/IR/BuiltinTypeInterfaces.td" +include "mlir/Interfaces/ShapedOpInterfaces.td" include 
"mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" -include "mlir/Interfaces/CastInterfaces.td" -include "mlir/Interfaces/ControlFlowInterfaces.td" -include "mlir/Interfaces/CopyOpInterface.td" -include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/ShapedOpInterfaces.td" - // Base class for dialect operations. This operation inherits from the base // `Op` class in OpBase.td, and provides: @@ -39,7 +26,8 @@ class XeGPU_Op traits = []>: Op; -def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> { +def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, + AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { let summary = "create nd tensor descriptor operation"; let description = [{ @@ -90,17 +78,20 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $dynamic_offsets, - Variadic: $dynamic_shape, - Variadic: $dynamic_strides, - DenseI64ArrayAttr: $static_offsets + Variadic: $offsets, + Variadic: $shape, + Variadic: $strides, + DenseI64ArrayAttr: $static_offsets, + DefaultValuedAttr: $static_shape, + DefaultValuedAttr: $static_strides ); let results = (outs XeGPU_TensorDesc: $TensorDesc); let assemblyFormat = [{ $source `` - custom($dynamic_offsets, $static_offsets) - (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)? + custom($offsets, $static_offsets) + (`,` custom($shape, $static_shape)^ + `,` custom($strides, $static_strides))? attr-dict `:` type($source) `->` qualified(type($TensorDesc)) }]; let skipDefaultBuilders = 1; @@ -135,21 +126,21 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg /// parameters. static_offsets parameter always has the expected /// ranks with some dim could have ShapeType::kDynamic value /// indicating the corresponding value should be from dynamic_offsets. - llvm::SmallVector getOffsets(); + llvm::SmallVector getEffectiveOffsets(); /// returns the shape info of the source. It is either from the /// memref type, if source is a memref with static shape /// information or from the dynamic_shape parameter. If both /// exists, the dynamic_shape parameter will be used and the /// shape information from memref type will be ignored. - llvm::SmallVector getShape(); + llvm::SmallVector getEffectiveShape(); /// returns the strides info of the source. It is either from the /// memref type, if source is a memref with static shape /// information or from the dynamic_stride parameter. If both /// exists, the dynamic_strides parameter will be used and the /// strides information from memref type will be ignored. - llvm::SmallVector getStrides(); + llvm::SmallVector getEffectiveStrides(); /// Return the element type of the TensorDesc Type getElementType() { @@ -160,6 +151,24 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg llvm::ArrayRef getTensorDescShape() { return getType().getShape(); } + + /// Return the expected rank of each of the`static_offsets`, `static_sizes` + /// and `static_strides` attributes. + std::array getArrayAttrMaxRanks() { + return {2, 2, 2}; + } + + mlir::OperandRange getSizes() { + return getShape(); + } + + llvm::ArrayRef getStaticSizes() { + return getStaticShape(); + } + + /// Return the number of leading operands before the `offsets`, `sizes` and + /// and `strides` operands. 
+ static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } }]; } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index a388db4f5c2dc6..34d1a90a50a488 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -8,9 +8,10 @@ #include #include -#include #include +#include + #define DEBUG_TYPE "xegpu" namespace mlir { @@ -118,24 +119,30 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, LogicalResult CreateNdDescOp::verify() { - auto offsetRank = getOffsets().size(); - auto shapeRank = getShape().size(); - auto stridesRank = getStrides().size(); - auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2; - - if (offsetRank != shapeRank || shapeRank != stridesRank || - shapeRank != baseRank) - - return emitOpError( - "Expecting the rank of shape, strides, offsets and memref type " - "should match with each other (they currently should be 2D)."); + // auto offsetRank = getEffectiveOffsets().size(); + // auto shapeRank = getEffectiveShape().size(); + // auto stridesRank = getEffectiveStrides().size(); + // auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2; + + llvm::dbgs() << "\nNum of mixed Offsets: " << getMixedOffsets().size() + << "\nNum of mixed Sizes: " << getMixedSizes().size() + << "\nNum of mixed Strides: " << getMixedStrides().size() + << "\n"; + + // if (offsetRank != shapeRank || shapeRank != stridesRank || + // shapeRank != baseRank) + + // return emitOpError( + // "Expecting the rank of shape, strides, offsets and memref type " + // "should match with each other (they currently should be 2D)."); return success(); } -// compute consolidated offsets from dynamic_offsets and static_offsets parameters -llvm::SmallVector CreateNdDescOp::getOffsets() { +// compute consolidated offsets from dynamic_offsets and static_offsets +// parameters +llvm::SmallVector CreateNdDescOp::getEffectiveOffsets() { llvm::SmallVector offsets; - auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable + auto dynamicOffsets = getOffsets(); // offsets variable auto staticOffsets = getStaticOffsets(); // static_offsets attribute // in case static_offsets is missing, dynamic_offsets will be used @@ -162,9 +169,9 @@ llvm::SmallVector CreateNdDescOp::getOffsets() { // get the consolidated shape of the 2D memory region. // It prefer dynamic_shape than the static shape of // memref type. -llvm::SmallVector CreateNdDescOp::getShape() { +llvm::SmallVector CreateNdDescOp::getEffectiveShape() { llvm::SmallVector shape; - auto dynShape = getDynamicShape(); + auto dynShape = getShape(); if (dynShape.size()) { shape.append(dynShape.begin(), dynShape.end()); return shape; @@ -186,10 +193,10 @@ llvm::SmallVector CreateNdDescOp::getShape() { // get the consolidated strides of the 2D memory region. // It prefer dynamic_stride than the static strides of // memref type. 
-llvm::SmallVector CreateNdDescOp::getStrides() { +llvm::SmallVector CreateNdDescOp::getEffectiveStrides() { llvm::SmallVector strides; - auto dynStrides = getDynamicStrides(); + auto dynStrides = getStrides(); if (dynStrides.size()) { strides.append(dynStrides.begin(), dynStrides.end()); return strides; From 3c37828ce6bb54c1e4af99a3726eb898fd55b61b Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 00:13:55 +0000 Subject: [PATCH 08/19] sync --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 83 +++++++----- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 126 +++++++++--------- 2 files changed, 109 insertions(+), 100 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index d8eba0588c7c86..447ea2e0f3982b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -81,34 +81,31 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, Variadic: $offsets, Variadic: $shape, Variadic: $strides, - DenseI64ArrayAttr: $static_offsets, - DefaultValuedAttr: $static_shape, - DefaultValuedAttr: $static_strides + DenseI64ArrayAttr: $static_offsets ); let results = (outs XeGPU_TensorDesc: $TensorDesc); let assemblyFormat = [{ $source `` custom($offsets, $static_offsets) - (`,` custom($shape, $static_shape)^ - `,` custom($strides, $static_strides))? + (`,` `[` $shape^ `]` `,` `[` $strides `]`)? attr-dict `:` type($source) `->` qualified(type($TensorDesc)) }]; - let skipDefaultBuilders = 1; - let hasVerifier = 1; - - let builders = [ - OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, - "ValueRange": $shape, "ValueRange": $strides, - "llvm::ArrayRef": $static_offsets)>, - OpBuilder<(ins "Type": $tdesc, "Value": $source, - "llvm::ArrayRef": $offsets)>, + let hasVerifier = 1; - OpBuilder<(ins "Type": $tdesc, "Value": $source, - "llvm::ArrayRef": $offsets, - "ValueRange": $shape, "ValueRange": $stride)> - ]; +// let builders = [ +// OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, +// "ValueRange": $shape, "ValueRange": $strides, +// "llvm::ArrayRef": $static_offsets)>, +// +// OpBuilder<(ins "Type": $tdesc, "Value": $source, +// "llvm::ArrayRef": $offsets)>, +// +// OpBuilder<(ins "Type": $tdesc, "Value": $source, +// "llvm::ArrayRef": $offsets, +// "ValueRange": $shape, "ValueRange": $stride)> +// ]; let extraClassDeclaration = [{ /// Returns the type of the source memref operand. @@ -121,6 +118,35 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, return getTensorDesc().getType(); } + /// Return the element type of the TensorDesc + Type getElementType() { + return getType().getElementType(); + } + + + + /// Return the shape of the TensorDesc + llvm::ArrayRef getTensorDescShape() { + return getType().getShape(); + } + + OperandRange getSizes() { + return getShape(); + } + + SmallVector getStaticSizes() { + if (auto ty = getSourceType().dyn_cast()) { + return SmallVector(ty.getShape()); + } + } + + SmallVector getStaticStrides() { + if (auto ty = getSourceType().dyn_cast()) { + auto [strides, offset] = getStridesAndOffset(ty); + return strides; + } + } + /// Returns the offsets info to the source. It consolidates /// information from both dynamic_offsets and static_offsets /// parameters. static_offsets parameter always has the expected @@ -142,30 +168,13 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, /// strides information from memref type will be ignored. 
llvm::SmallVector getEffectiveStrides(); - /// Return the element type of the TensorDesc - Type getElementType() { - return getType().getElementType(); - } - /// Return the shape of the TensorDesc - llvm::ArrayRef getTensorDescShape() { - return getType().getShape(); - } - - /// Return the expected rank of each of the`static_offsets`, `static_sizes` - /// and `static_strides` attributes. + /// Return the expected rank of each of the`static_offsets`, + /// `static_sizes` and `static_strides` attributes. std::array getArrayAttrMaxRanks() { return {2, 2, 2}; } - - mlir::OperandRange getSizes() { - return getShape(); - } - llvm::ArrayRef getStaticSizes() { - return getStaticShape(); - } - /// Return the number of leading operands before the `offsets`, `sizes` and /// and `strides` operands. static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 34d1a90a50a488..5a2bc2d72bfd8c 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -53,69 +53,69 @@ static std::string makeString(T array, bool breakline = false) { //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// -void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type TensorDesc, Value source, ValueRange offsets, - ValueRange shape, ValueRange strides, - llvm::ArrayRef static_offsets) { - auto offsetRank = static_offsets.size(); - auto shapeRank = shape.size() ? shape.size() : getRankOf(source); - - size_t dynOffsetRank = - std::count_if(static_offsets.begin(), static_offsets.end(), - [](int64_t d) { return ShapedType::isDynamic(d); }); - - // shape and strides should exists at the same time - // and the final rank for shape and offset (dynamic + static) - // should be the same - assert(shape.size() == strides.size() && shapeRank == offsetRank && - offsets.size() == dynOffsetRank); - - state.addOperands(source); - state.addOperands(offsets); - state.addOperands(shape); - state.addOperands(strides); - state.addAttribute( - getOperandSegmentSizesAttrName(state.name), - builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), - static_cast(shape.size()), - static_cast(strides.size())})); - state.addAttribute(getStaticOffsetsAttrName(state.name), - builder.getDenseI64ArrayAttr(static_offsets)); - state.addTypes(TensorDesc); -} - -void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type tdesc, Value source, - llvm::ArrayRef offsets) { - auto ty = llvm::dyn_cast_if_present(source.getType()); - assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); - - llvm::SmallVector staticOffsets; - llvm::SmallVector dynamicOffsets; - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - - build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, - ValueRange({}) /* empty dynamic shape */, - ValueRange({}) /* empty dynamic strides */, - staticOffsets /* static offsets */); -} - -void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type tdesc, Value source, - llvm::ArrayRef offsets, - ValueRange shape, ValueRange stride) { - assert(shape.size() && offsets.size() && stride.size() && - shape.size() == stride.size() && shape.size() == offsets.size()); - - llvm::SmallVector staticOffsets; - llvm::SmallVector dynamicOffsets; - - dispatchIndexOpFoldResults(offsets, 
dynamicOffsets, staticOffsets); - - build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, - /* dynamic shape = */ shape , /* dynamic strides = */ stride, - /* static offsets = */ staticOffsets); -} +// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, +// Type TensorDesc, Value source, ValueRange offsets, +// ValueRange shape, ValueRange strides, +// llvm::ArrayRef static_offsets) { +// auto offsetRank = static_offsets.size(); +// auto shapeRank = shape.size() ? shape.size() : getRankOf(source); + +// size_t dynOffsetRank = +// std::count_if(static_offsets.begin(), static_offsets.end(), +// [](int64_t d) { return ShapedType::isDynamic(d); }); + +// // shape and strides should exists at the same time +// // and the final rank for shape and offset (dynamic + static) +// // should be the same +// assert(shape.size() == strides.size() && shapeRank == offsetRank && +// offsets.size() == dynOffsetRank); + +// state.addOperands(source); +// state.addOperands(offsets); +// state.addOperands(shape); +// state.addOperands(strides); +// state.addAttribute( +// getOperandSegmentSizesAttrName(state.name), +// builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), +// static_cast(shape.size()), +// static_cast(strides.size())})); +// state.addAttribute(getStaticOffsetsAttrName(state.name), +// builder.getDenseI64ArrayAttr(static_offsets)); +// state.addTypes(TensorDesc); +// } + +// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, +// Type tdesc, Value source, +// llvm::ArrayRef offsets) { +// auto ty = llvm::dyn_cast_if_present(source.getType()); +// assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); + +// llvm::SmallVector staticOffsets; +// llvm::SmallVector dynamicOffsets; +// dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + +// build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, +// ValueRange({}) /* empty dynamic shape */, +// ValueRange({}) /* empty dynamic strides */, +// staticOffsets /* static offsets */); +// } + +// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, +// Type tdesc, Value source, +// llvm::ArrayRef offsets, +// ValueRange shape, ValueRange stride) { +// assert(shape.size() && offsets.size() && stride.size() && +// shape.size() == stride.size() && shape.size() == offsets.size()); + +// llvm::SmallVector staticOffsets; +// llvm::SmallVector dynamicOffsets; + +// dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + +// build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, +// /* dynamic shape = */ shape , /* dynamic strides = */ stride, +// /* static offsets = */ staticOffsets); +// } LogicalResult CreateNdDescOp::verify() { From b40a514960f81d88bbecf0090c9616ed9d098789 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 11 Mar 2024 20:01:00 -0500 Subject: [PATCH 09/19] clean up code --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 46 ++++------ mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 83 ------------------- 2 files changed, 16 insertions(+), 113 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 447ea2e0f3982b..c961bce4e51094 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -123,56 +123,42 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, return getType().getElementType(); } - - /// Return the shape of the TensorDesc 
llvm::ArrayRef getTensorDescShape() { return getType().getShape(); } + /// wrapper for matching with OffsetSizeAndStrideOpInterface OperandRange getSizes() { return getShape(); } + /// wrapper for matching with OffsetSizeAndStrideOpInterface SmallVector getStaticSizes() { - if (auto ty = getSourceType().dyn_cast()) { - return SmallVector(ty.getShape()); + if (getSourceType().dyn_cast()) { + auto dims = getMixedOffsets().size(); + return SmallVector(dims, ShapedType::kDynamic); } + auto memrefType = getSourceType().dyn_cast(); + return SmallVector(memrefType.getShape()); } + /// wrapper for matching with OffsetSizeAndStrideOpInterface SmallVector getStaticStrides() { - if (auto ty = getSourceType().dyn_cast()) { - auto [strides, offset] = getStridesAndOffset(ty); - return strides; + if (getSourceType().dyn_cast()) { + auto dims = getMixedOffsets().size(); + return SmallVector(dims, ShapedType::kDynamic); } + auto memrefType = getSourceType().dyn_cast(); + auto [strides, offset] = getStridesAndOffset(memrefType); + return strides; } - /// Returns the offsets info to the source. It consolidates - /// information from both dynamic_offsets and static_offsets - /// parameters. static_offsets parameter always has the expected - /// ranks with some dim could have ShapeType::kDynamic value - /// indicating the corresponding value should be from dynamic_offsets. - llvm::SmallVector getEffectiveOffsets(); - - /// returns the shape info of the source. It is either from the - /// memref type, if source is a memref with static shape - /// information or from the dynamic_shape parameter. If both - /// exists, the dynamic_shape parameter will be used and the - /// shape information from memref type will be ignored. - llvm::SmallVector getEffectiveShape(); - - /// returns the strides info of the source. It is either from the - /// memref type, if source is a memref with static shape - /// information or from the dynamic_stride parameter. If both - /// exists, the dynamic_strides parameter will be used and the - /// strides information from memref type will be ignored. - llvm::SmallVector getEffectiveStrides(); - - /// Return the expected rank of each of the`static_offsets`, /// `static_sizes` and `static_strides` attributes. std::array getArrayAttrMaxRanks() { - return {2, 2, 2}; + auto rank = getMixedOffsets().size(); + return {rank, rank, rank}; } /// Return the number of leading operands before the `offsets`, `sizes` and diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 5a2bc2d72bfd8c..0c9ab064d62b29 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -124,11 +124,6 @@ LogicalResult CreateNdDescOp::verify() { // auto stridesRank = getEffectiveStrides().size(); // auto baseRank = getRankOf(getSource()) ? 
getRankOf(getSource()) : 2; - llvm::dbgs() << "\nNum of mixed Offsets: " << getMixedOffsets().size() - << "\nNum of mixed Sizes: " << getMixedSizes().size() - << "\nNum of mixed Strides: " << getMixedStrides().size() - << "\n"; - // if (offsetRank != shapeRank || shapeRank != stridesRank || // shapeRank != baseRank) @@ -138,84 +133,6 @@ LogicalResult CreateNdDescOp::verify() { return success(); } -// compute consolidated offsets from dynamic_offsets and static_offsets -// parameters -llvm::SmallVector CreateNdDescOp::getEffectiveOffsets() { - llvm::SmallVector offsets; - auto dynamicOffsets = getOffsets(); // offsets variable - auto staticOffsets = getStaticOffsets(); // static_offsets attribute - - // in case static_offsets is missing, dynamic_offsets will be used - if (staticOffsets.size() == 0) { - offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end()); - return offsets; - } - - // use static offsets for each dim if it has valid value, - // othwise use the value from dynamic_offsets - for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) { - if (ShapedType::isDynamic(staticOffsets[i])) { - assert(j < dynamicOffsets.size()); - offsets.push_back(dynamicOffsets[j++]); - } else { - auto ty = IndexType::get(getContext()); - auto attr = IntegerAttr::get(ty, staticOffsets[i]); - offsets.push_back(attr); - } - } - return offsets; -} - -// get the consolidated shape of the 2D memory region. -// It prefer dynamic_shape than the static shape of -// memref type. -llvm::SmallVector CreateNdDescOp::getEffectiveShape() { - llvm::SmallVector shape; - auto dynShape = getShape(); - if (dynShape.size()) { - shape.append(dynShape.begin(), dynShape.end()); - return shape; - } - - auto ty = llvm::dyn_cast_if_present(getSourceType()); - if (ty && ty.hasStaticShape()) { - for (auto dim : ty.getShape()) { - auto attr = IntegerAttr::get(IndexType::get(getContext()), dim); - shape.push_back(attr); - } - return shape; - } - - this->emitError("The shape information of the memory is missing.\n"); - return {}; -} - -// get the consolidated strides of the 2D memory region. -// It prefer dynamic_stride than the static strides of -// memref type. 
-llvm::SmallVector CreateNdDescOp::getEffectiveStrides() { - llvm::SmallVector strides; - - auto dynStrides = getStrides(); - if (dynStrides.size()) { - strides.append(dynStrides.begin(), dynStrides.end()); - return strides; - } - - auto ty = llvm::dyn_cast_if_present(getSourceType()); - if (ty && ty.hasStaticShape()) { - auto [staticStrides, offset] = getStridesAndOffset(ty); - for (auto dim : staticStrides) { - auto attr = IntegerAttr::get(IndexType::get(getContext()), dim); - strides.push_back(attr); - } - return strides; - } - - this->emitError("The strides information of the memory is missing.\n"); - return {}; -} - //===----------------------------------------------------------------------===// // XeGPU_LoadNDOp //===----------------------------------------------------------------------===// From b050207d16e0b90c87ed9fd668a04a2454a2e7af Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 09:23:00 -0500 Subject: [PATCH 10/19] fix typos and improve CreateNdDescOp::verifier --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 47 +++--- .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 6 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 136 ++++++------------ 3 files changed, 76 insertions(+), 113 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index c961bce4e51094..5d0d6f359292d9 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -32,7 +32,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, let summary = "create nd tensor descriptor operation"; let description = [{ The "create_nd_tdesc" operation creates a TensorDescType which represents - a sub-view of a 2D memory region (It can be extended to support N-D memory + a sub-view of a 2D memory region (It can be extended to support n-D memory region if needed in future). Elements in the subview continuous in each dimention. It encodes the following important information for supporting Intel hardware features: @@ -94,18 +94,14 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, let hasVerifier = 1; -// let builders = [ -// OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, -// "ValueRange": $shape, "ValueRange": $strides, -// "llvm::ArrayRef": $static_offsets)>, -// -// OpBuilder<(ins "Type": $tdesc, "Value": $source, -// "llvm::ArrayRef": $offsets)>, -// -// OpBuilder<(ins "Type": $tdesc, "Value": $source, -// "llvm::ArrayRef": $offsets, -// "ValueRange": $shape, "ValueRange": $stride)> -// ]; + let builders = [ + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, + "llvm::ArrayRef": $offsets)>, + + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, + "llvm::ArrayRef": $offsets, + "ValueRange": $shape, "ValueRange": $stride)> + ]; let extraClassDeclaration = [{ /// Returns the type of the source memref operand. @@ -134,6 +130,9 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, } /// wrapper for matching with OffsetSizeAndStrideOpInterface + /// If source is integer it will simply return an array of + /// ShapedType::kDynamic representing dynamic shape encoded + /// in the `shape` argument will be used. 
SmallVector getStaticSizes() { if (getSourceType().dyn_cast()) { auto dims = getMixedOffsets().size(); @@ -144,6 +143,9 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, } /// wrapper for matching with OffsetSizeAndStrideOpInterface + /// If source is integer it will simply return an array of + /// ShapedType::kDynamic representing dynamic strides encoded + /// in the `strides` argument will be used. SmallVector getStaticStrides() { if (getSourceType().dyn_cast()) { auto dims = getMixedOffsets().size(); @@ -155,14 +157,19 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, } /// Return the expected rank of each of the`static_offsets`, - /// `static_sizes` and `static_strides` attributes. + /// `static_shape` and `static_strides` attributes. std::array getArrayAttrMaxRanks() { - auto rank = getMixedOffsets().size(); + unsigned rank; + if (auto ty = getSourceType().dyn_cast()) { + rank = ty.getRank(); + } else { + rank = (unsigned)getMixedOffsets().size(); + } return {rank, rank, rank}; } - /// Return the number of leading operands before the `offsets`, `sizes` and - /// and `strides` operands. + /// Return the number of leading operands before the `offsets`, + /// `shape` and `strides` operands. static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } }]; } @@ -182,11 +189,11 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { } -def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { +def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> { let summary = "loads a n-D block from memory (represented by TensorDesc)" "to registers (represented by vector)"; let description = [{ - LoadNDOp essentially mimics the hardware block read instruction to read + LoadNdOp essentially mimics the hardware block read instruction to read a block of data from memory to register. It takes a set of cache hints for each level of cache, L1, L2 and L3. If hardware does not have a correspoding cache, Corresponding cache hint attribute will be masked. @@ -222,7 +229,7 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { let hasVerifier = 1; } -def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> { +def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; let arguments = (ins XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 36b04ea12bcad0..8734c1c364e572 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -41,8 +41,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", It encodes the following information: * shape: the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows - and each row contains 16 continious data element. The rows could be - either continuous or not, depends on whether the encoding attribute + and each row contains 16 contiguous data element. The rows could be + either contiguous or not, depends on whether the encoding attribute is set or not. * element_type: the data type of the data element, e.g., f16, f32. @@ -50,7 +50,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", the following information via the TensorDescAttr object: * memory_scope (xegpu::MemoryScope): [optional] where the data is located, global memory or shared memory. It is default to Global. 
- * array_length (int): [optional] The number of continuous blocks with size as `shape`, + * array_length (int): [optional] The number of contiguous blocks with size as `shape`, that will be loaded by block load at a time. It is default to 1. * boundary_check (bool): [optional] indicates whether the operation detects the boundary and pads with zero for out-of-boundary access. It is default to do boundary check. diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 0c9ab064d62b29..7b8f853827e41c 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -10,23 +10,11 @@ #include #include -#include - #define DEBUG_TYPE "xegpu" namespace mlir { namespace xegpu { -static size_t getRankOf(Value value) { - if (value.getType().isIntOrIndexOrFloat()) - return 0; - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - llvm_unreachable("Unsupported value for getRankOf"); -} - static void transpose(llvm::ArrayRef trans, std::vector &shape) { std::vector old = shape; @@ -53,96 +41,64 @@ static std::string makeString(T array, bool breakline = false) { //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// -// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, -// Type TensorDesc, Value source, ValueRange offsets, -// ValueRange shape, ValueRange strides, -// llvm::ArrayRef static_offsets) { -// auto offsetRank = static_offsets.size(); -// auto shapeRank = shape.size() ? shape.size() : getRankOf(source); - -// size_t dynOffsetRank = -// std::count_if(static_offsets.begin(), static_offsets.end(), -// [](int64_t d) { return ShapedType::isDynamic(d); }); - -// // shape and strides should exists at the same time -// // and the final rank for shape and offset (dynamic + static) -// // should be the same -// assert(shape.size() == strides.size() && shapeRank == offsetRank && -// offsets.size() == dynOffsetRank); - -// state.addOperands(source); -// state.addOperands(offsets); -// state.addOperands(shape); -// state.addOperands(strides); -// state.addAttribute( -// getOperandSegmentSizesAttrName(state.name), -// builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), -// static_cast(shape.size()), -// static_cast(strides.size())})); -// state.addAttribute(getStaticOffsetsAttrName(state.name), -// builder.getDenseI64ArrayAttr(static_offsets)); -// state.addTypes(TensorDesc); -// } - -// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, -// Type tdesc, Value source, -// llvm::ArrayRef offsets) { -// auto ty = llvm::dyn_cast_if_present(source.getType()); -// assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); - -// llvm::SmallVector staticOffsets; -// llvm::SmallVector dynamicOffsets; -// dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - -// build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, -// ValueRange({}) /* empty dynamic shape */, -// ValueRange({}) /* empty dynamic strides */, -// staticOffsets /* static offsets */); -// } - -// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, -// Type tdesc, Value source, -// llvm::ArrayRef offsets, -// ValueRange shape, ValueRange stride) { -// assert(shape.size() && offsets.size() && stride.size() && -// shape.size() 
== stride.size() && shape.size() == offsets.size()); - -// llvm::SmallVector staticOffsets; -// llvm::SmallVector dynamicOffsets; - -// dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - -// build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, -// /* dynamic shape = */ shape , /* dynamic strides = */ stride, -// /* static offsets = */ staticOffsets); -// } +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source, + llvm::ArrayRef offsets) { + auto ty = llvm::dyn_cast_if_present(source.getType()); + assert(ty && ty.hasStaticShape() && offsets.size() == ty.getRank()); + + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + + build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, + ValueRange({}) /* empty dynamic shape */, + ValueRange({}) /* empty dynamic strides */, + staticOffsets /* static offsets */); +} +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source, + llvm::ArrayRef offsets, + ValueRange shape, ValueRange stride) { + assert(shape.size() && offsets.size() && stride.size() && + shape.size() == stride.size() && shape.size() == offsets.size()); -LogicalResult CreateNdDescOp::verify() { - // auto offsetRank = getEffectiveOffsets().size(); - // auto shapeRank = getEffectiveShape().size(); - // auto stridesRank = getEffectiveStrides().size(); - // auto baseRank = getRankOf(getSource()) ? getRankOf(getSource()) : 2; + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; - // if (offsetRank != shapeRank || shapeRank != stridesRank || - // shapeRank != baseRank) + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - // return emitOpError( - // "Expecting the rank of shape, strides, offsets and memref type " - // "should match with each other (they currently should be 2D)."); + build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, + /* dynamic shape = */ shape , /* dynamic strides = */ stride, + /* static offsets = */ staticOffsets); +} + +LogicalResult CreateNdDescOp::verify() { + auto rank = getMixedOffsets().size(); + bool invalid = (rank != 2); + auto memrefTy = getSourceType().dyn_cast(); + if (memrefTy) { + invalid |= (memrefTy.getRank() != rank); + } + if (invalid) { + return emitOpError("Expecting the rank of shape, strides, offsets and " + "memref type (if source is a memref) should match " + "with each other. 
They currenlty are 2D."); + } return success(); } //===----------------------------------------------------------------------===// -// XeGPU_LoadNDOp +// XeGPU_LoadNdOp //===----------------------------------------------------------------------===// -LogicalResult LoadNDOp::verify() { +LogicalResult LoadNdOp::verify() { auto tdescTy = getTensorDescType(); auto valueTy = getType(); if (tdescTy.getRank() != 2) return emitOpError( - "The TensorDesc for LoadNDOp should be a 2D TensorDesc."); + "The TensorDesc for LoadNdOp should be a 2D TensorDesc."); if (!valueTy) return emitOpError("Invalid result, it should be a VectorType.\n"); @@ -186,9 +142,9 @@ LogicalResult LoadNDOp::verify() { } //===----------------------------------------------------------------------===// -// XeGPU_StoreNDOp +// XeGPU_StoreNdOp //===----------------------------------------------------------------------===// -LogicalResult StoreNDOp::verify() { +LogicalResult StoreNdOp::verify() { auto dstTy = getTensorDesc().getType(); // Tile auto valTy = getValue().getType().cast(); // Vector From 2ca12a7bb74f03030aad0dff2f37469110786b6e Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 09:29:49 -0500 Subject: [PATCH 11/19] code format --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 1 - mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 5 ++-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 27 +++++++++++++--------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 662fd7ef197414..87aabdc015fea5 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -16,7 +16,6 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" - namespace mlir { namespace xegpu { // placeholder diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index bd72d5c17b6ea1..43337a6ab43dcd 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -29,7 +29,6 @@ void XeGPUDialect::initialize() { >(); } - //===----------------------------------------------------------------------===// // XeGPU_TensorDescAttr //===----------------------------------------------------------------------===// @@ -62,7 +61,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { if (mlir::succeeded(parser.parseOptionalComma())) { encoding = mlir::FieldParser::parse(parser); if (mlir::failed(encoding)) { - parser.emitError(parser.getCurrentLocation(), + parser.emitError( + parser.getCurrentLocation(), "Failed to parse the attribute field for TensorDescType.\n"); return {}; } @@ -96,7 +96,6 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { printer << ">"; } - } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 7b8f853827e41c..ee57f7a4f748ae 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -70,21 +70,24 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets, - /* dynamic shape = */ shape , /* dynamic strides = */ stride, + /* dynamic shape = */ shape, /* dynamic strides = */ stride, /* static offsets = */ staticOffsets); } LogicalResult CreateNdDescOp::verify() { auto 
rank = getMixedOffsets().size(); bool invalid = (rank != 2); + auto memrefTy = getSourceType().dyn_cast(); - if (memrefTy) { + if (memrefTy) invalid |= (memrefTy.getRank() != rank); - } + + invalid = (getTensorDescType().getRank() != rank); + if (invalid) { - return emitOpError("Expecting the rank of shape, strides, offsets and " - "memref type (if source is a memref) should match " - "with each other. They currenlty are 2D."); + return emitOpError("Expecting the rank of shape, strides, offsets, " + "source memref type (if source is a memref) and TensorDesc " + "should match with each other. They currenlty are 2D."); } return success(); } @@ -135,9 +138,10 @@ LogicalResult LoadNdOp::verify() { } if (tdescShape != valueShape) - return emitOpError() <<"Result shape doesn't match TensorDesc shape." - << "The expected shape is " << makeString(tdescShape) << ". " - << "But the given shape is " << makeString(valueShape) << ".\n"; + return emitOpError() << "Result shape doesn't match TensorDesc shape." + << "The expected shape is " << makeString(tdescShape) + << ". But the given shape is " << makeString(valueShape) + << ".\n"; return success(); } @@ -159,11 +163,12 @@ LogicalResult StoreNdOp::verify() { if (dstElemTy != valElemTy) { return emitOpError() << "The element type of the value should " - "match the elementtype of the TensorDesc.\n"; + "match the elementtype of the TensorDesc.\n"; } if (dstTy.getShape() != valTy.getShape()) - return emitOpError() << "The result shape should match the TensorDesc shape.\n"; + return emitOpError() + << "The result shape should match the TensorDesc shape.\n"; return success(); } From 9039b5fc36daaedfa2db8160486cb5d6bd795036 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 09:31:00 -0500 Subject: [PATCH 12/19] code format --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index ee57f7a4f748ae..a21c7607c73b5d 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -85,9 +85,10 @@ LogicalResult CreateNdDescOp::verify() { invalid = (getTensorDescType().getRank() != rank); if (invalid) { - return emitOpError("Expecting the rank of shape, strides, offsets, " - "source memref type (if source is a memref) and TensorDesc " - "should match with each other. They currenlty are 2D."); + return emitOpError( + "Expecting the rank of shape, strides, offsets, " + "source memref type (if source is a memref) and TensorDesc " + "should match with each other. They currenlty are 2D."); } return success(); } @@ -140,8 +141,8 @@ LogicalResult LoadNdOp::verify() { if (tdescShape != valueShape) return emitOpError() << "Result shape doesn't match TensorDesc shape." << "The expected shape is " << makeString(tdescShape) - << ". But the given shape is " << makeString(valueShape) - << ".\n"; + << ". 
But the given shape is " + << makeString(valueShape) << ".\n"; return success(); } From 632637eda0688ba31a2892f3df5fbc4893467c62 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 10:29:58 -0500 Subject: [PATCH 13/19] sync viewlikeOpInterface and some updates --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++++++++-------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 23 +++++++++++++----- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 5d0d6f359292d9..24fae9596994e3 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -26,8 +26,8 @@ class XeGPU_Op traits = []>: Op; -def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, - AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { +def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, + AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { let summary = "create nd tensor descriptor operation"; let description = [{ @@ -130,11 +130,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, } /// wrapper for matching with OffsetSizeAndStrideOpInterface - /// If source is integer it will simply return an array of - /// ShapedType::kDynamic representing dynamic shape encoded - /// in the `shape` argument will be used. + /// If source is IntegerType and `shape` is filled, it will + /// return an array of ShapedType::kDynamic representing dynamic + /// shape encoded in the `shape` argument will be used. Presence + /// of `shape` overides static shape from source memref type. SmallVector getStaticSizes() { - if (getSourceType().dyn_cast()) { + if (getSourceType().isa() || getShape().size()) { auto dims = getMixedOffsets().size(); return SmallVector(dims, ShapedType::kDynamic); } @@ -143,11 +144,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, } /// wrapper for matching with OffsetSizeAndStrideOpInterface - /// If source is integer it will simply return an array of - /// ShapedType::kDynamic representing dynamic strides encoded - /// in the `strides` argument will be used. + /// If source is IntegerType or `strides` is filled, it will + /// return an array of ShapedType::kDynamic representing dynamic + /// strides encoded in the `strides` argument will be used. Presence + /// of `strides` overides static strides from source memref type. SmallVector getStaticStrides() { - if (getSourceType().dyn_cast()) { + if (getSourceType().isa() || getStrides().size()) { auto dims = getMixedOffsets().size(); return SmallVector(dims, ShapedType::kDynamic); } @@ -171,6 +173,8 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, /// Return the number of leading operands before the `offsets`, /// `shape` and `strides` operands. 
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } + + mlir::Value getViewSource() { return getSource(); } }]; } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index a21c7607c73b5d..5adf1c2a6b9849 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -76,20 +76,31 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, LogicalResult CreateNdDescOp::verify() { auto rank = getMixedOffsets().size(); - bool invalid = (rank != 2); + bool invalidRank = (rank != 2); + bool invalidElemTy = false; + // check source type matches the rank if it is a memref auto memrefTy = getSourceType().dyn_cast(); - if (memrefTy) - invalid |= (memrefTy.getRank() != rank); + if (memrefTy) { + invalidRank |= (memrefTy.getRank() != rank); + // TensorDesc should have the same element type with memref. + invalidElemTy != memrefTy.getElementType() != getElementType(); + } - invalid = (getTensorDescType().getRank() != rank); + // check result type matches the rank + invalidRank = (getType().getRank() != rank); - if (invalid) { + if (invalidRank) return emitOpError( "Expecting the rank of shape, strides, offsets, " "source memref type (if source is a memref) and TensorDesc " "should match with each other. They currenlty are 2D."); - } + + if (invalidElemTy) + return emitOpError("TensorDesc should have the same element " + "type with the source if it is a memref.\n"); + + return success(); } From 447d623cf501fd19f6f9b9188ee65eca0b9a6dc0 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 15:46:30 +0000 Subject: [PATCH 14/19] fix a typo --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 5adf1c2a6b9849..077877cb444b47 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -44,8 +44,8 @@ static std::string makeString(T array, bool breakline = false) { void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, TypedValue source, llvm::ArrayRef offsets) { - auto ty = llvm::dyn_cast_if_present(source.getType()); - assert(ty && ty.hasStaticShape() && offsets.size() == ty.getRank()); + auto ty = source.getType(); + assert(ty && ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank()); llvm::SmallVector staticOffsets; llvm::SmallVector dynamicOffsets; @@ -75,21 +75,24 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, } LogicalResult CreateNdDescOp::verify() { - auto rank = getMixedOffsets().size(); + auto rank = (int64_t)getMixedOffsets().size(); bool invalidRank = (rank != 2); bool invalidElemTy = false; - // check source type matches the rank if it is a memref + // check source type matches the rank if it is a memref. + // It also should have the same ElementType as TensorDesc. auto memrefTy = getSourceType().dyn_cast(); if (memrefTy) { invalidRank |= (memrefTy.getRank() != rank); - // TensorDesc should have the same element type with memref. - invalidElemTy != memrefTy.getElementType() != getElementType(); + invalidElemTy |= memrefTy.getElementType() != getElementType(); } // check result type matches the rank invalidRank = (getType().getRank() != rank); + // mismatches among shape, strides, and offsets are + // already handeled by OffsetSizeAndStrideOpInterface. + // So they are not check here. 
if (invalidRank) return emitOpError( "Expecting the rank of shape, strides, offsets, " From 37a348d37dec5a8f3c0d0f85e41cdc7d008b2ece Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 15:47:08 +0000 Subject: [PATCH 15/19] code format --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 077877cb444b47..08723d12c278c8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -84,7 +84,7 @@ LogicalResult CreateNdDescOp::verify() { auto memrefTy = getSourceType().dyn_cast(); if (memrefTy) { invalidRank |= (memrefTy.getRank() != rank); - invalidElemTy |= memrefTy.getElementType() != getElementType(); + invalidElemTy |= memrefTy.getElementType() != getElementType(); } // check result type matches the rank @@ -103,7 +103,6 @@ LogicalResult CreateNdDescOp::verify() { return emitOpError("TensorDesc should have the same element " "type with the source if it is a memref.\n"); - return success(); } From ff338280afdd024f7bc95417f947f64266f6b90e Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 16:19:14 +0000 Subject: [PATCH 16/19] add ViewLikeOpInterface and OffsetSizeAndStrideOpInterface to createNdDescOp --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 11 +- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 121 ++++++------ .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 8 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 174 ++++-------------- 4 files changed, 113 insertions(+), 201 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 8dc3ff78d25ede..87aabdc015fea5 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -9,11 +9,12 @@ #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H #define MLIR_DIALECT_XEGPU_IR_XEGPU_H -#include -#include -#include -#include -#include +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/Interfaces/ShapedOpInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" namespace mlir { namespace xegpu { diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 9d37d77e03a0c5..24fae9596994e3 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -9,26 +9,13 @@ #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD +include "mlir/IR/AttrTypeBase.td" include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td" include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td" include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td" -include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/IR/AttrTypeBase.td" - - -include "mlir/IR/OpBase.td" -include "mlir/IR/OpAsmInterface.td" -include "mlir/IR/AttrTypeBase.td" -include "mlir/IR/BuiltinTypes.td" -include "mlir/IR/BuiltinTypeInterfaces.td" +include "mlir/Interfaces/ShapedOpInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" -include "mlir/Interfaces/CastInterfaces.td" -include "mlir/Interfaces/ControlFlowInterfaces.td" -include "mlir/Interfaces/CopyOpInterface.td" -include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/ShapedOpInterfaces.td" - // Base class for dialect operations. 
This operation inherits from the base // `Op` class in OpBase.td, and provides: @@ -39,12 +26,13 @@ class XeGPU_Op traits = []>: Op; -def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSegments]> { +def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, + AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { let summary = "create nd tensor descriptor operation"; let description = [{ The "create_nd_tdesc" operation creates a TensorDescType which represents - a sub-view of a 2D memory region (It can be extended to support N-D memory + a sub-view of a 2D memory region (It can be extended to support n-D memory region if needed in future). Elements in the subview continuous in each dimention. It encodes the following important information for supporting Intel hardware features: @@ -90,31 +78,27 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $dynamic_offsets, - Variadic: $dynamic_shape, - Variadic: $dynamic_strides, + Variadic: $offsets, + Variadic: $shape, + Variadic: $strides, DenseI64ArrayAttr: $static_offsets ); let results = (outs XeGPU_TensorDesc: $TensorDesc); let assemblyFormat = [{ $source `` - custom($dynamic_offsets, $static_offsets) - (`,` `[` $dynamic_shape^ `]` `,` `[` $dynamic_strides `]`)? + custom($offsets, $static_offsets) + (`,` `[` $shape^ `]` `,` `[` $strides `]`)? attr-dict `:` type($source) `->` qualified(type($TensorDesc)) }]; - let skipDefaultBuilders = 1; + let hasVerifier = 1; let builders = [ - OpBuilder<(ins "Type": $TensorDesc, "Value": $source, "ValueRange": $offsets, - "ValueRange": $shape, "ValueRange": $strides, - "llvm::ArrayRef": $static_offsets)>, - - OpBuilder<(ins "Type": $tdesc, "Value": $source, + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, "llvm::ArrayRef": $offsets)>, - OpBuilder<(ins "Type": $tdesc, "Value": $source, + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, "llvm::ArrayRef": $offsets, "ValueRange": $shape, "ValueRange": $stride)> ]; @@ -130,27 +114,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg return getTensorDesc().getType(); } - /// Returns the offsets info to the source. It consolidates - /// information from both dynamic_offsets and static_offsets - /// parameters. static_offsets parameter always has the expected - /// ranks with some dim could have ShapeType::kDynamic value - /// indicating the corresponding value should be from dynamic_offsets. - llvm::SmallVector getOffsets(); - - /// returns the shape info of the source. It is either from the - /// memref type, if source is a memref with static shape - /// information or from the dynamic_shape parameter. If both - /// exists, the dynamic_shape parameter will be used and the - /// shape information from memref type will be ignored. - llvm::SmallVector getShape(); - - /// returns the strides info of the source. It is either from the - /// memref type, if source is a memref with static shape - /// information or from the dynamic_stride parameter. If both - /// exists, the dynamic_strides parameter will be used and the - /// strides information from memref type will be ignored. 
- llvm::SmallVector getStrides(); - /// Return the element type of the TensorDesc Type getElementType() { return getType().getElementType(); @@ -160,6 +123,58 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSeg llvm::ArrayRef getTensorDescShape() { return getType().getShape(); } + + /// wrapper for matching with OffsetSizeAndStrideOpInterface + OperandRange getSizes() { + return getShape(); + } + + /// wrapper for matching with OffsetSizeAndStrideOpInterface + /// If source is IntegerType and `shape` is filled, it will + /// return an array of ShapedType::kDynamic representing dynamic + /// shape encoded in the `shape` argument will be used. Presence + /// of `shape` overides static shape from source memref type. + SmallVector getStaticSizes() { + if (getSourceType().isa() || getShape().size()) { + auto dims = getMixedOffsets().size(); + return SmallVector(dims, ShapedType::kDynamic); + } + auto memrefType = getSourceType().dyn_cast(); + return SmallVector(memrefType.getShape()); + } + + /// wrapper for matching with OffsetSizeAndStrideOpInterface + /// If source is IntegerType or `strides` is filled, it will + /// return an array of ShapedType::kDynamic representing dynamic + /// strides encoded in the `strides` argument will be used. Presence + /// of `strides` overides static strides from source memref type. + SmallVector getStaticStrides() { + if (getSourceType().isa() || getStrides().size()) { + auto dims = getMixedOffsets().size(); + return SmallVector(dims, ShapedType::kDynamic); + } + auto memrefType = getSourceType().dyn_cast(); + auto [strides, offset] = getStridesAndOffset(memrefType); + return strides; + } + + /// Return the expected rank of each of the`static_offsets`, + /// `static_shape` and `static_strides` attributes. + std::array getArrayAttrMaxRanks() { + unsigned rank; + if (auto ty = getSourceType().dyn_cast()) { + rank = ty.getRank(); + } else { + rank = (unsigned)getMixedOffsets().size(); + } + return {rank, rank, rank}; + } + + /// Return the number of leading operands before the `offsets`, + /// `shape` and `strides` operands. + static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } + + mlir::Value getViewSource() { return getSource(); } }]; } @@ -178,11 +193,11 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> { } -def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { +def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> { let summary = "loads a n-D block from memory (represented by TensorDesc)" "to registers (represented by vector)"; let description = [{ - LoadNDOp essentially mimics the hardware block read instruction to read + LoadNdOp essentially mimics the hardware block read instruction to read a block of data from memory to register. It takes a set of cache hints for each level of cache, L1, L2 and L3. If hardware does not have a correspoding cache, Corresponding cache hint attribute will be masked. 
@@ -218,7 +233,7 @@ def XeGPU_LoadNDOp : XeGPU_Op<"load_nd"> { let hasVerifier = 1; } -def XeGPU_StoreNDOp : XeGPU_Op<"store_nd", []> { +def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> { let summary = "stores a n-D block register region back to memory, currently only supports 2D"; let arguments = (ins XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 36b04ea12bcad0..19ac1693712dd8 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -37,12 +37,12 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", TensorDesc is a type designed to describe regions of the interested data as well as some features that are unique to Intel hardware. Different with the builtin tensor type in MLIR, it essentially only contains the meta data, and doesn't hold the data by itself. It is designed - to mainly support 2d block load/store and DPAS (matrix multiplication instruction) on Intel GPU. + to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU. It encodes the following information: * shape: the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows - and each row contains 16 continious data element. The rows could be - either continuous or not, depends on whether the encoding attribute + and each row contains 16 contiguous data element. The rows could be + either contiguous or not, depends on whether the encoding attribute is set or not. * element_type: the data type of the data element, e.g., f16, f32. @@ -50,7 +50,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", the following information via the TensorDescAttr object: * memory_scope (xegpu::MemoryScope): [optional] where the data is located, global memory or shared memory. It is default to Global. - * array_length (int): [optional] The number of continuous blocks with size as `shape`, + * array_length (int): [optional] The number of contiguous blocks with size as `shape`, that will be loaded by block load at a time. It is default to 1. * boundary_check (bool): [optional] indicates whether the operation detects the boundary and pads with zero for out-of-boundary access. It is default to do boundary check. 
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index be631c4678eacb..08723d12c278c8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -9,23 +9,12 @@ #include #include #include -#include #define DEBUG_TYPE "xegpu" namespace mlir { namespace xegpu { -static size_t getRankOf(Value value) { - if (value.getType().isIntOrIndexOrFloat()) - return 0; - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - if (auto ty = llvm::dyn_cast_if_present(value.getType())) - return ty.getRank(); - llvm_unreachable("Unsupported value for getRankOf"); -} - static void transpose(llvm::ArrayRef trans, std::vector &shape) { std::vector old = shape; @@ -53,41 +42,10 @@ static std::string makeString(T array, bool breakline = false) { // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type TensorDesc, Value source, ValueRange offsets, - ValueRange shape, ValueRange strides, - llvm::ArrayRef static_offsets) { - auto offsetRank = static_offsets.size(); - auto shapeRank = shape.size() ? shape.size() : getRankOf(source); - - size_t dynOffsetRank = - std::count_if(static_offsets.begin(), static_offsets.end(), - [](int64_t d) { return ShapedType::isDynamic(d); }); - - // shape and strides should exists at the same time - // and the final rank for shape and offset (dynamic + static) - // should be the same - assert(shape.size() == strides.size() && shapeRank == offsetRank && - offsets.size() == dynOffsetRank); - - state.addOperands(source); - state.addOperands(offsets); - state.addOperands(shape); - state.addOperands(strides); - state.addAttribute( - getOperandSegmentSizesAttrName(state.name), - builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), - static_cast(shape.size()), - static_cast(strides.size())})); - state.addAttribute(getStaticOffsetsAttrName(state.name), - builder.getDenseI64ArrayAttr(static_offsets)); - state.addTypes(TensorDesc); -} - -void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type tdesc, Value source, + Type tdesc, TypedValue source, llvm::ArrayRef offsets) { - auto ty = llvm::dyn_cast_if_present(source.getType()); - assert(ty && ty.hasStaticShape() && offsets.size() == getRankOf(source)); + auto ty = source.getType(); + assert(ty && ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank()); llvm::SmallVector staticOffsets; llvm::SmallVector dynamicOffsets; @@ -100,7 +58,7 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, } void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type tdesc, Value source, + Type tdesc, TypedValue source, llvm::ArrayRef offsets, ValueRange shape, ValueRange stride) { assert(shape.size() && offsets.size() && stride.size() && @@ -117,108 +75,47 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, } LogicalResult CreateNdDescOp::verify() { - auto offsetRank = getOffsets().size(); - auto shapeRank = getShape().size(); - auto stridesRank = getStrides().size(); - auto baseRank = getRankOf(getSource()) ? 
getRankOf(getSource()) : 2; - - if (offsetRank != shapeRank || shapeRank != stridesRank || - shapeRank != baseRank) - - return emitOpError( - "Expecting the rank of shape, strides, offsets and memref type " - "should match with each other (they currently should be 2D)."); - return success(); -} - -// compute consolidated offsets from dynamic_offsets and static_offsets -// parameters -llvm::SmallVector CreateNdDescOp::getOffsets() { - llvm::SmallVector offsets; - auto dynamicOffsets = getDynamicOffsets(); // dynamic_offsets variable - auto staticOffsets = getStaticOffsets(); // static_offsets attribute - - // in case static_offsets is missing, dynamic_offsets will be used - if (staticOffsets.size() == 0) { - offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end()); - return offsets; - } - - // use static offsets for each dim if it has valid value, - // othwise use the value from dynamic_offsets - for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) { - if (ShapedType::isDynamic(staticOffsets[i])) { - assert(j < dynamicOffsets.size()); - offsets.push_back(dynamicOffsets[j++]); - } else { - auto ty = IndexType::get(getContext()); - auto attr = IntegerAttr::get(ty, staticOffsets[i]); - offsets.push_back(attr); - } - } - return offsets; -} - -// get the consolidated shape of the 2D memory region. -// It prefer dynamic_shape than the static shape of -// memref type. -llvm::SmallVector CreateNdDescOp::getShape() { - llvm::SmallVector shape; - auto dynShape = getDynamicShape(); - if (dynShape.size()) { - shape.append(dynShape.begin(), dynShape.end()); - return shape; - } - - auto ty = llvm::dyn_cast_if_present(getSourceType()); - if (ty && ty.hasStaticShape()) { - for (auto dim : ty.getShape()) { - auto attr = IntegerAttr::get(IndexType::get(getContext()), dim); - shape.push_back(attr); - } - return shape; + auto rank = (int64_t)getMixedOffsets().size(); + bool invalidRank = (rank != 2); + bool invalidElemTy = false; + + // check source type matches the rank if it is a memref. + // It also should have the same ElementType as TensorDesc. + auto memrefTy = getSourceType().dyn_cast(); + if (memrefTy) { + invalidRank |= (memrefTy.getRank() != rank); + invalidElemTy |= memrefTy.getElementType() != getElementType(); } - this->emitError("The shape information of the memory is missing.\n"); - return {}; -} - -// get the consolidated strides of the 2D memory region. -// It prefer dynamic_stride than the static strides of -// memref type. -llvm::SmallVector CreateNdDescOp::getStrides() { - llvm::SmallVector strides; + // check result type matches the rank + invalidRank = (getType().getRank() != rank); - auto dynStrides = getDynamicStrides(); - if (dynStrides.size()) { - strides.append(dynStrides.begin(), dynStrides.end()); - return strides; - } + // mismatches among shape, strides, and offsets are + // already handeled by OffsetSizeAndStrideOpInterface. + // So they are not check here. + if (invalidRank) + return emitOpError( + "Expecting the rank of shape, strides, offsets, " + "source memref type (if source is a memref) and TensorDesc " + "should match with each other. 
They currenlty are 2D."); - auto ty = llvm::dyn_cast_if_present(getSourceType()); - if (ty && ty.hasStaticShape()) { - auto [staticStrides, offset] = getStridesAndOffset(ty); - for (auto dim : staticStrides) { - auto attr = IntegerAttr::get(IndexType::get(getContext()), dim); - strides.push_back(attr); - } - return strides; - } + if (invalidElemTy) + return emitOpError("TensorDesc should have the same element " + "type with the source if it is a memref.\n"); - this->emitError("The strides information of the memory is missing.\n"); - return {}; + return success(); } //===----------------------------------------------------------------------===// -// XeGPU_LoadNDOp +// XeGPU_LoadNdOp //===----------------------------------------------------------------------===// -LogicalResult LoadNDOp::verify() { +LogicalResult LoadNdOp::verify() { auto tdescTy = getTensorDescType(); auto valueTy = getType(); if (tdescTy.getRank() != 2) return emitOpError( - "The TensorDesc for LoadNDOp should be a 2D TensorDesc."); + "The TensorDesc for LoadNdOp should be a 2D TensorDesc."); if (!valueTy) return emitOpError("Invalid result, it should be a VectorType.\n"); @@ -257,16 +154,15 @@ LogicalResult LoadNDOp::verify() { if (tdescShape != valueShape) return emitOpError() << "Result shape doesn't match TensorDesc shape." << "The expected shape is " << makeString(tdescShape) - << ". " - << "But the given shape is " << makeString(valueShape) - << ".\n"; + << ". But the given shape is " + << makeString(valueShape) << ".\n"; return success(); } //===----------------------------------------------------------------------===// -// XeGPU_StoreNDOp +// XeGPU_StoreNdOp //===----------------------------------------------------------------------===// -LogicalResult StoreNDOp::verify() { +LogicalResult StoreNdOp::verify() { auto dstTy = getTensorDesc().getType(); // Tile auto valTy = getValue().getType().cast(); // Vector From 8a9df4ba16cdaf422d7980f565f3c2046b141a90 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 12 Mar 2024 20:03:34 -0500 Subject: [PATCH 17/19] fix include format --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 8 ++++---- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 43337a6ab43dcd..0b3f4b9c9dbeae 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -6,10 +6,10 @@ // //===----------------------------------------------------------------------===// -#include -#include -#include -#include +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/DialectImplementation.h" +#include "llvm/ADT/TypeSwitch.h" namespace mlir { namespace xegpu { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 08723d12c278c8..3a75b173b757c5 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include -#include -#include +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/IR/Builders.h" #define DEBUG_TYPE "xegpu" From e3857bb9971b1570108ec55b97f2a18d44354cdc Mon Sep 17 00:00:00 2001 From: Chao Chen <116223022+chencha3@users.noreply.github.com> Date: Thu, 14 Mar 2024 10:13:19 -0500 Subject: [PATCH 18/19] Update 
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Co-authored-by: Mehdi Amini
---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 24fae9596994e3..cb768d5e6b9af3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -29,7 +29,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
 def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
                         AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
 
-  let summary = "create nd tensor descriptor operation";
+  let summary = "Create nd-tensor descriptor operation";
   let description = [{
     The "create_nd_tdesc" operation creates a TensorDescType which represents
     a sub-view of a 2D memory region (It can be extended to support n-D memory

From 5b6ebf8f8dcd4aef7fcca8ff44f6a88c7700580e Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 15 Mar 2024 11:33:30 -0500
Subject: [PATCH 19/19] move example to description

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 70 ++++++++++++++-----
 1 file changed, 53 insertions(+), 17 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index cb768d5e6b9af3..02dc73ce7eb33d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -180,15 +180,25 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
 
 def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
   let summary = "prefetches a nD block to cache";
+  let description = [{
+    It issues an instruction to prefetch the data from memory into each
+    level of the cache, according to the given cache hints.
+
+    Example:
+    ```
+    xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+                              l2_hint = #xegpu.cache_hint<cached>,
+                              l3_hint = #xegpu.cache_hint<cached>}
+      : !xegpu.tensor_desc<8x16xf16>
+    ```
+  }];
+
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                        OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheAttr>: $l3_hint);
 
-  // Format: xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
-  //                                   l2_hint = #xegpu.cache_hint<cached>,
-  //                                   l3_hint = #xegpu.cache_hint<cached>}
-  //         : !xegpu.tensor_desc<8x16xf16>
   let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc))";
 }
 
@@ -198,11 +208,27 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
          "to registers (represented by vector)";
   let description = [{
     LoadNdOp essentially mimics the hardware block read instruction to read
-    a block of data from memory to register. It takes a set of cache hints
-    for each level of cache, L1, L2 and L3. If hardware does not have a
+    a block of data from memory to register. It takes a set of optional cache
+    hints for each level of cache, L1, L2 and L3. If the hardware does not have a
     correspoding cache, Corresponding cache hint attribute will be masked.
-    If both transpose and vnni_axis present at the same time. It assume to
-    perform transpose first and then vnni transform.
+    The vnni transform is a hardware feature of Intel GPUs. It packs the data
+    during the load of the B operand of a matrix operation when the bit width
+    of the data type is less than 32 bits, e.g., fp16. Transpose is another
+    Intel hardware feature, which transposes the data while loading it when
+    the data type is fp32 or fp64. This implies that vnni and transpose
+    cannot be used at the same time.
+
+    Example:
+    ```
+    xegpu.load_nd %1 {transpose = [1, 0],
+                      l1_hint = #xegpu.cache_hint<cached>,
+                      l2_hint = #xegpu.cache_hint<uncached>,
+                      l3_hint = #xegpu.cache_hint<streaming>}
+            : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+    ```
+
   }];
 
   let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
@@ -224,27 +250,37 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
     }
   }];
 
-  // Format: xegpu.load_nd %1 {transpose = [1, 0],
-  //                l1_hint = #xegpu.cache_hint<cached>,
-  //                l2_hint = #xegpu.cache_hint<uncached>,
-  //                l3_hint = #xegpu.cache_hint<streaming>}
-  //        : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
   let assemblyFormat = "$TensorDesc attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
   let hasVerifier = 1;
 }
 
 def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
   let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+
+  let description = [{
+    StoreNdOp essentially mimics the hardware block write instruction to
+    write a block of data from register into the memory region as described
+    by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If the hardware does not have a corresponding
+    cache, the corresponding cache hint attribute will be masked.
+
+    Example:
+    ```
+    xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                           l2_hint = #xegpu.cache_hint<write_back>,
+                           l3_hint = #xegpu.cache_hint<write_through>}
+            : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+    ```
+  }];
+
   let arguments = (ins XeGPU_ValueType: $value,
                        XeGPU_TensorDesc: $TensorDesc,
                        OptionalAttr<XeGPU_CacheAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheAttr>: $l3_hint);
 
-  // Format: xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-  //                                l2_hint = #xegpu.cache_hint<write_back>,
-  //                                l3_hint = #xegpu.cache_hint<write_through>}
-  //        : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
  let assemblyFormat = "$value `,` $TensorDesc attr-dict `:` type($value) `,` qualified(type($TensorDesc))";
  let hasVerifier = 1;
 }
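
Taken together, create_nd_tdesc, load_nd, and store_nd compose into a simple tile-copy pattern. The following is a minimal sketch modeled on the examples in the op descriptions above; the value names, the 24x32 buffer shapes, the [0, 0] offsets, and the cache hints are illustrative assumptions, not part of the patch, and default tdesc_attr values are assumed:

```
// Hypothetical tile copy: read one 8x16 f16 block from %src and write it
// to %dst. %src, %dst, the memref shapes, and the offsets are made up for
// illustration; the cache hints use the enums defined in XeGPUAttrs.td.
%s = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
%d = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
%v = xegpu.load_nd %s {l1_hint = #xegpu.cache_hint<cached>}
       : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
xegpu.store_nd %v, %d {l1_hint = #xegpu.cache_hint<write_back>}
       : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
```

Note that the load result shape matches the tensor_desc shape because no transpose is applied, which is exactly the condition the LoadNdOp and StoreNdOp verifiers above enforce.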