ROCm
diff --git a/‎mlir/include/mlir/Conversion/LinalgToRock/LinalgToRock.h‎
Lines changed: 4 additions & 0 deletions b/‎mlir/include/mlir/Conversion/LinalgToRock/LinalgToRock.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎mlir/lib/Conversion/LinalgToRock/LinalgToRock.cpp‎
Lines changed: 46 additions & 1 deletion b/‎mlir/lib/Conversion/LinalgToRock/LinalgToRock.cpp‎
Lines changed: 46 additions & 1 deletion
diff --git a/‎mlir/lib/Conversion/LinalgToRock/LinalgToRockPass.cpp‎
Lines changed: 7 additions & 0 deletions b/‎mlir/lib/Conversion/LinalgToRock/LinalgToRockPass.cpp‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎mlir/lib/Conversion/MIGraphXToLinalg/MIGraphXToLinalg.cpp‎
Lines changed: 192 additions & 30 deletions b/‎mlir/lib/Conversion/MIGraphXToLinalg/MIGraphXToLinalg.cpp‎
Lines changed: 192 additions & 30 deletions
diff --git a/‎mlir/test/Conversion/LinalgToRock/linalg-to-rock-expand-strides.mlir‎
Lines changed: 56 additions & 0 deletions b/‎mlir/test/Conversion/LinalgToRock/linalg-to-rock-expand-strides.mlir‎
Lines changed: 56 additions & 0 deletions
@@ -13,6 +13,7 @@
 #ifndef MLIR_CONVERSION_LINALGTOROCK_H
 #define MLIR_CONVERSION_LINALGTOROCK_H
 
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -24,6 +25,9 @@ namespace mlir {
 namespace rock {
 void populateLinalgToRockConversionPattern(RewritePatternSet &pattern,
                                            MLIRContext *context);
+
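+/// Returns true when `op` is a tensor.insert_slice that should be lowered to
+/// rock.expand_strides, i.e. it carries the `rock.is_expand_strides`
+/// attribute and its destination is produced by a tensor.empty.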
+bool isRockExpandStride(tensor::InsertSliceOp op);
 }
 } // namespace mlir
 
 
@@ -139,8 +139,53 @@ LogicalResult MatmulConverter<LinalgMatOp>::matchAndRewrite(
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// shape related changes
+//===----------------------------------------------------------------------===//
+namespace {
+/// Conversion pattern anchored on tensor.insert_slice. Its matchAndRewrite
+/// (defined out of line) only accepts ops for which
+/// rock::isRockExpandStride() holds and replaces them with a
+/// rock.expand_strides op.
+struct ExpandStrideConverter final
+    : public OpConversionPattern<tensor::InsertSliceOp> {
+  // Re-export the base-class constructors, type-converter accessor and
+  // adaptor type.
+  using OpConversionPattern<tensor::InsertSliceOp>::OpConversionPattern;
+  using OpConversionPattern<tensor::InsertSliceOp>::getTypeConverter;
+  using OpAdaptor =
+      typename OpConversionPattern<tensor::InsertSliceOp>::OpAdaptor;
+
+  LogicalResult
+  matchAndRewrite(tensor::InsertSliceOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+} // namespace
+
+/// Returns true when `op` carries the `rock.is_expand_strides` attribute and
+/// its destination operand is produced by a tensor.empty.
+bool mlir::rock::isRockExpandStride(tensor::InsertSliceOp op) {
+  // The destination may be a block argument, in which case getDefiningOp()
+  // returns null. isa<> asserts on a null pointer, so use the null-tolerant
+  // variant; this predicate is evaluated for every tensor.insert_slice.
+  return op->hasAttr("rock.is_expand_strides") &&
+         isa_and_nonnull<tensor::EmptyOp>(op.getDest().getDefiningOp());
+}
+
+/// Rewrites a tensor.insert_slice accepted by rock::isRockExpandStride() into
+/// a bufferization.alloc_tensor followed by rock.expand_strides. Returns
+/// failure (no rewrite) for any other tensor.insert_slice.
+LogicalResult ExpandStrideConverter::matchAndRewrite(
+    tensor::InsertSliceOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  // The migraphx-to-linalg pass emits the rock.is_expand_strides attribute
+  // to indicate that the insert_slice is an expand_strides. In that case, we
+  // transform it into a rock.expand_strides.
+  if (!rock::isRockExpandStride(op))
+    return failure();
+
+  // getDefiningOp<>() is null-safe, so a destination without a defining op
+  // (e.g. a block argument) is rejected here instead of tripping a cast.
+  auto tensorEmpty = op.getDest().getDefiningOp<tensor::EmptyOp>();
+  if (!tensorEmpty)
+    return rewriter.notifyMatchFailure(op,
+                                       "destination is not a tensor.empty");
+
+  Location loc = op.getLoc();
+  // Allocate a tensor of the destination's type; it becomes the `into`
+  // operand of rock.expand_strides.
+  auto alloc = bufferization::AllocTensorOp::create(
+      rewriter, loc, tensorEmpty.getResult().getType(), {});
+  auto expandOp = rock::ExpandStridesOp::create(rewriter, loc, op.getType(),
+                                                adaptor.getSource(), alloc);
+  rewriter.replaceOp(op, expandOp);
+  return success();
+}
+
+/// Adds the linalg-to-rock conversion patterns to `pattern`: the matmul
+/// converters for linalg.batch_matmul and linalg.matmul, plus the
+/// ExpandStrideConverter for tensor.insert_slice.
 void mlir::rock::populateLinalgToRockConversionPattern(
     RewritePatternSet &pattern, MLIRContext *context) {
   pattern.add<MatmulConverter<linalg::BatchMatmulOp>,
-              MatmulConverter<linalg::MatmulOp>>(context);
+              MatmulConverter<linalg::MatmulOp>, ExpandStrideConverter>(
+      context);
 }
@@ -38,6 +38,13 @@ static void populateLinalgToRockDialectConversion(ConversionTarget &target) {
                          rock::RockDialect, bufferization::BufferizationDialect,
                          math::MathDialect>();
 
+  // A tensor.insert_slice may represent a rock expand-strides operation; in
+  // that case it is illegal here and gets rewritten into rock.expand_strides.
+  target.addDynamicallyLegalOp<tensor::InsertSliceOp>(
+      [](tensor::InsertSliceOp op) -> std::optional<bool> {
+        return !rock::isRockExpandStride(op);
+      });
+
   // We only allow Linalg operations that are elementwise. Fusion is supported
   // via linalg.generic when it is an elementwise operation. Elementwise
   // operations would be converted into linalg.generic in later passes
 
@@ -53,60 +53,222 @@ struct AsLogicalShapeOpConverter final
 };
 } // namespace
 
-/// Checking to see if the permutation vector is like (0, 1, 2, 3, 4, 5, ...)
-static bool isPermutationStandardForm(ArrayRef<int64_t> permutation) {
-  SmallVector<int64_t, 4> increasingVec(permutation.size(), 0);
-  std::iota(increasingVec.begin(), increasingVec.end(), 0);
-  return llvm::equal(permutation, increasingVec);
-}
-
+/// Lowers migraphx::AsLogicalShapeOp. The converted input is expanded to the
+/// memory-layout tensor and transposed into logical order; if the type still
+/// differs from the result type, a slice is extracted and stride-0
+/// dimensions are broadcast. The op is always replaced and success returned.
 LogicalResult AsLogicalShapeOpConverter::matchAndRewrite(
     migraphx::AsLogicalShapeOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
   migraphx::MIXRShapedType inType = op.getIn().getType();
   RankedTensorType resultType = op.getOut().getType();
-  Value in = adaptor.getIn(); // The shape we are casting from
+  RankedTensorType memoryType = inType.asMemoryLayoutTensor();
 
-  SmallVector<int64_t, 4> permutation;
-  inType.getStridePermutation(permutation);
-  if (isPermutationStandardForm(permutation)) {
+  /// Expand a flat/underlying value into the N-D memory layout tensor.
+  auto expandToMemoryLayout = [&](Value input) -> Value {
+    if (input.getType() == memoryType)
+      return input;
+    // Single reassociation group [0, 1, ..., rank-1]: the one input
+    // dimension expands into every dimension of the memory layout.
     SmallVector<ReassociationIndices, 4> reassociationIndex(
-        1, ReassociationIndices(resultType.getRank(), 0));
+        1, ReassociationIndices(memoryType.getRank(), 0));
     std::iota(reassociationIndex[0].begin(), reassociationIndex[0].end(), 0);
-    auto newShape = tensor::ExpandShapeOp::create(rewriter, loc, resultType, in,
-                                                  reassociationIndex);
-    rewriter.replaceOp(op, newShape);
+    return tensor::ExpandShapeOp::create(rewriter, loc, memoryType, input,
+                                         reassociationIndex);
+  };
+
+  /// Invert the stride permutation to transpose from memory order back to
+  /// logical order.
+  auto transposeToLogicalOrder = [&](Value input) -> Value {
+    SmallVector<int64_t, 4> inversePermutation;
+    inType.getStridePermutation(inversePermutation);
+    size_t nDims = inversePermutation.size();
+    // An identity permutation (0, 1, 2, ...) needs no transpose.
+    bool hasTranspose =
+        !llvm::equal(llvm::seq<int64_t>(nDims), inversePermutation);
+    if (!hasTranspose)
+      return input;
+
+    // Calculating the transposed shape and permutation
+    SmallVector<int64_t, 4> permutation, transposedShape;
+    permutation.resize_for_overwrite(nDims);
+    transposedShape.resize_for_overwrite(nDims);
+    RankedTensorType inputType = cast<RankedTensorType>(input.getType());
+    // Every slot is written exactly once as long as inversePermutation is a
+    // true permutation of [0, nDims).
+    for (auto [to, from] : llvm::enumerate(inversePermutation)) {
+      permutation[from] = to;
+      transposedShape[from] = inputType.getShape()[to];
+    }
+
+    Value init = tensor::EmptyOp::create(rewriter, loc, transposedShape,
+                                         inputType.getElementType())
+                     .getResult();
+    return linalg::TransposeOp::create(rewriter, loc, input, init, permutation)
+        .getResult()[0];
+  };
+
+  /// Extract the logical slice when the memory layout is larger than the
+  /// logical shape (broadcast dimensions are collapsed to size 1).
+  auto tryExtractSlice = [&](Value input) -> Value {
+    SmallVector<int64_t, 4> slicingShape(resultType.getShape());
+    // `dim` refers to the element inside slicingShape, so the assignment
+    // below updates the vector in place.
+    for (auto [dim, stride] :
+         llvm::zip_equal(slicingShape, inType.getStrides())) {
+      if (stride == 0)
+        dim = 1;
+    }
+    RankedTensorType inputType = cast<RankedTensorType>(input.getType());
+    if (inputType.getShape() == ArrayRef(slicingShape)) {
+      return input;
+    }
+
+    assert(llvm::none_of(llvm::zip_equal(slicingShape, inputType.getShape()),
+                         [](auto val) {
+                           auto [sliceDim, inputDim] = val;
+                           return sliceDim > inputDim;
+                         }) &&
+           "this should have been checked by the verifier as the memory layout "
+           "must be greater than the logical layout");
+
+    // Slice starts at offset 0 with unit strides and the sizes computed
+    // above.
+    RankedTensorType sliceType = resultType.clone(slicingShape);
+    SmallVector<OpFoldResult, 4> offset(sliceType.getRank(),
+                                        rewriter.getIndexAttr(0)),
+        sizes;
+    llvm::transform(sliceType.getShape(), std::back_inserter(sizes),
+                    [&](int64_t size) { return rewriter.getIndexAttr(size); });
+    SmallVector<OpFoldResult, 4> strides(sliceType.getRank(),
+                                         rewriter.getIndexAttr(1));
+    tensor::ExtractSliceOp extractOp = tensor::ExtractSliceOp::create(
+        rewriter, loc, input, offset, sizes, strides);
+    return extractOp.getResult();
+  };
+
+  /// Broadcast along dimensions whose stride is 0 to reach the full logical
+  /// shape.
+  auto tryBroadcast = [&](Value input) -> Value {
+    if (input.getType() == resultType)
+      return input;
+    // Split the logical dims into those kept as broadcast input
+    // (stride != 0) and those to broadcast over (stride == 0).
+    SmallVector<int64_t, 4> linalgInputShape, broadcastDimensions;
+    for (auto [index, stride, shape] :
+         llvm::enumerate(inType.getStrides(), inType.getShape())) {
+      if (stride != 0)
+        linalgInputShape.push_back(shape);
+      else
+        broadcastDimensions.push_back(index);
+    }
+    // Drop the broadcast dims by collapsing everything into one dimension
+    // and re-expanding to just the non-broadcast dims.
+    SmallVector<ReassociationIndices, 4> reassociationOne(
+        1, ReassociationIndices(resultType.getRank(), 0));
+    SmallVector<ReassociationIndices, 4> reassociationTwo(
+        1, ReassociationIndices(linalgInputShape.size(), 0));
+    std::iota(reassociationOne[0].begin(), reassociationOne[0].end(), 0);
+    std::iota(reassociationTwo[0].begin(), reassociationTwo[0].end(), 0);
+    input =
+        tensor::CollapseShapeOp::create(rewriter, loc, input, reassociationOne);
+    input = tensor::ExpandShapeOp::create(
+        rewriter, loc,
+        RankedTensorType::get(linalgInputShape, resultType.getElementType()),
+        input, reassociationTwo);
+    auto init = tensor::EmptyOp::create(rewriter, loc, resultType.getShape(),
+                                        resultType.getElementType());
+    return linalg::BroadcastOp::create(rewriter, loc, input, init,
+                                       broadcastDimensions)
+        .getResult()[0];
+  };
+
+  Value result = expandToMemoryLayout(adaptor.getIn());
+  result = transposeToLogicalOrder(result);
+
+  // Fast path: expand + transpose already produced the logical type.
+  if (result.getType() == resultType) {
+    rewriter.replaceOp(op, result);
     return success();
   }
 
-  return op.emitError(
-      "input shape is non standard or broadcast; cannot convert this shape");
+  // handle long stride/broadcasting here
+  result = tryExtractSlice(result);
+  result = tryBroadcast(result);
+
+  rewriter.replaceOp(op, result);
+  return success();
 }
 
 LogicalResult AsUnderlyingShapeConverter::matchAndRewrite(
     migraphx::AsUnderlyingShapeOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   Location loc = op.getLoc();
+  migraphx::MIXRShapedType resultType = op.getOut().getType();
   Value in = adaptor.getIn();
-  migraphx::MIXRShapedType resultType = op.getResult().getType();
-  auto resultTensorType =
-      cast<RankedTensorType>(getTypeConverter()->convertType(resultType));
+  RankedTensorType memoryLayoutType = resultType.asMemoryLayoutTensor();
+  RankedTensorType inTensorType = cast<RankedTensorType>(in.getType());
 
-  SmallVector<int64_t, 4> permutation;
-  resultType.getStridePermutation(permutation);
-  if (isPermutationStandardForm(permutation)) {
+  RankedTensorType resultTensorType =
+      dyn_cast<RankedTensorType>(getTypeConverter()->convertType(resultType));
+  if (!resultTensorType)
+    return op.emitOpError("unsupported conversion to underlying shape");
+
+  if (inTensorType == resultTensorType) {
+    rewriter.replaceOp(op, in);
+    return success();
+  }
+
+  /// Transpose from logical order to memory layout order.
+  auto transposeToMemoryOrder = [&](Value input) -> Value {
+    SmallVector<int64_t, 4> permutation;
+    resultType.getStridePermutation(permutation);
+    if (llvm::is_sorted(permutation))
+      return input;
+    RankedTensorType inputType = cast<RankedTensorType>(input.getType());
+    SmallVector<int64_t, 4> transposedShape;
+    llvm::transform(permutation, std::back_inserter(transposedShape),
+                    [&](int64_t p) { return inputType.getShape()[p]; });
+    auto init = tensor::EmptyOp::create(rewriter, loc, transposedShape,
+                                        inputType.getElementType())
+                    .getResult();
+    return linalg::TransposeOp::create(rewriter, loc, input, init, permutation)
+        .getResult()[0];
+  };
+
+  /// Pad via insert_slice when the transposed shape is smaller than the
+  /// memory layout (e.g. due to stride-based padding).
+  auto tryInsertSlice = [&](Value input) -> FailureOr<Value> {
+    if (input.getType() == memoryLayoutType)
+      return input;
+    if (resultType.hasBroadcast())
+      return op.emitOpError(
+          "writing to tensors with broadcasts is unsupported");
+    RankedTensorType inputType = cast<RankedTensorType>(input.getType());
+    for (auto [index, memDim, inDim] :
+         llvm::enumerate(memoryLayoutType.getShape(), inputType.getShape())) {
+      if (memDim < inDim) {
+        return op.emitOpError("memory layout dimension ")
+               << memDim << " is smaller than logical dimension " << inDim
+               << "; this indicates invalid strides";
+      }
+    }
+
+    auto empty =
+        tensor::EmptyOp::create(rewriter, loc, memoryLayoutType.getShape(),
+                                memoryLayoutType.getElementType());
+    int64_t rank = inputType.getRank();
+    SmallVector<OpFoldResult> offsets(rank, rewriter.getIndexAttr(0));
+    SmallVector<OpFoldResult> sizes;
+    for (int64_t dim : inputType.getShape())
+      sizes.push_back(rewriter.getIndexAttr(dim));
+    SmallVector<OpFoldResult> strides(rank, rewriter.getIndexAttr(1));
+    tensor::InsertSliceOp insertSlice = tensor::InsertSliceOp::create(
+        rewriter, loc, input, empty, offsets, sizes, strides);
+    insertSlice->setAttr("rock.is_expand_strides", rewriter.getUnitAttr());
+    return insertSlice.getResult();
+  };
+
+  /// Collapse the N-D memory layout tensor into the flat underlying shape.
+  auto collapseToUnderlying = [&](Value input) -> Value {
+    assert(input.getType() == memoryLayoutType &&
+           "expected memory layout type before collapsing");
     SmallVector<ReassociationIndices, 4> reassociationIndex(
         1, ReassociationIndices(resultType.getRank(), 0));
     std::iota(reassociationIndex[0].begin(), reassociationIndex[0].end(), 0);
-    auto reshape = tensor::CollapseShapeOp::create(
-        rewriter, loc, resultTensorType, in, reassociationIndex);
-    rewriter.replaceOp(op, reshape);
-    return success();
-  }
+    return tensor::CollapseShapeOp::create(rewriter, loc, resultTensorType,
+                                           input, reassociationIndex);
+  };
 
-  return op.emitError(
-      "input shape is non standard or broadcast; cannot convert this shape");
+  FailureOr<Value> result = tryInsertSlice(transposeToMemoryOrder(in));
+  if (failed(result))
+    return failure();
+
+  rewriter.replaceOp(op, collapseToUnderlying(*result));
+  return success();
 }
 
 namespace {
 
@@ -0,0 +1,56 @@
+// RUN:  sed s/##TOKEN_ARCH##/%arch/g %s | rocmlir-opt --linalg-to-rock -verify-diagnostics -split-input-file | FileCheck %s
+
+// Verifies that a tensor.insert_slice tagged with rock.is_expand_strides and
+// writing into a tensor.empty destination is rewritten into a
+// bufferization.alloc_tensor plus rock.expand_strides. Here a 4x24x24 result
+// is inserted into a 4x48x24 destination.
+// CHECK-LABEL: func.func @mlir_dot_log
+// CHECK-SAME: (%[[arg0:.*]]: tensor<1536xf16>, %[[arg1:.*]]: tensor<1536xf16>)
+func.func @mlir_dot_log(%arg0: tensor<1536xf16>, %arg1: tensor<1536xf16>) -> tensor<4608xf16> attributes {rock.kernel, rock.arch="##TOKEN_ARCH##"} {
+  //   CHECK: %[[expanded:.*]] = tensor.expand_shape %[[arg1]]
+  //   CHECK: %[[expanded_0:.*]] = tensor.expand_shape %[[arg0]]
+  %expanded = tensor.expand_shape %arg1 [[0, 1, 2]] output_shape [4, 16, 24] : tensor<1536xf16> into tensor<4x16x24xf16>
+  %expanded_0 = tensor.expand_shape %arg0 [[0, 1, 2]] output_shape [4, 24, 16] : tensor<1536xf16> into tensor<4x24x16xf16>
+  //   CHECK: %[[cst:.*]] = arith.constant dense<0.000000e+00> : tensor<4x24x24xf16>
+  %cst = arith.constant dense<0.000000e+00> : tensor<4x24x24xf16>
+  //   CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() : tensor<4x24x24xf16>
+  //   CHECK: %[[gemm:.*]] = rock.gemm %[[alloc]] = %[[expanded_0]] * %[[expanded]]{{.*}}storeMethod
+  %0 = linalg.batch_matmul ins(%expanded_0, %expanded : tensor<4x24x16xf16>, tensor<4x16x24xf16>) outs(%cst : tensor<4x24x24xf16>) -> tensor<4x24x24xf16>
+  //   CHECK: %[[empty:.*]] = tensor.empty() : tensor<4x24x24xf16>
+  //   CHECK: %[[log:.*]] = linalg.log ins(%[[gemm]]{{.*}}) outs(%[[empty]]{{.*}}) -> tensor<4x24x24xf16>
+  %1 = tensor.empty() : tensor<4x24x24xf16>
+  %2 = linalg.log ins(%0 : tensor<4x24x24xf16>) outs(%1 : tensor<4x24x24xf16>) -> tensor<4x24x24xf16>
+  //   CHECK: %[[alloc2:.*]] = bufferization.alloc_tensor() : tensor<4x48x24xf16>
+  //   CHECK: %[[expand:.*]] = rock.expand_strides %[[log]] into %[[alloc2]]
+  %3 = tensor.empty() : tensor<4x48x24xf16>
+  %inserted_slice = tensor.insert_slice %2 into %3[0, 0, 0] [4, 24, 24] [1, 1, 1] {rock.is_expand_strides}: tensor<4x24x24xf16> into tensor<4x48x24xf16>
+  //   CHECK: %[[collapsed:.*]] = tensor.collapse_shape %[[expand]]
+  //   CHECK: return %[[collapsed]]
+  %collapsed = tensor.collapse_shape %inserted_slice [[0, 1, 2]] : tensor<4x48x24xf16> into tensor<4608xf16>
+  return %collapsed : tensor<4608xf16>
+}
+
+// -----
+
+
+// Same rewrite as the previous split, with a destination extent that is not
+// a multiple of the source extent: a 4x5x24 result is inserted into a
+// 4x12x24 destination.
+// CHECK-LABEL: func.func @mlir_dot_log
+// CHECK-SAME: (%[[arg0:.*]]: tensor<320xf16>, %[[arg1:.*]]: tensor<1536xf16>)
+func.func @mlir_dot_log(%arg0: tensor<320xf16>, %arg1: tensor<1536xf16>) -> tensor<1152xf16> attributes {rock.kernel, rock.arch="##TOKEN_ARCH##"} {
+  //   CHECK: %[[expanded:.*]] = tensor.expand_shape %[[arg1]]
+  //   CHECK: %[[expanded_0:.*]] = tensor.expand_shape %[[arg0]]
+  %expanded = tensor.expand_shape %arg1 [[0, 1, 2]] output_shape [4, 16, 24] : tensor<1536xf16> into tensor<4x16x24xf16>
+  %expanded_0 = tensor.expand_shape %arg0 [[0, 1, 2]] output_shape [4, 5, 16] : tensor<320xf16> into tensor<4x5x16xf16>
+  //   CHECK: %[[cst:.*]] = arith.constant dense<0.000000e+00> : tensor<4x5x24xf16>
+  %cst = arith.constant dense<0.000000e+00> : tensor<4x5x24xf16>
+  //   CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() : tensor<4x5x24xf16>
+  //   CHECK: %[[gemm:.*]] = rock.gemm %[[alloc]] = %[[expanded_0]] * %[[expanded]]{{.*}}storeMethod
+  %0 = linalg.batch_matmul ins(%expanded_0, %expanded : tensor<4x5x16xf16>, tensor<4x16x24xf16>) outs(%cst : tensor<4x5x24xf16>) -> tensor<4x5x24xf16>
+  //   CHECK: %[[empty:.*]] = tensor.empty() : tensor<4x5x24xf16>
+  //   CHECK: %[[log:.*]] = linalg.log ins(%[[gemm]]{{.*}}) outs(%[[empty]]{{.*}}) -> tensor<4x5x24xf16>
+  %1 = tensor.empty() : tensor<4x5x24xf16>
+  %2 = linalg.log ins(%0 : tensor<4x5x24xf16>) outs(%1 : tensor<4x5x24xf16>) -> tensor<4x5x24xf16>
+  //   CHECK: %[[alloc2:.*]] = bufferization.alloc_tensor() : tensor<4x12x24xf16>
+  //   CHECK: %[[expand:.*]] = rock.expand_strides %[[log]] into %[[alloc2]]
+  %3 = tensor.empty() : tensor<4x12x24xf16>
+  %inserted_slice = tensor.insert_slice %2 into %3[0, 0, 0] [4, 5, 24] [1, 1, 1] {rock.is_expand_strides} : tensor<4x5x24xf16> into tensor<4x12x24xf16>
+  //   CHECK: %[[collapsed:.*]] = tensor.collapse_shape %[[expand]]
+  //   CHECK: return %[[collapsed]]
+  %collapsed = tensor.collapse_shape %inserted_slice [[0, 1, 2]] : tensor<4x12x24xf16> into tensor<1152xf16>
+  return %collapsed : tensor<1152xf16>
+}