Crash when using gpu-kernel-outlining pass in schedule #88

@charithaintc

Description

I have been experimenting with different schedules and am getting the following error. I am not sure how to fix it; can anyone help?

python examples/xegpu/softmax.py --dump-schedule --dump-kernel=initial | /home/jovyan/llvm-project/build_llvm_upstream_python/bin/mlir-opt --pass-pipeline="builtin.module(transform-interpreter)"
LLVM ERROR: Loading a dialect (dlti) while in a multi-threaded execution context (maybe the PassManager): this can indicate a missing `dependentDialects` in a pass for example.
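For context on the error: MLIR raises this when a dialect is loaded on the fly while the pass manager is executing in parallel. My guess (unverified) is that gpu-kernel-outlining pulls in the dlti dialect as a dependent dialect, and running it through transform.apply_registered_pass defers that load to a point where the transform-interpreter pass is already inside a multi-threaded context. One way to test this theory is to rerun single-threaded (--mlir-disable-threading is a standard mlir-opt flag):

python examples/xegpu/softmax.py --dump-schedule --dump-kernel=initial | /home/jovyan/llvm-project/build_llvm_upstream_python/bin/mlir-opt --mlir-disable-threading --pass-pipeline="builtin.module(transform-interpreter)"

If that runs, the schedule is probably fine and the problem is in how dependent dialects get loaded for passes invoked from the interpreter.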

Below are my payload and transform schedule.

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
module {
  func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) attributes {llvm.emit_c_interface} {
    %0 = bufferization.to_tensor %arg0 restrict writable : memref<1024x64xf32> to tensor<1024x64xf32>
    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
    %cst = arith.constant 0xFF800000 : f32
    %2 = tensor.empty() : tensor<1024xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1024xf32>) -> tensor<1024xf32>
    %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%1 : tensor<1024x64xf32>) outs(%3 : tensor<1024xf32>) {
    ^bb0(%in: f32, %out: f32):
      %12 = arith.maximumf %in, %out : f32
      linalg.yield %12 : f32
    } -> tensor<1024xf32>
    %5 = tensor.empty() : tensor<1024x64xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %4 : tensor<1024x64xf32>, tensor<1024xf32>) outs(%5 : tensor<1024x64xf32>) {
    ^bb0(%in: f32, %in_1: f32, %out: f32):
      %12 = arith.subf %in, %in_1 : f32
      linalg.yield %12 : f32
    } -> tensor<1024x64xf32>
    %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<1024x64xf32>) outs(%5 : tensor<1024x64xf32>) {
    ^bb0(%in: f32, %out: f32):
      %12 = math.exp %in : f32
      linalg.yield %12 : f32
    } -> tensor<1024x64xf32>
    %8 = tensor.empty() : tensor<1024xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<1024xf32>) -> tensor<1024xf32>
    %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%7 : tensor<1024x64xf32>) outs(%9 : tensor<1024xf32>) {
    ^bb0(%in: f32, %out: f32):
      %12 = arith.addf %in, %out : f32
      linalg.yield %12 : f32
    } -> tensor<1024xf32>
    %11 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %10 : tensor<1024x64xf32>, tensor<1024xf32>) outs(%5 : tensor<1024x64xf32>) {
    ^bb0(%in: f32, %in_1: f32, %out: f32):
      %12 = arith.divf %in, %in_1 : f32
      linalg.yield %12 : f32
    } -> tensor<1024x64xf32>
    bufferization.materialize_in_destination %11 in restrict writable %arg0 : (tensor<1024x64xf32>, memref<1024x64xf32>) -> ()
    return
  }
  func.func @gpu_alloc_2d_f32(%arg0: i32, %arg1: i32) -> memref<?x?xf32> attributes {llvm.emit_c_interface} {
    %0 = arith.index_cast %arg0 : i32 to index
    %1 = arith.index_cast %arg1 : i32 to index
    %memref = gpu.alloc  (%0, %1) : memref<?x?xf32>
    return %memref : memref<?x?xf32>
  }
  func.func @gpu_dealloc_2d_f32(%arg0: memref<?x?xf32>) attributes {llvm.emit_c_interface} {
    gpu.dealloc  %arg0 : memref<?x?xf32>
    return
  }
  func.func @gpu_copy_2d_f32(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>) attributes {llvm.emit_c_interface} {
    gpu.memcpy  %arg1, %arg0 : memref<?x?xf32>, memref<?x?xf32>
    return
  }
}
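A possible workaround I have not verified: attach any dlti attribute to the payload module so the parser loads the dlti dialect up front, before the pass pipeline starts; loading an already-loaded dialect should not trip the multi-threaded check. A minimal sketch (the index entry is arbitrary, chosen only so that some dlti attribute is present):

// Hypothetical: the dl_spec serves only to pre-load the dlti dialect at parse time.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 64 : i32>>} {
  // ... payload functions as above ...
}

And the transform schedule: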

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %1:5 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
    %tiled_op, %forall_op = transform.structured.tile_using_forall %1#4 tile_sizes [64] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %1#3 into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op_0, %new_containing_op_1 = transform.structured.fuse_into_containing_op %1#2 into %new_containing_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op_2, %new_containing_op_3 = transform.structured.fuse_into_containing_op %1#1 into %new_containing_op_1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op_4, %new_containing_op_5 = transform.structured.fuse_into_containing_op %1#0 into %new_containing_op_3 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %2 = transform.get_parent_op %new_containing_op_5 {deduplicate, op_name = "func.func"} : (!transform.any_op) -> !transform.any_op
    transform.apply_cse to %2 : !transform.any_op
    transform.apply_patterns to %2 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %3 = transform.apply_registered_pass "eliminate-empty-tensors" to %2 : (!transform.any_op) -> !transform.any_op
    %4 = transform.structured.vectorize_children_and_apply_patterns %3 {fold_type_extensions_into_contract} : (!transform.any_op) -> !transform.any_op
    %5 = transform.get_parent_op %4 {deduplicate, op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op
    %6 = transform.bufferization.one_shot_bufferize layout{IdentityLayoutMap} %5 {allow_return_allocs_from_loops = true, bufferize_function_boundaries = true} : (!transform.any_op) -> !transform.any_op
    %7 = transform.apply_registered_pass "fold-memref-alias-ops" to %6 : (!transform.any_op) -> !transform.any_op
    transform.apply_cse to %7 : !transform.any_op
    transform.apply_patterns to %7 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %8 = transform.structured.match ops{["scf.forall"]} in %7 : (!transform.any_op) -> !transform.any_op
    %9 = transform.split_handle %8 : (!transform.any_op) -> !transform.any_op
    %10 = transform.loop.forall_to_parallel %9 : (!transform.any_op) -> !transform.any_op
    %11 = transform.get_parent_op %10 : (!transform.any_op) -> !transform.any_op
    %12 = transform.apply_registered_pass "gpu-map-parallel-loops" to %11 : (!transform.any_op) -> !transform.any_op
    %13 = transform.apply_registered_pass "convert-parallel-loops-to-gpu" to %12 : (!transform.any_op) -> !transform.any_op
    %14 = transform.apply_registered_pass "lower-affine" to %13 : (!transform.any_op) -> !transform.any_op
    transform.apply_cse to %14 : !transform.any_op
    transform.apply_patterns to %14 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %15 = transform.structured.match ops{["gpu.launch"]} in %14 : (!transform.any_op) -> !transform.any_op
    %16 = transform.split_handle %15 : (!transform.any_op) -> !transform.any_op
    transform.xegpu.set_gpu_launch_threads %16 threads = [128, 1, 1] : !transform.any_op
    %17 = transform.apply_registered_pass "lower-affine" to %14 : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %17 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %18 = transform.apply_registered_pass "gpu-launch-sink-index-computations" to %17 : (!transform.any_op) -> !transform.any_op
    %19 = transform.get_parent_op %18 {deduplicate, op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op
    %20 = transform.apply_registered_pass "gpu-kernel-outlining" to %19 : (!transform.any_op) -> !transform.any_op
    transform.yield 
  }
}
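To narrow this down, here is a minimal reproducer sketch I would try (untested; it uses the same match / get_parent_op / apply_registered_pass pattern as the last steps of my schedule, applied to a trivial gpu.launch):

module {
  // Hypothetical host function containing an empty kernel launch.
  func.func @host() {
    %c1 = arith.constant 1 : index
    gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
               threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
      gpu.terminator
    }
    return
  }
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %1 = transform.get_parent_op %0 {deduplicate, op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op
    %2 = transform.apply_registered_pass "gpu-kernel-outlining" to %1 : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

If this alone crashes with the same dlti error, the failure is in the pass/interpreter interaction rather than in the rest of the schedule.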

Is there anything I am doing wrong?
