Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions include/infiniop/ops/awq_marlin_repack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef __INFINIOP_AWQ_MARLIN_REPACK_API_H__
#define __INFINIOP_AWQ_MARLIN_REPACK_API_H__

#include "../operator_descriptor.h"
#include <cstdint>

typedef struct InfiniopDescriptor *infiniopAwqMarlinRepackDescriptor_t;

__INFINI_C __export infiniStatus_t infiniopCreateAwqMarlinRepackDescriptor(infiniopHandle_t handle,
infiniopAwqMarlinRepackDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
int64_t num_bits,
bool is_a_8bit);

__INFINI_C __export infiniStatus_t infiniopGetAwqMarlinRepackWorkspaceSize(infiniopAwqMarlinRepackDescriptor_t desc, size_t *size);

__INFINI_C __export infiniStatus_t infiniopAwqMarlinRepack(infiniopAwqMarlinRepackDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream);

__INFINI_C __export infiniStatus_t infiniopDestroyAwqMarlinRepackDescriptor(infiniopAwqMarlinRepackDescriptor_t desc);

#endif
29 changes: 29 additions & 0 deletions include/infiniop/ops/gptq_marlin_repack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef __INFINIOP_GPTQ_MARLIN_REPACK_API_H__
#define __INFINIOP_GPTQ_MARLIN_REPACK_API_H__

#include "../operator_descriptor.h"
#include <cstdint>

typedef struct InfiniopDescriptor *infiniopGptqMarlinRepackDescriptor_t;

__INFINI_C __export infiniStatus_t infiniopCreateGptqMarlinRepackDescriptor(infiniopHandle_t handle,
infiniopGptqMarlinRepackDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t perm_desc,
int64_t num_bits,
bool is_a_8bit);

__INFINI_C __export infiniStatus_t infiniopGetGptqMarlinRepackWorkspaceSize(infiniopGptqMarlinRepackDescriptor_t desc, size_t *size);

__INFINI_C __export infiniStatus_t infiniopGptqMarlinRepack(infiniopGptqMarlinRepackDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
const void *perm,
void *stream);

__INFINI_C __export infiniStatus_t infiniopDestroyGptqMarlinRepackDescriptor(infiniopGptqMarlinRepackDescriptor_t desc);

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ void run_with_workspace(void *planned_meta) {
return tensor->numel() == 0 ? nullptr : tensor->data();
};

context::setDeviceMemoryAsync(planned->workspace->data(), 0, planned->workspace->nbytes(), context::getStream());
INFINICORE_CHECK_ERROR(infiniopGptqMarlinGemm(
planned->descriptor->desc,
planned->workspace->data(),
Expand Down Expand Up @@ -183,6 +184,7 @@ void direct_with_workspace(Tensor workspace, Tensor out, const Tensor &a, const
return tensor->numel() == 0 ? nullptr : tensor->data();
};

context::setDeviceMemoryAsync(workspace->data(), 0, workspace->nbytes(), context::getStream());
INFINICORE_CHECK_ERROR(infiniopGptqMarlinGemm(
descriptor->desc,
workspace->data(),
Expand Down
2 changes: 1 addition & 1 deletion src/infiniop/ops/add/add.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@
}; \
}

#endif // ADD_H
#endif // ADD_H
109 changes: 100 additions & 9 deletions src/infiniop/ops/add/nvidia/add_nvidia.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,86 @@
#include "add_nvidia.cuh"

namespace op::add::nvidia {
namespace {

template <typename T>
INFINIOP_CUDA_KERNEL addKernel(
size_t output_size,
size_t ndim,
bool output_contiguous,
const bool *__restrict__ input_contiguous,
const bool *__restrict__ input_broadcasted,
const size_t *__restrict__ output_shape,
const size_t *__restrict__ input_shapes,
const ptrdiff_t *__restrict__ output_strides,
const ptrdiff_t *__restrict__ input_strides,
T *output,
const T *__restrict__ a,
const T *__restrict__ b,
size_t offset) {

size_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
if (idx >= output_size) {
return;
}

size_t out_idx = op::elementwise::nvidia::getOutputIndex(idx, output_contiguous, ndim, output_shape, output_strides);
op::elementwise::nvidia::InputIndexer indexer{
idx, ndim, input_contiguous, input_broadcasted, input_shapes, input_strides, output_strides};
output[out_idx] = cuda::AddOp{}(a[indexer(0)], b[indexer(1)]);
}

template <typename T>
infiniStatus_t launchAddKernel(
const op::elementwise::ElementwiseInfo &info,
const std::shared_ptr<device::nvidia::Handle::Internal> &internal,
void *workspace,
void *output,
const void *a,
const void *b,
cudaStream_t stream) {

auto output_size = info.getOutputSize();
if (output_size == 0) {
return INFINI_STATUS_SUCCESS;
}

auto ndim = info.getNdim();
auto *d_meta_start = reinterpret_cast<int8_t *>(workspace);
CHECK_CUDA(cudaMemcpyAsync(d_meta_start, info.getMetaStart(), info.getMetaMemSize(), cudaMemcpyHostToDevice, stream));

auto *d_output_shape = reinterpret_cast<const size_t *>(d_meta_start);
auto *d_output_strides = reinterpret_cast<const ptrdiff_t *>(d_output_shape + ndim);
auto *d_input_shapes = reinterpret_cast<const size_t *>(d_output_strides + ndim);
auto *d_input_strides = reinterpret_cast<const ptrdiff_t *>(d_input_shapes + info.getInputSize() * ndim);
auto *d_input_contiguous = reinterpret_cast<const bool *>(d_input_strides + info.getInputSize() * ndim);
auto *d_input_broadcasted = reinterpret_cast<const bool *>(d_input_contiguous + info.getInputSize());

dim3 block_dims(std::min(256U, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
dim3 grid_dims(std::min(uint32_t(CEIL_DIV(output_size, block_dims.x)), static_cast<uint32_t>(internal->gridSizeX())));
size_t step = grid_dims.x * block_dims.x;

for (size_t i = 0; i < output_size; i += step) {
addKernel<T><<<grid_dims, block_dims, 0, stream>>>(
output_size,
ndim,
info.isOutputContiguous(),
d_input_contiguous,
d_input_broadcasted,
d_output_shape,
d_input_shapes,
d_output_strides,
d_input_strides,
reinterpret_cast<T *>(output),
reinterpret_cast<const T *>(a),
reinterpret_cast<const T *>(b),
i);
}

return INFINI_STATUS_SUCCESS;
}

} // namespace

Descriptor::~Descriptor() = default;

Expand All @@ -26,8 +106,18 @@ infiniStatus_t Descriptor::create(

CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
CHECK_RESULT(info_result);
auto info = info_result.take();
auto workspace_size = info.getMetaMemSize();

*desc_ptr = new Descriptor(
dtype,
std::move(info),
handle->internal(),
workspace_size,
handle->device,
handle->device_id);

return INFINI_STATUS_SUCCESS;
}
Expand All @@ -36,7 +126,8 @@ infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
const void *a,
const void *b,
void *stream) const {

if (workspace_size < _workspace_size) {
Expand All @@ -45,17 +136,17 @@ infiniStatus_t Descriptor::calculate(

switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
return launchAddKernel<half>(_info, _internal, workspace, output, a, b, reinterpret_cast<cudaStream_t>(stream));
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
return launchAddKernel<cuda_bfloat16>(_info, _internal, workspace, output, a, b, reinterpret_cast<cudaStream_t>(stream));
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
return launchAddKernel<float>(_info, _internal, workspace, output, a, b, reinterpret_cast<cudaStream_t>(stream));
case INFINI_DTYPE_I32:
return _device_info->calculate<256, cuda::AddOp, int32_t>(_info, workspace, output, inputs, stream);
return launchAddKernel<int32_t>(_info, _internal, workspace, output, a, b, reinterpret_cast<cudaStream_t>(stream));
case INFINI_DTYPE_I64:
return _device_info->calculate<256, cuda::AddOp, int64_t>(_info, workspace, output, inputs, stream);
return launchAddKernel<int64_t>(_info, _internal, workspace, output, a, b, reinterpret_cast<cudaStream_t>(stream));
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
return launchAddKernel<double>(_info, _internal, workspace, output, a, b, reinterpret_cast<cudaStream_t>(stream));
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
Expand Down
42 changes: 41 additions & 1 deletion src/infiniop/ops/add/nvidia/add_nvidia.cuh
Original file line number Diff line number Diff line change
@@ -1,8 +1,48 @@
#ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__

#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

ELEMENTWISE_DESCRIPTOR(add, nvidia)
namespace op::add::nvidia {
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::shared_ptr<device::nvidia::Handle::Internal> _internal;
size_t _workspace_size;

Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
std::shared_ptr<device::nvidia::Handle::Internal> internal,
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_internal(std::move(internal)),
_workspace_size(workspace_size) {}

public:
~Descriptor();

size_t workspaceSize() const { return _workspace_size; }

static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs);

infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *a,
const void *b,
void *stream) const;
};
} // namespace op::add::nvidia

#endif // __ADD_CUDA_API_H__
47 changes: 43 additions & 4 deletions src/infiniop/ops/add/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#ifdef ENABLE_ASCEND_API
#include "ascend/add_ascend.h"
#endif
#include <vector>

__INFINI_C infiniStatus_t infiniopCreateAddDescriptor(
infiniopHandle_t handle,
Expand Down Expand Up @@ -132,6 +133,36 @@ __INFINI_C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t de
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

namespace {

template <typename Descriptor>
infiniStatus_t calculateAdd(
const Descriptor *desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
const std::vector<const void *> inputs{a, b};
return desc->calculate(workspace, workspace_size, c, inputs, stream);
}

#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
infiniStatus_t calculateAdd(
const op::add::nvidia::Descriptor *desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
return desc->calculate(workspace, workspace_size, c, a, b, stream);
}
#endif

} // namespace

__INFINI_C infiniStatus_t infiniopAdd(
infiniopAddDescriptor_t desc,
void *workspace,
Expand All @@ -141,10 +172,18 @@ __INFINI_C infiniStatus_t infiniopAdd(
const void *b,
void *stream) {

#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
// NVIDIA Add keeps explicit a/b pointers because the generic elementwise
// input-vector path copies inputs.data() from host to device workspace before
// launching the kernel. During CUDA graph capture, that H2D node records the
// host source address; if infiniopAdd used a temporary vector such as {a, b},
// graph replay could later read from an invalid host address and copy bad input
// pointers into device workspace. Other backends keep their original vector
// interface through calculateAdd's default forwarding path.
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return calculateAdd( \
reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc), \
workspace, workspace_size, c, a, b, stream)

switch (desc->device_type) {

Expand Down
48 changes: 48 additions & 0 deletions src/infiniop/ops/awq_marlin_repack/awq_marlin_repack.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef AWQ_MARLIN_REPACK_H
#define AWQ_MARLIN_REPACK_H

#include "../../operator.h"
#include "info.h"

#define DESCRIPTOR(NAMESPACE) \
\
namespace op::awq_marlin_repack::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AwqMarlinRepackInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AwqMarlinRepackInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
int64_t num_bits, \
bool is_a_8bit); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
const void *input, \
void *stream) const; \
}; \
}

#endif // AWQ_MARLIN_REPACK_H
Loading
Loading