Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ if(AUTO_DETECT_DEVICES)
# NVIDIA
set(NVIDIA_FOUND FALSE)

file(GLOB NVIDIA_DEV_FILES "/dev/nvidia*")
file(GLOB NVIDIA_DEV_FILES "/dev/nvidia0")

if(NVIDIA_DEV_FILES)
set(NVIDIA_FOUND TRUE)
Expand Down
12 changes: 11 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,22 @@ foreach(source_file ${EXAMPLE_SOURCES})
target_link_libraries(${example_name} PRIVATE infiniccl)

# Add runtime and backend dependencies for direct runtime/backend usage.
if(WITH_NVIDIA OR WITH_ILUVATAR)
if(WITH_NVIDIA)
target_link_libraries(${example_name} PRIVATE CUDA::cudart)
endif()

if(WITH_ILUVATAR)
set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX)
set_target_properties(${example_name} PROPERTIES
RULE_LAUNCH_COMPILE "${ILUVATAR_CUDA_COMPILER} "
)
target_compile_options(${example_name} PRIVATE ${ILUVATAR_CUDA_FLAGS} "-Wno-unused-command-line-argument")
target_link_libraries(${example_name} PRIVATE CUDA::cudart CUDA::cuda_driver)
endif()

if(WITH_METAX)
target_link_libraries(${example_name} PRIVATE ${MACA_RUNTIME_LIB})
target_compile_options(${example_name} PRIVATE "-x" "maca")
endif()

if(WITH_MOORE)
Expand Down
20 changes: 14 additions & 6 deletions scripts/gen_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
"""

# Hardware traits to look for in device directories.
DEVICE_TRAIT_HEADERS = ["device_.h", "runtime_.h", "data_type_.h"]
DEVICE_TRAIT_FILES = {
"caster_": ["h", "cuh"],
"device_": ["h"],
"runtime_": ["h", "cuh"],
"data_type_": ["h", "cuh"],
}

# Map logical backend names (from CMake) to their internal source paths.
BACKEND_PATH_MAP = {"ompi": "ompi/impl", "mpich": "ompi/impl", "nccl": "nvidia/nccl"}
Expand Down Expand Up @@ -97,11 +102,14 @@ def generate(project_root, output_dir, devices, backends):
device_included = False
manifest_lines.append(f"\n// --- DEVICE: {dev.upper()} ---")

for trait in DEVICE_TRAIT_HEADERS:
rel_path = f"{dev}/{trait}"
if os.path.exists(os.path.join(src_dir, rel_path)):
manifest_lines.append(f'#include "{rel_path}"')
device_included = True
for trait, extensions in DEVICE_TRAIT_FILES.items():
for ext in extensions:
rel_path = f"{dev}/{trait}.{ext}"

if os.path.exists(os.path.join(src_dir, rel_path)):
manifest_lines.append(f'#include "{rel_path}"')
device_included = True
break

if device_included:
found_devices.append(f"Device::Type::k{dev.capitalize()}")
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ if(WITH_ILUVATAR)
set_source_files_properties(${ILUVATAR_SOURCES} PROPERTIES
COMPILE_OPTIONS "${ILUVATAR_CUDA_FLAGS}"
)
set(CMAKE_CXX_COMPILER "${ILUVATAR_CUDA_COMPILER}" CACHE FILEPATH "Iluvatar Compiler" FORCE)
target_compile_options(infiniccl PRIVATE ${ILUVATAR_CUDA_FLAGS})

target_link_libraries(infiniccl PRIVATE CUDA::cudart CUDA::cuda_driver)
endif()
Expand Down
49 changes: 49 additions & 0 deletions src/cambricon/caster.mlu
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "caster_.h"

namespace infini::ccl {

float HardwareCastImpl<Device::Type::kCambricon, float, CambriconFP16>::Apply(
CambriconFP16 x) {
const __half& native_x = *reinterpret_cast<const __half*>(&x.bits);
return __half2float(native_x);
}

CambriconFP16 HardwareCastImpl<Device::Type::kCambricon, CambriconFP16,
float>::Apply(float x) {
__half native_res = __float2half__(x);
CambriconFP16 out;
out.bits = *reinterpret_cast<uint16_t*>(&native_res);
return out;
}

CambriconFP16
HardwareCastImpl<Device::Type::kCambricon, CambriconFP16, int>::Apply(int x) {
__half native_res = __int2half_rn__(x);
CambriconFP16 out;
out.bits = *reinterpret_cast<uint16_t*>(&native_res);
return out;
}

float HardwareCastImpl<Device::Type::kCambricon, float, CambriconBF16>::Apply(
CambriconBF16 x) {
const bfloat16_t& native_x = *reinterpret_cast<const bfloat16_t*>(&x.bits);
return __bfloat162float__(native_x);
}

CambriconBF16 HardwareCastImpl<Device::Type::kCambricon, CambriconBF16,
float>::Apply(float x) {
bfloat16_t native_res = __float2bfloat16__(x);
CambriconBF16 out;
out.bits = *reinterpret_cast<uint16_t*>(&native_res);
return out;
}

CambriconBF16
HardwareCastImpl<Device::Type::kCambricon, CambriconBF16, int>::Apply(int x) {
bfloat16_t native_res = __int2bfloat16_rn__(x);
CambriconBF16 out;
out.bits = *reinterpret_cast<uint16_t*>(&native_res);
return out;
}

} // namespace infini::ccl
41 changes: 41 additions & 0 deletions src/cambricon/caster_.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#ifndef INFINI_CCL_CAMBRICON_CASTER__H_
#define INFINI_CCL_CAMBRICON_CASTER__H_

#include "caster.h"
#include "data_type_.h"

namespace infini::ccl {

template <>
struct HardwareCastImpl<Device::Type::kCambricon, float, CambriconFP16> {
static float Apply(CambriconFP16 x);
};

template <>
struct HardwareCastImpl<Device::Type::kCambricon, CambriconFP16, float> {
static CambriconFP16 Apply(float x);
};

template <>
struct HardwareCastImpl<Device::Type::kCambricon, CambriconFP16, int> {
static CambriconFP16 Apply(int x);
};

template <>
struct HardwareCastImpl<Device::Type::kCambricon, float, CambriconBF16> {
static float Apply(CambriconBF16 x);
};

template <>
struct HardwareCastImpl<Device::Type::kCambricon, CambriconBF16, float> {
static CambriconBF16 Apply(float x);
};

template <>
struct HardwareCastImpl<Device::Type::kCambricon, CambriconBF16, int> {
static CambriconBF16 Apply(int x);
};

} // namespace infini::ccl

#endif // INFINI_CCL_CAMBRICON_CASTER__H_
27 changes: 11 additions & 16 deletions src/cambricon/data_type_.h
Original file line number Diff line number Diff line change
@@ -1,33 +1,28 @@
#ifndef INFINI_CCL_CAMBRICON_DATA_TYPE__H_
#define INFINI_CCL_CAMBRICON_DATA_TYPE__H_

#if defined(__mlu__) || defined(__BANG__)
#include "bang_bf16.h"
#else
// If compiling on the host side using standard GCC, mock the storage types
// to prevent header poisoning and avoid `bang_fp16.h` dependency altogether.
#ifndef HAS_CAMBRICON_HOST_HALF_MOCKS
#define HAS_CAMBRICON_HOST_HALF_MOCKS
#include <cstdint>

typedef uint16_t half;
typedef uint16_t bfloat16_t;

#endif
#endif

#include "cambricon/device_.h"
#include "data_type_impl.h"

namespace infini::ccl {

// Unique tag types to force completely distinct template signatures.
struct CambriconFP16 {
uint16_t bits;
};
struct CambriconBF16 {
uint16_t bits;
};

template <>
struct TypeMap<Device::Type::kCambricon, DataType::kFloat16> {
using type = half;
using type = CambriconFP16;
};

template <>
struct TypeMap<Device::Type::kCambricon, DataType::kBFloat16> {
using type = bfloat16_t;
using type = CambriconBF16;
};

} // namespace infini::ccl
Expand Down
83 changes: 83 additions & 0 deletions src/caster.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#ifndef INFINI_CCL_CASTER_H_
#define INFINI_CCL_CASTER_H_

#include <type_traits>
#include <typeinfo>
#include <utility>

#include "device.h"

namespace infini::ccl {

// Check if a type is complete (i.e. fully defined) at compile-time.
template <typename T, typename = void>
struct IsComplete : std::false_type {};

template <typename T>
struct IsComplete<T, std::void_t<decltype(sizeof(T))>> : std::true_type {};

template <Device::Type kDev, typename Dst, typename Src>
struct HardwareCastImpl;

template <Device::Type kDev, typename Dst, typename Src>
inline constexpr bool HasHardwareCastV =
IsComplete<HardwareCastImpl<kDev, Dst, Src>>::value;

// Intermediate type used for fallback conversions, usually `float`.
template <Device::Type kDev>
struct CastBridge {
using Type = float;
};

template <Device::Type kDev>
struct Caster {
template <typename Dst, typename Src>
static constexpr Dst Cast(Src&& x) {
using PureSrc = std::remove_cv_t<std::remove_reference_t<Src>>;
using PureDst = std::remove_cv_t<std::remove_reference_t<Dst>>;
using Bridge = typename CastBridge<kDev>::Type;

if constexpr (std::is_same_v<PureSrc, PureDst>) {
return std::forward<Src>(x);
} else if constexpr (HasHardwareCastV<kDev, PureDst, PureSrc>) {
return HardwareCastImpl<kDev, PureDst, PureSrc>::Apply(
std::forward<Src>(x));
} else if constexpr (!std::is_same_v<PureSrc, Bridge> &&
!std::is_same_v<PureDst, Bridge> &&
HasHardwareCastV<kDev, Bridge, PureSrc> &&
(HasHardwareCastV<kDev, PureDst, Bridge> ||
std::is_arithmetic_v<PureDst>)) {
Bridge tmp = Cast<Bridge>(std::forward<Src>(x));
return Cast<PureDst>(tmp);
} else if constexpr (!std::is_same_v<PureDst, Bridge> &&
std::is_arithmetic_v<PureSrc> &&
HasHardwareCastV<kDev, PureDst, Bridge>) {
Bridge tmp = static_cast<Bridge>(std::forward<Src>(x));
return Cast<PureDst>(tmp);
} else if constexpr (std::is_arithmetic_v<PureSrc> &&
std::is_arithmetic_v<PureDst>) {
return static_cast<PureDst>(std::forward<Src>(x));
} else {
static_assert(HasHardwareCastV<kDev, PureDst, PureSrc>,
"no cast path available. "
"Need to provide `HardwareCastImpl` specialization.");
}
}
};

// Convenience wrapper for casting between two types.
// Otherwise, need to write `Caster<kDev>::template Cast<Target>(val)` every
// time.
template <Device::Type kDev, typename Target, typename Source>
inline Target CastTo(Source&& val) {
return Caster<kDev>::template Cast<Target>(std::forward<Source>(val));
}

template <Device::Type kDev, typename Source>
inline float ToFloat(Source&& val) {
return Caster<kDev>::template Cast<float>(std::forward<Source>(val));
}

} // namespace infini::ccl

#endif // INFINI_CCL_CASTER_H_
31 changes: 31 additions & 0 deletions src/cpu/caster_.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef INFINI_CCL_CPU_CASTER__H_
#define INFINI_CCL_CPU_CASTER__H_

#include "caster.h"
#include "data_type_.h"

namespace infini::ccl {

template <>
struct HardwareCastImpl<Device::Type::kCpu, float, Float16> {
static float Apply(Float16 x) { return x.ToFloat(); }
};

template <>
struct HardwareCastImpl<Device::Type::kCpu, Float16, float> {
static Float16 Apply(float x) { return Float16::FromFloat(x); }
};

template <>
struct HardwareCastImpl<Device::Type::kCpu, float, BFloat16> {
static float Apply(BFloat16 x) { return x.ToFloat(); }
};

template <>
struct HardwareCastImpl<Device::Type::kCpu, BFloat16, float> {
static BFloat16 Apply(float x) { return BFloat16::FromFloat(x); }
};

} // namespace infini::ccl

#endif // INFINI_CCL_CPU_CASTER__H_
51 changes: 51 additions & 0 deletions src/iluvatar/caster_.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#ifndef INFINI_CCL_ILUVATAR_CASTER__CUH_
#define INFINI_CCL_ILUVATAR_CASTER__CUH_

#include "caster.h"
#include "data_type_.h"
#include "traits.h"

namespace infini::ccl {

// Explicitly tell the trait system that host-side math on these types is
// unworkable on Iluvatar.
template <typename S, typename Op>
struct SupportsOp<half, S, Op, void> : std::false_type {};

template <typename S, typename Op>
struct SupportsOp<__nv_bfloat16, S, Op, void> : std::false_type {};

template <>
struct HardwareCastImpl<Device::Type::kIluvatar, float, half> {
__host__ __device__ static float Apply(half x) { return __half2float(x); }
};

template <>
struct HardwareCastImpl<Device::Type::kIluvatar, float, __nv_bfloat16> {
__host__ __device__ static float Apply(__nv_bfloat16 x) {
return __bfloat162float(x);
}
};

template <>
struct HardwareCastImpl<Device::Type::kIluvatar, half, float> {
__host__ __device__ static half Apply(float x) { return __float2half(x); }
};

template <>
struct HardwareCastImpl<Device::Type::kIluvatar, __nv_bfloat16, float> {
__host__ __device__ static __nv_bfloat16 Apply(float x) {
return __float2bfloat16(x);
}
};

template <>
struct HardwareCastImpl<Device::Type::kIluvatar, __nv_bfloat16, double> {
__host__ __device__ static __nv_bfloat16 Apply(double x) {
return __double2bfloat16(x);
}
};

} // namespace infini::ccl

#endif // INFINI_CCL_ILUVATAR_CASTER__CUH_
Loading
Loading