diff --git a/acceleration/interface/unordered_hash.hpp b/acceleration/interface/unordered_hash.hpp
new file mode 100644
index 0000000..479d5a4
--- /dev/null
+++ b/acceleration/interface/unordered_hash.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace detail {
+    /// Order-independent hash over `length` 32-bit words starting at `data`.
+    /// Implemented in acceleration/src/unordered_hash.cpp.
+    size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed);
+}
+
+/// Hashes `length` elements of type T, viewed as a sequence of 32-bit words.
+///
+/// @param data   pointer to the first element
+/// @param length number of T elements (not the number of 32-bit words)
+/// @param seed   initial hash state
+template <typename T>
+size_t unordered_hash_32(const T* data, size_t length, size_t seed = 0)
+{
+    static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes.");
+    // Every element covers sizeof(T) / 4 words; scale the count so that all of them are hashed.
+    constexpr size_t words_per_element = sizeof(T) / sizeof(uint32_t);
+    // detail::unordered_hash_32 takes a mutable pointer, hence the const_cast.
+    uint32_t* words = const_cast<uint32_t*>(reinterpret_cast<const uint32_t*>(data));
+    return detail::unordered_hash_32(words, length * words_per_element, seed);
+}
+
+/// Overload for contiguous containers that provide data() and size(), e.g. std::vector.
+template <typename Container>
+size_t unordered_hash_32(const Container& container, size_t seed = 0)
+{
+    return unordered_hash_32(container.data(), container.size(), seed);
+}
\ No newline at end of file
diff --git a/acceleration/src/32x4/pre_unordered_hash.ispc b/acceleration/src/32x4/pre_unordered_hash.ispc
new file mode 100644
index 0000000..5bd60a8
--- /dev/null
+++ b/acceleration/src/32x4/pre_unordered_hash.ispc
@@ -0,0 +1,36 @@
+// Reduces arr[0 .. length) to four 64-bit values that do not depend on the
+// order of the elements and stores them in out[0..3]: the sum, the xor, the
+// product of (value | 1), and length itself.
+// The lane shuffles are written for a gang of exactly 4 program instances.
+export void pre_unordered_hash_32bit(uniform uint arr[], uniform uint64 length, uniform uint64 out[])
+{
+    const uniform int32 perm1[] = {1, 0, 3, 2}; // swap neighbouring lanes
+    const uniform int32 perm2[] = {0, 4, 4, 4}; // lane 0 of the first operand, then lane 0 of the second
+    const uniform int32 perm3[] = {0, 1, 4, 5}; // lanes 0-1 of the first operand, lanes 0-1 of the second
+
+    // Per-lane partial results; unsigned 64-bit arithmetic wraps on overflow.
+    varying uint64 a = 0, b = 0;
+    varying uint64 c = 1;
+    varying uint64 n = length;
+    foreach (i = 0...length)
+    {
+        varying uint val = arr[i];
+        a += val;
+        b ^= val;
+        c *= (val | 1); // the factor is always odd, so the wrapped product never becomes 0
+    }
+
+    // Combine the four lanes: neighbours first, then the two pairs, which
+    // leaves the full result in every lane.
+    a += shuffle(a, perm1[programIndex]);
+    b ^= shuffle(b, perm1[programIndex]);
+    c *= shuffle(c, perm1[programIndex]);
+    a += rotate(a, -2);
+    b ^= rotate(b, -2);
+    c *= rotate(c, -2);
+
+    // Pack {sum, xor, product, length} into lanes 0..3 and store them.
+    a = shuffle(a, b, perm2[programIndex]);
+    b = shuffle(c, n, perm2[programIndex]);
+    out[programIndex] = shuffle(a, b, perm3[programIndex]);
+}
\ No newline at end of file
diff --git 
a/acceleration/src/32x4/sort.ispc b/acceleration/src/32x4/sort.ispc
new file mode 100644
index 0000000..3cbb5a6
--- /dev/null
+++ b/acceleration/src/32x4/sort.ispc
@@ -0,0 +1,36 @@
+// Sorts the four unsigned 32-bit values in arr[0..3] in ascending order, in place,
+// with a 5-comparator sorting network: (0,2) (1,3), then (0,1) (2,3), then (1,2).
+// Written for a gang of exactly 4 program instances.
+export void sort_u32x4(uniform uint arr[])
+{
+    const uniform int32 perm1[] = {2, 3, 0, 1}; // partner lanes for (0,2) (1,3)
+    const uniform int32 perm2[] = {0, 1, 4, 5}; // minima to lanes 0-1, maxima to lanes 2-3
+    const uniform int32 perm3[] = {1, 0, 3, 2}; // partner lanes for (0,1) (2,3)
+    const uniform int32 perm4[] = {0, 4, 2, 6}; // minima to lanes 0 and 2, maxima to lanes 1 and 3
+    const uniform int32 perm5[] = {0, 2, 1, 3}; // partner lanes for (1,2); lanes 0 and 3 pair with themselves
+    const uniform int32 perm6[] = {0, 1, 6, 3}; // minimum to lane 1, maximum to lane 2
+
+    // min()/max() below compare as signed int32. Flipping the top bit maps the
+    // unsigned order onto the signed one, so values >= 2^31 sort last, not first.
+    varying int32 val = (int32)(arr[programIndex] ^ 0x80000000u);
+    varying int32 to_compare = shuffle(val, perm1[programIndex]);
+
+    varying int32 min_val = min(val, to_compare);
+    varying int32 max_val = max(val, to_compare);
+
+    val = shuffle(min_val, max_val, perm2[programIndex]);
+    to_compare = shuffle(val, perm3[programIndex]);
+
+    min_val = min(val, to_compare);
+    max_val = max(val, to_compare);
+
+    val = shuffle(min_val, max_val, perm4[programIndex]);
+    to_compare = shuffle(val, perm5[programIndex]);
+
+    min_val = min(val, to_compare);
+    max_val = max(val, to_compare);
+
+    val = shuffle(min_val, max_val, perm6[programIndex]);
+    // Undo the top-bit flip before storing.
+    arr[programIndex] = (uint)val ^ 0x80000000u;
+}
\ No newline at end of file
diff --git a/acceleration/src/unordered_hash.cpp b/acceleration/src/unordered_hash.cpp
new file mode 100644
index 0000000..d8d49a8
--- /dev/null
+++ b/acceleration/src/unordered_hash.cpp
@@ -0,0 +1,63 @@
+#include <unordered_hash.hpp>
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <pre_unordered_hash_ispc.h>
+
+namespace detail {
+    // 64-bit rotation built from a right shift. `shift` must be in 1..63 (the
+    // callers below use 31 and 27). NOTE: this rotates towards the low bits,
+    // whereas the MurmurHash3 round it resembles rotates left; it is kept as is
+    // so that hash values do not change.
+    static inline uint64_t rotr_64(uint64_t val, unsigned shift)
+    {
+        return (val >> shift) | (val << (64 - shift));
+    }
+
+    // Scrambles `h` with two odd multipliers and a rotation, then xors it into `seed`.
+    static inline uint64_t mix_last_64(uint64_t seed, uint64_t h)
+    {
+        h *= 0x87c37b91114253d5ULL;
+        h = rotr_64(h, 31);
+        h *= 0x4cf5ad432745937fULL;
+
+        return seed ^ h;
+    }
+
+    // mix_last_64 followed by an extra rotate and multiply-add step; used for
+    // every value except the last one mixed into the state.
+    static inline uint64_t mix_64(uint64_t seed, uint64_t h)
+    {
+        h = mix_last_64(seed, h);
+        h = rotr_64(h, 27);
+        return h * 5 + 0x52dce729;
+    }
+
+    // Final xor-shift / multiply rounds applied to the state.
+    static inline uint64_t avalanche_64(uint64_t h)
+    {
+        h ^= h >> 33;
+        h *= 0xff51afd7ed558ccdULL;
+        h ^= h >> 33;
+        h *= 0xc4ceb9fe1a85ec53ULL;
+        h ^= h >> 33;
+        return h;
+    }
+
+    // Hashes `length` 32-bit words so that the result does not depend on their
+    // order: the ISPC kernel reduces the words to four 64-bit values, which are
+    // then mixed into `seed`.
+    size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed)
+    {
+        // All mixing is done in uint64_t so that the shifts and constants are
+        // well defined whatever the width of size_t.
+        std::array<uint64_t, 4> intermediate{};
+        ispc::pre_unordered_hash_32bit(data, length, intermediate.data());
+        uint64_t state = seed;
+        state = mix_64(state, intermediate[0]);
+        state = mix_64(state, intermediate[1]);
+        state = mix_last_64(state, intermediate[2]);
+        return static_cast<size_t>(avalanche_64(state ^ intermediate[3]));
+    }
+}
\ No newline at end of file
diff --git a/acceleration/xmake.lua b/acceleration/xmake.lua
new file mode 100644
index 0000000..76ddd83
--- /dev/null
+++ b/acceleration/xmake.lua
@@ -0,0 +1,14 @@
+-- AVX-512 targets are left out for now: they perform worse and may cause
+-- instruction errors in some cases.
+local ispc_target_list = {"sse2-i32x4",
+                          "sse4.1-i32x4",
+                          "avx1-i32x4",
+                          "avx2-i32x4",
+                          "avx2vnni-i32x4"}
+
+target("cpu_acceleration_32x4")
+    set_kind("static")
+    add_rules("generate.ispc", {header_extension = "_ispc.h", arch = "x86-64", target_list = ispc_target_list})
+    add_includedirs("interface/", {public = true})
+    add_files("src/**.ispc")
+    add_files("src/**.cpp")
\ No newline at end of file
diff --git a/xmake/rules/ispc/xmake.lua b/xmake/rules/ispc/xmake.lua
new file mode 100644
index 0000000..4b753f8
--- /dev/null
+++ b/xmake/rules/ispc/xmake.lua
@@ -0,0 +1,110 @@
+-- Rule "generate.ispc": runs the ispc compiler on each *.ispc source of a target,
+-- producing an object file and a generated header for it.
+--
+-- Optional values read from the add_rules() configuration:
+--   header_extension  appended to the source base name to form the header name
+--   arch              passed on as --arch=<arch>
+--   target_list       list of ISA targets, passed on as --target=<a>,<b>,...
+rule("generate.ispc")
+    set_extensions(".ispc")
+
+    -- Create the directory for generated headers and export it as an include path.
+    on_config(function (target)
+        local headersdir = path.join(target:autogendir(), "rules", "ispc", "headers")
+        os.mkdir(headersdir)
+        target:add("includedirs", headersdir, {public = true})
+    end)
+
+    before_buildcmd_file(function (target, batchcmds, sourcefile_ispc, opt)
+        import("lib.detect.find_tool")
+        local ispc = assert(find_tool("ispc"), "ispc not found!")
+
+        local flags = {}
+        if target:values("ispc.flags") then
+            table.join2(flags, target:values("ispc.flags"))
+        end
+
+        if target:get("symbols") == "debug" then
+            table.insert(flags, "-g")
+        end
+
+        -- Map the target's optimization level onto an ispc -O switch.
+        local optimize = target:get("optimize")
+        if optimize == "none" then
+            table.insert(flags, "-O0")
+        elseif optimize == "fast" then
+            table.insert(flags, "-O2")
+        elseif optimize == "faster" or optimize == "fastest" then
+            table.insert(flags, "-O3")
+        elseif optimize == "smallest" then
+            table.insert(flags, "-O1")
+        end
+
+        local warnings = target:get("warnings")
+        if warnings == "none" then
+            table.insert(flags, "--woff")
+        elseif warnings == "error" then
+            table.insert(flags, "--werror")
+        end
+
+        if not target:is_plat("windows") then
+            table.insert(flags, "--pic")
+        end
+
+        local headersdir = path.join(target:autogendir(), "rules", "ispc", "headers")
+        local objectfile = target:objectfile(sourcefile_ispc)
+        local objectdir = path.directory(objectfile)
+        local headersfile
+        local header_extension = target:extraconf("rules", "generate.ispc", "header_extension")
+        if header_extension then
+            headersfile = path.join(headersdir, path.basename(sourcefile_ispc) .. header_extension)
+        else
+            headersfile = path.join(headersdir, path.filename(sourcefile_ispc) .. ".h")
+        end
+
+        local arch = target:extraconf("rules", "generate.ispc", "arch")
+        if arch then
+            table.insert(flags, "--arch=" .. arch)
+        end
+
+        -- Default to an empty list so the length checks below are safe when the
+        -- rule is used without target_list.
+        local target_list = target:extraconf("rules", "generate.ispc", "target_list") or {}
+        if #target_list > 0 then
+            table.insert(flags, "--target=" .. table.concat(target_list, ","))
+        end
+
+        table.insert(flags, "-o")
+        table.insert(flags, path(objectfile))
+        table.insert(flags, "-h")
+        table.insert(flags, path(headersfile))
+        table.insert(flags, "-I")
+        table.insert(flags, os.projectdir())
+        table.insert(flags, path(sourcefile_ispc))
+
+        -- Log the full command line.
+        local compile_flags = "ispc "
+        for _, flag in ipairs(flags) do
+            compile_flags = compile_flags .. tostring(flag) .. " "
+        end
+        print("try building ispc file: %s", compile_flags)
+        batchcmds:show_progress(opt.progress, "${color.build.object}compiling.ispc %s", sourcefile_ispc)
+        batchcmds:mkdir(objectdir)
+        batchcmds:vrunv(ispc.program, flags)
+
+        table.insert(target:objectfiles(), objectfile)
+        -- With more than one ISA target, also register one object per target. Its
+        -- name carries the part of the target name before the first '.' or '-'
+        -- (with "avx1" shortened to "avx") as a suffix.
+        if #target_list > 1 then
+            for _, target_item in ipairs(target_list) do
+                local obj_suffix = string.match(target_item, "^[^.-]+") or target_item
+                obj_suffix = string.gsub(obj_suffix, "avx1", "avx")
+                table.insert(target:objectfiles(), target:objectfile(sourcefile_ispc .. "_" .. obj_suffix))
+            end
+        end
+
+        batchcmds:add_depfiles(sourcefile_ispc, headersfile)
+        batchcmds:set_depmtime(os.mtime(objectfile))
+        batchcmds:set_depcache(target:dependfile(objectfile))
+    end)
\ No newline at end of file