Browse Source

CPU acceleration module (implemented with ISPC)

V2-integral-fix
Zhicheng Wang 1 week ago
parent
commit
51149bf9f9
  1. 22
      acceleration/interface/unordered_hash.hpp
  2. 28
      acceleration/src/32x4/pre_unordered_hash.ispc
  3. 30
      acceleration/src/32x4/sort.ispc
  4. 47
      acceleration/src/unordered_hash.cpp
  5. 13
      acceleration/xmake.lua
  6. 96
      xmake/rules/ispc/xmake.lua

22
acceleration/interface/unordered_hash.hpp

@ -0,0 +1,22 @@
#pragma once
#include <pre_unordered_hash_ispc.h>
// Non-template back end shared by the public overloads in this header.
namespace detail {
// Hashes `length` 32-bit words starting at `data` and mixes `seed` into the
// result. `length` is a count of uint32_t words, not bytes.
// The pointer is non-const only because that is the back end's signature;
// NOTE(review): presumably the buffer is never written to - confirm against
// the definition.
size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed);
}
/// Hashes a raw array of `length` elements of type T, viewed as 32-bit words.
///
/// @param data    pointer to the first element; the buffer is reinterpreted
///                as a sequence of uint32_t words.
/// @param length  number of T elements (not bytes, not words).
/// @param seed    optional seed mixed into the result.
/// @return the value produced by detail::unordered_hash_32 for those words.
template <typename T>
size_t unordered_hash_32(const T* data, size_t length, size_t seed = 0)
{
    static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes.");
    // The back end counts 32-bit words, so an element wider than 4 bytes
    // contributes several words. Without this scaling only the first
    // `length` words (a fraction of the buffer) would be hashed.
    constexpr size_t words_per_element = sizeof(T) / sizeof(uint32_t);
    // The back end takes a non-const pointer, so const is dropped here.
    return detail::unordered_hash_32((uint32_t*)data, length * words_per_element, seed);
}
/// Hashes the contents of a contiguous container, viewed as 32-bit words.
///
/// `Container` must expose `value_type`, `data()` and `size()`. `value_type`
/// is resolved in the template parameter list so that types without it
/// (e.g. a plain pointer passed together with a length) are discarded during
/// overload resolution instead of producing a hard error inside the body.
///
/// @param container  container whose elements are hashed.
/// @param seed       optional seed mixed into the result.
/// @return the value produced by detail::unordered_hash_32 for the
///         container's storage.
template <typename Container, typename T = typename Container::value_type>
size_t unordered_hash_32(const Container& container, size_t seed = 0)
{
    static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes.");
    // size() counts elements, while the back end counts 32-bit words.
    constexpr size_t words_per_element = sizeof(T) / sizeof(uint32_t);
    // The back end takes a non-const pointer, so const is dropped here.
    return detail::unordered_hash_32((uint32_t*)container.data(), container.size() * words_per_element, seed);
}

28
acceleration/src/32x4/pre_unordered_hash.ispc

@ -0,0 +1,28 @@
// Reduces arr[0 .. length) to four 64-bit values and stores them in out[0..3]:
//   out[0] = wrapping sum of all elements
//   out[1] = xor of all elements
//   out[2] = wrapping product of (element | 1)
//   out[3] = length
// All three reductions are commutative, so the result does not depend on the
// order of the elements in arr.
// The permutation tables below have four entries and use indices 4 and 5 to
// address the second shuffle operand, i.e. they assume a 4-wide gang
// (programCount == 4). `out` must have room for programCount entries.
export void pre_unordered_hash_32bit(uniform uint arr[], uniform uint64 length, uniform uint64 out[])
{
// perm1: swap neighbouring lanes (0<->1, 2<->3).
const uniform int32 perm1[] = {1, 0, 3, 2};
// perm2: lane 0 from the first operand, lanes 1..3 from lane 0 of the second.
const uniform int32 perm2[] = {0, 4, 4, 4};
// perm3: lanes 0,1 from the first operand, lanes 2,3 from lanes 0,1 of the second.
const uniform int32 perm3[] = {0, 1, 4, 5};
// Per-lane accumulators: a = sum, b = xor, c = product (starts at the
// multiplicative identity), n = length broadcast to every lane.
varying uint64 a = 0, b = 0;
varying uint64 c = 1;
varying uint64 n = length;
foreach (i = 0...length)
{
varying uint val = arr[i];
a += val;
b ^= val;
// Forcing the low bit keeps every factor odd, so a zero (or even) element
// cannot collapse the running product to zero.
c *= (val | 1);
}
// Cross-lane reduction, step 1: combine each lane with its neighbour.
a += shuffle(a, perm1[programIndex]);
b ^= shuffle(b, perm1[programIndex]);
c *= shuffle(c, perm1[programIndex]);
// Step 2: combine with the lane two positions away; afterwards every lane
// holds the full reduction (assuming 4 lanes).
a += rotate(a, -2);
b ^= rotate(b, -2);
c *= rotate(c, -2);
// Pack the results: a becomes {sum, xor, xor, xor}, b becomes {prod, n, n, n},
// and the final shuffle writes {sum, xor, prod, n}.
a = shuffle(a, b, perm2[programIndex]);
b = shuffle(c, n, perm2[programIndex]);
out[programIndex] = shuffle(a, b, perm3[programIndex]);
}

30
acceleration/src/32x4/sort.ispc

@ -0,0 +1,30 @@
// Lane-wise unsigned minimum of two values carried in int32 lanes.
static inline varying int32 umin_lanes(varying int32 a, varying int32 b)
{
    return (int32)min((unsigned int)a, (unsigned int)b);
}

// Lane-wise unsigned maximum of two values carried in int32 lanes.
static inline varying int32 umax_lanes(varying int32 a, varying int32 b)
{
    return (int32)max((unsigned int)a, (unsigned int)b);
}

// Sorts the four unsigned 32-bit values arr[0..3] in place, ascending, using
// a three-stage compare-exchange network:
//   stage 1: (0,2) (1,3)
//   stage 2: (0,1) (2,3)
//   stage 3: (1,2)
// The permutation tables assume a 4-wide gang (programCount == 4); indices
// >= 4 in the two-operand shuffles select lanes of the second operand.
// Lanes are kept as int32 for the shuffles, but every comparison is done on
// the unsigned reinterpretation so values >= 2^31 sort after small values
// instead of being treated as negative.
export void sort_u32x4(uniform uint arr[])
{
    const uniform int32 partner_stage1[] = {2, 3, 0, 1};
    const uniform int32 select_stage1[] = {0, 1, 4, 5};
    const uniform int32 partner_stage2[] = {1, 0, 3, 2};
    const uniform int32 select_stage2[] = {0, 4, 2, 6};
    const uniform int32 partner_stage3[] = {0, 2, 1, 3};
    const uniform int32 select_stage3[] = {0, 1, 6, 3};

    varying int32 val = arr[programIndex];

    // Stage 1: lanes 0,1 keep the minima, lanes 2,3 the maxima.
    varying int32 other = shuffle(val, partner_stage1[programIndex]);
    varying int32 lo = umin_lanes(val, other);
    varying int32 hi = umax_lanes(val, other);
    val = shuffle(lo, hi, select_stage1[programIndex]);

    // Stage 2: even lanes keep the minimum of their pair, odd lanes the maximum.
    other = shuffle(val, partner_stage2[programIndex]);
    lo = umin_lanes(val, other);
    hi = umax_lanes(val, other);
    val = shuffle(lo, hi, select_stage2[programIndex]);

    // Stage 3: only the middle pair (1,2) is exchanged; lanes 0 and 3 are
    // compared with themselves and stay put.
    other = shuffle(val, partner_stage3[programIndex]);
    lo = umin_lanes(val, other);
    hi = umax_lanes(val, other);
    val = shuffle(lo, hi, select_stage3[programIndex]);

    arr[programIndex] = val;
}

47
acceleration/src/unordered_hash.cpp

@ -0,0 +1,47 @@
#include <array>
#include <cstddef>
#include <unordered_hash.hpp>
namespace detail {
// Rotates the 64-bit value `val` by `shift` bit positions.
// NOTE(review): despite the "rtol" name this moves bits toward the low end
// (a right rotation). The direction is kept as-is because changing it would
// alter every hash value produced by this file.
// Both shift amounts are reduced modulo 64 so that a shift of 0 (or 64)
// returns `val` unchanged instead of evaluating the undefined `val << 64`.
static inline size_t rtol_64(size_t val, size_t shift)
{
    shift &= 63;
    return (val >> shift) | (val << ((64 - shift) & 63));
}
// Scrambles `h` (multiply, rotate by 31, multiply) and folds the result into
// `seed` with xor. Unlike mix_64, no further rotation or addition is applied
// afterwards.
// NOTE(review): the two multipliers appear to be the c1/c2 constants of
// MurmurHash3 x64-128 - confirm.
static inline size_t mix_last_64(size_t seed, size_t h)
{
    const size_t pre_rotate = h * 0x87c37b91114253d5L;
    const size_t scrambled = rtol_64(pre_rotate, 31) * 0x4cf5ad432745937fL;
    return seed ^ scrambled;
}
// Absorbs one 64-bit word `h` into the running state `seed`: applies
// mix_last_64, rotates the result by 27 and finishes with `x * 5 + 0x52dce729`.
static inline size_t mix_64(size_t seed, size_t h)
{
    const size_t folded = mix_last_64(seed, h);
    return rtol_64(folded, 27) * 5 + 0x52dce729;
}
// Finalizer: three xor-shift-by-33 steps interleaved with two 64-bit
// multiplications. Maps 0 to 0.
static inline size_t avalanche_64(size_t h)
{
    h = (h ^ (h >> 33)) * 0xff51afd7ed558ccdL;
    h = (h ^ (h >> 33)) * 0xc4ceb9fe1a85ec53L;
    return h ^ (h >> 33);
}
// Hashes `length` 32-bit words starting at `data`, mixing in `seed`.
//
// The heavy lifting is done by the ISPC kernel pre_unordered_hash_32bit,
// which fills a 4-entry buffer; the four entries are then folded into the
// seed one after another and the result is passed through avalanche_64.
//
// @param data    first word of the buffer handed to the kernel.
// @param length  number of 32-bit words to hash.
// @param seed    initial state.
// @return the finalized 64-bit hash.
size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed)
{
    // The kernel's output parameter is `uniform uint64 out[]`, so the buffer
    // is declared with the fixed-width type rather than size_t: the two are
    // not the same type on every platform (presumably the generated header
    // exposes it as uint64_t* - confirm against pre_unordered_hash_ispc.h).
    std::array<uint64_t, 4> intermediate{};
    ispc::pre_unordered_hash_32bit(data, length, intermediate.data());
    seed = mix_64(seed, intermediate[0]);
    seed = mix_64(seed, intermediate[1]);
    seed = mix_last_64(seed, intermediate[2]);
    // The last entry is xored in directly, just before the finalizer.
    return avalanche_64(seed ^ intermediate[3]);
}
}

13
acceleration/xmake.lua

@ -0,0 +1,13 @@
-- ISPC targets compiled into the library (all 4-wide, 32-bit lanes).
-- AVX-512 is left out for now: it performs worse and may cause instruction
-- errors in some cases.
local ispc_targets = {
    "sse2-i32x4",
    "sse4.1-i32x4",
    "avx1-i32x4",
    "avx2-i32x4",
    "avx2vnni-i32x4",
}

-- Static library bundling the ISPC kernels and their C++ wrappers.
target("cpu_acceleration_32x4")
    set_kind("static")
    add_rules("generate.ispc", {
        header_extension = "_ispc.h",
        arch = "x86-64",
        target_list = ispc_targets,
    })
    add_includedirs("interface/", {public = true})
    add_files("src/**.ispc", "src/**.cpp")

96
xmake/rules/ispc/xmake.lua

@ -0,0 +1,96 @@
-- Rule "generate.ispc": compiles *.ispc sources with the ISPC compiler and
-- adds the resulting object files, plus a directory of generated headers, to
-- the owning target.
--
-- Optional settings passed through add_rules("generate.ispc", {...}):
--   header_extension  suffix appended to the source basename to name the
--                     generated header; when absent the header is named
--                     "<source filename>.h"
--   arch              forwarded to the compiler as --arch=<arch>
--   target_list       ISPC target name(s), forwarded as --target=a,b,c
-- Additional compiler flags are read from the "ispc.flags" target values.
rule("generate.ispc")
    set_extensions(".ispc")

    -- Create the generated-header directory up front and expose it publicly
    -- so that dependants can include the ISPC headers.
    on_config(function (target)
        local headersdir = path.join(target:autogendir(), "rules", "ispc", "headers")
        os.mkdir(headersdir)
        target:add("includedirs", headersdir, {public = true})
    end)

    before_buildcmd_file(function (target, batchcmds, sourcefile_ispc, opt)
        import("lib.detect.find_tool")
        local ispc = assert(find_tool("ispc"), "ispc not found!")

        -- User-supplied flags come first, followed by flags derived from the
        -- target's own settings.
        local flags = {}
        if target:values("ispc.flags") then
            table.join2(flags, target:values("ispc.flags"))
        end

        if target:get("symbols") == "debug" then
            table.insert(flags, "-g")
        end

        local optimize = target:get("optimize")
        if optimize == "none" then
            table.insert(flags, "-O0")
        elseif optimize == "fast" then
            table.insert(flags, "-O2")
        elseif optimize == "faster" or optimize == "fastest" then
            table.insert(flags, "-O3")
        elseif optimize == "smallest" then
            table.insert(flags, "-O1")
        end

        local warnings = target:get("warnings")
        if warnings == "none" then
            table.insert(flags, "--woff")
        elseif warnings == "error" then
            table.insert(flags, "--werror")
        end

        if not target:is_plat("windows") then
            table.insert(flags, "--pic")
        end

        local headersdir = path.join(target:autogendir(), "rules", "ispc", "headers")
        local objectfile = target:objectfile(sourcefile_ispc)
        local objectdir = path.directory(objectfile)

        local headersfile
        local header_extension = target:extraconf("rules", "generate.ispc", "header_extension")
        if header_extension then
            headersfile = path.join(headersdir, path.basename(sourcefile_ispc) .. header_extension)
        else
            headersfile = path.join(headersdir, path.filename(sourcefile_ispc) .. ".h")
        end

        local arch = target:extraconf("rules", "generate.ispc", "arch")
        if arch then
            table.insert(flags, "--arch=" .. arch)
        end

        -- Normalise target_list to a (possibly empty) array so the code below
        -- never has to deal with nil or a bare string.
        local target_list = target:extraconf("rules", "generate.ispc", "target_list") or {}
        if type(target_list) == "string" then
            target_list = {target_list}
        end
        if #target_list > 0 then
            -- table.concat avoids the trailing comma a manual join would leave.
            table.insert(flags, "--target=" .. table.concat(target_list, ","))
        end

        table.insert(flags, "-o")
        table.insert(flags, path(objectfile))
        table.insert(flags, "-h")
        table.insert(flags, path(headersfile))
        table.insert(flags, "-I")
        table.insert(flags, os.projectdir())
        table.insert(flags, path(sourcefile_ispc))

        -- Human-readable copy of the command line, for the log message only.
        local parts = {"ispc"}
        for _, flag in ipairs(flags) do
            parts[#parts + 1] = tostring(flag)
        end
        local compile_flags = table.concat(parts, " ") .. " "
        print("try building ispc file: %s", compile_flags)

        batchcmds:show_progress(opt.progress, "${color.build.object}compiling.ispc %s", sourcefile_ispc)
        batchcmds:mkdir(objectdir)
        batchcmds:vrunv(ispc.program, flags)
        table.insert(target:objectfiles(), objectfile)

        -- With more than one target, one extra object per target is
        -- registered, named after the ISA part of the target name
        -- ("sse4.1-i32x4" -> "sse4", "avx1-i32x4" -> "avx").
        -- NOTE(review): this mirrors the object naming ISPC is expected to
        -- use for multi-target builds - confirm against the ISPC version in use.
        if #target_list > 1 then
            for _, target_item in ipairs(target_list) do
                local sep = string.find(target_item, "[.-]")
                -- Fall back to the whole name when it has no '.' or '-'.
                local obj_suffix = sep and string.sub(target_item, 1, sep - 1) or target_item
                obj_suffix = string.gsub(obj_suffix, "avx1", "avx")
                table.insert(target:objectfiles(), target:objectfile(sourcefile_ispc .. "_" .. obj_suffix))
            end
        end

        batchcmds:add_depfiles(sourcefile_ispc, headersfile)
        batchcmds:set_depmtime(os.mtime(objectfile))
        batchcmds:set_depcache(target:dependfile(objectfile))
    end)
Loading…
Cancel
Save