6 changed files with 236 additions and 0 deletions
@ -0,0 +1,22 @@ |
|||||
|
#pragma once |
||||
|
|
||||
|
#include <pre_unordered_hash_ispc.h> |
||||
|
|
||||
|
namespace detail {

/// Backend entry point: hashes `length` 32-bit words starting at `data`,
/// combined with `seed`. Defined in the accompanying source file.
size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed);

}

/// Hashes `length` elements of type `T` starting at `data`.
///
/// The elements are reinterpreted as a flat sequence of 32-bit words, so
/// `sizeof(T)` must be a multiple of 4 bytes.
///
/// @param data   Pointer to the first element; the data is only read here.
/// @param length Number of elements of type `T` (not bytes, not words).
/// @param seed   Optional seed mixed into the result.
/// @return       The hash value produced by the backend.
template <typename T>
size_t unordered_hash_32(const T* data, size_t length, size_t seed = 0)
{
    static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes.");

    // The backend counts 32-bit words, so scale the element count; without
    // this only the first `length` words of wider element types get hashed.
    constexpr size_t words_per_element = sizeof(T) / sizeof(uint32_t);

    // The backend signature takes a non-const pointer; make the const removal
    // explicit instead of hiding it in a C-style cast.
    uint32_t* words = const_cast<uint32_t*>(reinterpret_cast<const uint32_t*>(data));
    return detail::unordered_hash_32(words, length * words_per_element, seed);
}

/// Hashes every element of a contiguous container (anything exposing
/// `value_type`, `data()` and `size()`).
///
/// The defaulted second template parameter removes this overload from
/// overload resolution for types without a nested `value_type` (e.g. raw
/// pointers), so `unordered_hash_32(ptr, count)` always reaches the pointer
/// overload instead of failing inside this body.
///
/// @param container Container whose elements are hashed.
/// @param seed      Optional seed mixed into the result.
/// @return          The hash value produced by the backend.
template <typename Container, typename T = typename Container::value_type>
size_t unordered_hash_32(const Container& container, size_t seed = 0)
{
    static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes.");

    // Same word-count scaling as the pointer overload.
    constexpr size_t words_per_element = sizeof(T) / sizeof(uint32_t);

    uint32_t* words = const_cast<uint32_t*>(reinterpret_cast<const uint32_t*>(container.data()));
    return detail::unordered_hash_32(words, container.size() * words_per_element, seed);
}
||||
@ -0,0 +1,28 @@ |
|||||
|
// Folds `length` 32-bit words from `arr` into four 64-bit values written to
// out[0..3]: the sum of the words, their xor, the product of (word | 1), and
// `length` itself. The permutation tables below have four entries and are
// indexed by programIndex, so the routine is written for a 4-wide gang.
export void pre_unordered_hash_32bit(uniform uint arr[], uniform uint64 length, uniform uint64 out[])
{
    const uniform int32 swap_neighbours[] = {1, 0, 3, 2};
    const uniform int32 first_then_second[] = {0, 4, 4, 4};
    const uniform int32 low_pair_of_each[] = {0, 1, 4, 5};

    varying uint64 sum_acc = 0;
    varying uint64 xor_acc = 0;
    varying uint64 mul_acc = 1;
    varying uint64 count = length;

    // Each lane accumulates its own share of the input.
    foreach (i = 0...length)
    {
        varying uint word = arr[i];
        sum_acc += word;
        xor_acc ^= word;
        // Forcing the low bit keeps every factor odd, so the running product
        // can never collapse to zero.
        mul_acc *= (word | 1);
    }

    // Cross-lane reduction, step 1: combine each lane with its neighbour.
    varying int32 neighbour = swap_neighbours[programIndex];
    sum_acc += shuffle(sum_acc, neighbour);
    xor_acc ^= shuffle(xor_acc, neighbour);
    mul_acc *= shuffle(mul_acc, neighbour);

    // Step 2: combine with the lane two positions away; afterwards every lane
    // holds the full result.
    sum_acc += rotate(sum_acc, -2);
    xor_acc ^= rotate(xor_acc, -2);
    mul_acc *= rotate(mul_acc, -2);

    // Pack {sum, xor, product, count} into lanes 0..3 and store them.
    varying int32 pick = first_then_second[programIndex];
    varying uint64 sum_xor = shuffle(sum_acc, xor_acc, pick);
    varying uint64 mul_count = shuffle(mul_acc, count, pick);
    out[programIndex] = shuffle(sum_xor, mul_count, low_pair_of_each[programIndex]);
}
||||
@ -0,0 +1,30 @@ |
|||||
|
// Sorts the four unsigned 32-bit values in arr[0..3] in place, ascending.
//
// Implemented as a 5-comparator sorting network executed across a 4-wide
// gang: compare-exchange (0,2)(1,3), then (0,1)(2,3), then (1,2). Every
// permutation table has four entries indexed by programIndex.
//
// The lanes are kept as unsigned values so that min/max compare them as
// unsigned; loading them into int32 would order inputs >= 2^31 before
// smaller values.
export void sort_u32x4(uniform uint arr[])
{
    const uniform int32 perm1[] = {2, 3, 0, 1};
    const uniform int32 perm2[] = {0, 1, 4, 5};
    const uniform int32 perm3[] = {1, 0, 3, 2};
    const uniform int32 perm4[] = {0, 4, 2, 6};
    const uniform int32 perm5[] = {0, 2, 1, 3};
    const uniform int32 perm6[] = {0, 1, 6, 3};

    varying uint val = arr[programIndex];

    // Stage 1: compare-exchange lanes (0,2) and (1,3).
    varying uint to_compare = shuffle(val, perm1[programIndex]);
    varying uint min_val = min(val, to_compare);
    varying uint max_val = max(val, to_compare);
    val = shuffle(min_val, max_val, perm2[programIndex]);

    // Stage 2: compare-exchange lanes (0,1) and (2,3).
    to_compare = shuffle(val, perm3[programIndex]);
    min_val = min(val, to_compare);
    max_val = max(val, to_compare);
    val = shuffle(min_val, max_val, perm4[programIndex]);

    // Stage 3: compare-exchange the middle lanes (1,2); lanes 0 and 3 are
    // compared with themselves and stay unchanged.
    to_compare = shuffle(val, perm5[programIndex]);
    min_val = min(val, to_compare);
    max_val = max(val, to_compare);
    val = shuffle(min_val, max_val, perm6[programIndex]);

    arr[programIndex] = val;
}
||||
@ -0,0 +1,47 @@ |
|||||
|
#include <array> |
||||
|
|
||||
|
#include <cstddef> |
||||
|
#include <unordered_hash.hpp> |
||||
|
|
||||
|
namespace detail { |
||||
|
/// Rotates the 64-bit value `val` to the right by `shift` bits.
///
/// Note: despite the "rtol" name, the bits move towards the low end. The
/// direction is deliberately kept, because changing it would change every
/// hash value built on top of this helper.
///
/// @param val   Value to rotate.
/// @param shift Rotation amount; it is reduced modulo 64.
/// @return      The rotated value.
static inline size_t rtol_64(size_t val, size_t shift)
{
    // Mask both shift counts: a shift by 64 (which happens for shift == 0 or
    // shift == 64 in the unmasked formula) is undefined behaviour.
    shift &= 63;
    return (val >> shift) | (val << ((64 - shift) & 63));
}
||||
|
|
||||
|
static inline size_t mix_last_64(size_t seed, size_t h) |
||||
|
{ |
||||
|
h *= 0x87c37b91114253d5L; |
||||
|
h = rtol_64(h, 31); |
||||
|
h *= 0x4cf5ad432745937fL; |
||||
|
|
||||
|
return seed ^ h; |
||||
|
} |
||||
|
|
||||
|
static inline size_t mix_64(size_t seed, size_t h) |
||||
|
{ |
||||
|
h = mix_last_64(seed, h); |
||||
|
h = rtol_64(h, 27); |
||||
|
return h * 5 + 0x52dce729; |
||||
|
} |
||||
|
|
||||
|
/// Final bit mixer: three xor-shift-by-33 steps interleaved with two 64-bit
/// multiplications.
///
/// @param h Value to finalize.
/// @return  The mixed value (0 maps to 0).
static inline size_t avalanche_64(size_t h)
{
    h = (h ^ (h >> 33)) * 0xff51afd7ed558ccdULL;
    h = (h ^ (h >> 33)) * 0xc4ceb9fe1a85ec53ULL;
    return h ^ (h >> 33);
}
||||
|
|
||||
|
size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed) |
||||
|
{ |
||||
|
std::array<size_t, 4> intermediate{}; |
||||
|
ispc::pre_unordered_hash_32bit(data, length, intermediate.data()); |
||||
|
seed = mix_64(seed, intermediate[0]); |
||||
|
seed = mix_64(seed, intermediate[1]); |
||||
|
seed = mix_last_64(seed, intermediate[2]); |
||||
|
return avalanche_64(seed ^ intermediate[3]); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,13 @@ |
|||||
|
-- AVX-512 targets are intentionally left out for now: they perform worse
-- here and may cause instruction errors in some cases.
local simd_targets = {
    "sse2-i32x4",
    "sse4.1-i32x4",
    "avx1-i32x4",
    "avx2-i32x4",
    "avx2vnni-i32x4"
}

-- Static library built from the ISPC kernels and their C++ wrappers.
target("cpu_acceleration_32x4")
    set_kind("static")
    add_rules("generate.ispc", {
        header_extension = "_ispc.h",
        arch = "x86-64",
        target_list = simd_targets
    })
    add_includedirs("interface/", {public = true})
    add_files("src/**.ispc")
    add_files("src/**.cpp")
||||
@ -0,0 +1,96 @@ |
|||||
|
-- Custom rule that handles `.ispc` source files.
rule("generate.ispc")
    set_extensions(".ispc")

    -- At configuration time, create the directory that will receive the
    -- generated headers and add it as a public include directory of the
    -- target.
    on_config(function (target)
        local generated_headers = path.join(target:autogendir(), "rules", "ispc", "headers")
        os.mkdir(generated_headers)
        target:add("includedirs", generated_headers, {public = true})
    end)
||||
|
|
||||
|
-- Compile one `.ispc` source with the ISPC compiler before the regular build
-- commands run, and register the resulting object file(s) on the target.
--
-- Rule options read through `add_rules("generate.ispc", {...})`:
--   header_extension  suffix appended to the source basename for the
--                     generated header (default: "<filename>.h")
--   arch              value passed as `--arch=`
--   target_list       one target name or a list of them, passed as a
--                     comma-separated `--target=` list
--
-- @param target           the xmake target being built
-- @param batchcmds        batch command collector for this file
-- @param sourcefile_ispc  path of the `.ispc` source
-- @param opt              build options (used for progress reporting)
before_buildcmd_file(function (target, batchcmds, sourcefile_ispc, opt)
    import("lib.detect.find_tool")
    local ispc = assert(find_tool("ispc"), "ispc not found!")

    -- Start from the user-supplied flags, if any.
    local flags = {}
    if target:values("ispc.flags") then
        table.join2(flags, target:values("ispc.flags"))
    end

    if target:get("symbols") == "debug" then
        table.insert(flags, "-g")
    end

    -- Map the target's optimization level to the matching ISPC switch.
    local optimize = target:get("optimize")
    if optimize == "none" then
        table.insert(flags, "-O0")
    elseif optimize == "fast" then
        table.insert(flags, "-O2")
    elseif optimize == "faster" or optimize == "fastest" then
        table.insert(flags, "-O3")
    elseif optimize == "smallest" then
        table.insert(flags, "-O1")
    end

    local warnings = target:get("warnings")
    if warnings == "none" then
        table.insert(flags, "--woff")
    elseif warnings == "error" then
        table.insert(flags, "--werror")
    end

    if not target:is_plat("windows") then
        table.insert(flags, "--pic")
    end

    local headersdir = path.join(target:autogendir(), "rules", "ispc", "headers")
    local objectfile = target:objectfile(sourcefile_ispc)
    local objectdir = path.directory(objectfile)

    local headersfile
    local header_extension = target:extraconf("rules", "generate.ispc", "header_extension")
    if header_extension then
        headersfile = path.join(headersdir, path.basename(sourcefile_ispc) .. header_extension)
    else
        headersfile = path.join(headersdir, path.filename(sourcefile_ispc) .. ".h")
    end

    local arch = target:extraconf("rules", "generate.ispc", "arch")
    if arch then
        table.insert(flags, "--arch=" .. arch)
    end

    -- Normalize `target_list` to a (possibly empty) list so that a missing
    -- option or a single string never breaks the code below.
    local target_list = target:extraconf("rules", "generate.ispc", "target_list")
    if target_list == nil then
        target_list = {}
    elseif type(target_list) ~= "table" then
        target_list = {target_list}
    end
    if #target_list > 0 then
        -- table.concat avoids the trailing comma a manual loop would leave.
        table.insert(flags, "--target=" .. table.concat(target_list, ","))
    end

    table.insert(flags, "-o")
    table.insert(flags, path(objectfile))
    table.insert(flags, "-h")
    table.insert(flags, path(headersfile))
    table.insert(flags, "-I")
    table.insert(flags, os.projectdir())
    table.insert(flags, path(sourcefile_ispc))

    -- Echo the command line; flags may be path objects, hence tostring().
    local shown = {}
    for _, flag in ipairs(flags) do
        shown[#shown + 1] = tostring(flag)
    end
    print("try building ispc file: %s", "ispc " .. table.concat(shown, " "))

    batchcmds:show_progress(opt.progress, "${color.build.object}compiling.ispc %s", sourcefile_ispc)
    batchcmds:mkdir(objectdir)
    batchcmds:vrunv(ispc.program, flags)

    table.insert(target:objectfiles(), objectfile)
    if #target_list > 1 then
        -- With several targets, one extra object per target is registered,
        -- named after the part of the target name before the first "." or
        -- "-" (with "avx1" spelled "avx").
        for _, target_item in ipairs(target_list) do
            local obj_suffix = target_item:match("^[^.%-]+") or target_item
            obj_suffix = obj_suffix:gsub("avx1", "avx")
            table.insert(target:objectfiles(), target:objectfile(sourcefile_ispc .. "_" .. obj_suffix))
        end
    end

    batchcmds:add_depfiles(sourcefile_ispc, headersfile)
    batchcmds:set_depmtime(os.mtime(objectfile))
    batchcmds:set_depcache(target:dependfile(objectfile))
end)
||||
Loading…
Reference in new issue