6 changed files with 236 additions and 0 deletions
@ -0,0 +1,22 @@ |
|||
#pragma once |
|||
|
|||
#include <pre_unordered_hash_ispc.h> |
|||
|
|||
namespace detail {

// Non-template entry point implemented in the accompanying source file.
// `data` points at `length` 32-bit words; `seed` is folded into the result.
size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed);

} // namespace detail
|||
|
|||
template <typename T> |
|||
size_t unordered_hash_32(const T* data, size_t length, size_t seed = 0) |
|||
{ |
|||
static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes."); |
|||
return detail::unordered_hash_32((uint32_t*)data, length, seed); |
|||
} |
|||
|
|||
template <typename Container> |
|||
size_t unordered_hash_32(const Container& container, size_t seed = 0) |
|||
{ |
|||
using T = typename Container::value_type; |
|||
static_assert(sizeof(T) % sizeof(uint32_t) == 0, "Data type size must be multiple of 4 bytes."); |
|||
return detail::unordered_hash_32((uint32_t*)container.data(), container.size(), seed); |
|||
} |
|||
@ -0,0 +1,28 @@ |
|||
// Reduces `length` 32-bit words from `arr` into four 64-bit values:
//   out[0] = sum of all words
//   out[1] = XOR of all words
//   out[2] = product of (word | 1) over all words
//   out[3] = length
// The permutation tables below have four entries and use index 4 to address
// lane 0 of the second shuffle operand, so this is written for a 4-wide gang.
export void pre_unordered_hash_32bit(uniform uint arr[], uniform uint64 length, uniform uint64 out[])
{
    const uniform int32 swap_pairs[] = {1, 0, 3, 2};
    const uniform int32 lane0_then_second[] = {0, 4, 4, 4};
    const uniform int32 low_halves[] = {0, 1, 4, 5};

    // Per-lane partial results; the product starts at its neutral element.
    varying uint64 sum = 0;
    varying uint64 xored = 0;
    varying uint64 prod = 1;
    varying uint64 count = length;

    foreach (idx = 0...length)
    {
        varying uint word = arr[idx];
        sum += word;
        xored ^= word;
        prod *= (word | 1);
    }

    // Horizontal reduction, step 1: combine each lane with its neighbour.
    sum += shuffle(sum, swap_pairs[programIndex]);
    xored ^= shuffle(xored, swap_pairs[programIndex]);
    prod *= shuffle(prod, swap_pairs[programIndex]);

    // Step 2: combine with the lane two positions away, so every lane ends
    // up holding the full reduction.
    sum += rotate(sum, -2);
    xored ^= rotate(xored, -2);
    prod *= rotate(prod, -2);

    // Pack {sum, xor, ...} and {product, length, ...}, then interleave the
    // first two lanes of each into the four output slots.
    varying uint64 sum_and_xor = shuffle(sum, xored, lane0_then_second[programIndex]);
    varying uint64 prod_and_len = shuffle(prod, count, lane0_then_second[programIndex]);
    out[programIndex] = shuffle(sum_and_xor, prod_and_len, low_halves[programIndex]);
}
|||
@ -0,0 +1,30 @@ |
|||
// Sorts the four unsigned 32-bit values in `arr` in place, ascending.
//
// One value lives in each lane of a 4-wide gang. The code is a
// five-comparator sorting network run as three compare-exchange rounds:
// (0,2)(1,3), then (0,1)(2,3), then (1,2).
//
// Lanes are kept as `uint` so that min/max compare unsigned. Loading them
// into int32 would order values >= 2^31 before small values.
export void sort_u32x4(uniform uint arr[])
{
    const uniform int32 perm1[] = {2, 3, 0, 1}; // partner for round 1: lane ^ 2
    const uniform int32 perm2[] = {0, 1, 4, 5}; // mins to lanes 0-1, maxes to lanes 2-3
    const uniform int32 perm3[] = {1, 0, 3, 2}; // partner for round 2: lane ^ 1
    const uniform int32 perm4[] = {0, 4, 2, 6}; // min/max interleaved per pair
    const uniform int32 perm5[] = {0, 2, 1, 3}; // partner for round 3: swap middle lanes
    const uniform int32 perm6[] = {0, 1, 6, 3}; // min to lane 1, max to lane 2, ends untouched

    varying uint val = arr[programIndex];

    // Round 1: compare-exchange (0,2) and (1,3).
    varying uint to_compare = shuffle(val, perm1[programIndex]);
    varying uint min_val = min(val, to_compare);
    varying uint max_val = max(val, to_compare);
    val = shuffle(min_val, max_val, perm2[programIndex]);

    // Round 2: compare-exchange (0,1) and (2,3).
    to_compare = shuffle(val, perm3[programIndex]);
    min_val = min(val, to_compare);
    max_val = max(val, to_compare);
    val = shuffle(min_val, max_val, perm4[programIndex]);

    // Round 3: compare-exchange (1,2).
    to_compare = shuffle(val, perm5[programIndex]);
    min_val = min(val, to_compare);
    max_val = max(val, to_compare);
    val = shuffle(min_val, max_val, perm6[programIndex]);

    arr[programIndex] = val;
}
|||
@ -0,0 +1,47 @@ |
|||
#include <array> |
|||
|
|||
#include <cstddef> |
|||
#include <unordered_hash.hpp> |
|||
|
|||
namespace detail { |
|||
// Rotates the 64-bit value `val` by `shift` bit positions.
//
// Note: despite the name, bits move towards the least-significant end
// (a right rotation); bits shifted out at the bottom re-enter at the top.
//
// Both shift amounts are reduced modulo 64 so that `shift == 0` or
// `shift >= 64` cannot produce a shift by the full word width, which is
// undefined behaviour. Results for 1..63 are unchanged.
static inline size_t rtol_64(size_t val, size_t shift)
{
    shift &= 63;
    return (val >> shift) | (val << ((64 - shift) & 63));
}
|||
|
|||
static inline size_t mix_last_64(size_t seed, size_t h) |
|||
{ |
|||
h *= 0x87c37b91114253d5L; |
|||
h = rtol_64(h, 31); |
|||
h *= 0x4cf5ad432745937fL; |
|||
|
|||
return seed ^ h; |
|||
} |
|||
|
|||
static inline size_t mix_64(size_t seed, size_t h) |
|||
{ |
|||
h = mix_last_64(seed, h); |
|||
h = rtol_64(h, 27); |
|||
return h * 5 + 0x52dce729; |
|||
} |
|||
|
|||
// Final bit-mixing step: three xor-shifts by 33 interleaved with two
// multiplications, so that every input bit influences the whole result.
static inline size_t avalanche_64(size_t h)
{
    h = (h ^ (h >> 33)) * 0xff51afd7ed558ccdL;
    h = (h ^ (h >> 33)) * 0xc4ceb9fe1a85ec53L;
    return h ^ (h >> 33);
}
|||
|
|||
size_t unordered_hash_32(uint32_t* data, size_t length, size_t seed) |
|||
{ |
|||
std::array<size_t, 4> intermediate{}; |
|||
ispc::pre_unordered_hash_32bit(data, length, intermediate.data()); |
|||
seed = mix_64(seed, intermediate[0]); |
|||
seed = mix_64(seed, intermediate[1]); |
|||
seed = mix_last_64(seed, intermediate[2]); |
|||
return avalanche_64(seed ^ intermediate[3]); |
|||
} |
|||
} |
|||
@ -0,0 +1,13 @@ |
|||
-- ISPC targets to build, all 4 lanes wide.
-- AVX-512 is deliberately left out for now: it performs worse here and may
-- cause instruction errors in some cases.
local ispc_target_list = {
    "sse2-i32x4",
    "sse4.1-i32x4",
    "avx1-i32x4",
    "avx2-i32x4",
    "avx2vnni-i32x4",
}

-- Static library holding the ISPC kernels and their C++ wrappers.
target("cpu_acceleration_32x4")
    set_kind("static")
    add_rules("generate.ispc", {
        header_extension = "_ispc.h",
        arch = "x86-64",
        target_list = ispc_target_list,
    })
    add_includedirs("interface/", {public = true})
    add_files("src/**.ispc")
    add_files("src/**.cpp")
|||
@ -0,0 +1,96 @@ |
|||
-- Build rule for `.ispc` sources.
rule("generate.ispc")
    set_extensions(".ispc")

    -- Create the directory that receives the generated ISPC headers and
    -- publish it as an include directory of the target.
    on_config(function (target)
        local generated_headers = path.join(target:autogendir(), "rules", "ispc", "headers")
        os.mkdir(generated_headers)
        target:add("includedirs", generated_headers, {public = true})
    end)
|||
|
|||
-- Compile one `.ispc` source with the ISPC compiler.
--
-- The command line is assembled from the target's `ispc.flags` values, its
-- symbols/optimize/warnings settings and the rule's extra configuration
-- (`header_extension`, `arch`, `target_list`). The resulting object file is
-- appended to the target's object files. When more than one ISPC target is
-- requested, one additional per-target object (`<source>_<isa>`) is appended
-- for each entry.
--
-- @param target          target that owns the source file
-- @param batchcmds       batch command object the build steps are recorded on
-- @param sourcefile_ispc path of the `.ispc` source file
-- @param opt             build options; only `opt.progress` is read here
before_buildcmd_file(function (target, batchcmds, sourcefile_ispc, opt)
    import("lib.detect.find_tool")
    local ispc = assert(find_tool("ispc"), "ispc not found!")

    local flags = {}
    local extra_flags = target:values("ispc.flags")
    if extra_flags then
        table.join2(flags, extra_flags)
    end

    if target:get("symbols") == "debug" then
        table.insert(flags, "-g")
    end

    -- Map the target's optimize / warnings settings onto ISPC switches.
    local optimize_flags = {
        none = "-O0",
        fast = "-O2",
        faster = "-O3",
        fastest = "-O3",
        smallest = "-O1",
    }
    local optimize = target:get("optimize")
    if type(optimize) == "string" and optimize_flags[optimize] then
        table.insert(flags, optimize_flags[optimize])
    end

    local warning_flags = {none = "--woff", error = "--werror"}
    local warnings = target:get("warnings")
    if type(warnings) == "string" and warning_flags[warnings] then
        table.insert(flags, warning_flags[warnings])
    end

    if not target:is_plat("windows") then
        table.insert(flags, "--pic")
    end

    local headersdir = path.join(target:autogendir(), "rules", "ispc", "headers")
    local objectfile = target:objectfile(sourcefile_ispc)
    local objectdir = path.directory(objectfile)

    -- With a custom extension the header is named after the source's base
    -- name; otherwise ".h" is appended to the full file name.
    local headersfile
    local header_extension = target:extraconf("rules", "generate.ispc", "header_extension")
    if header_extension then
        headersfile = path.join(headersdir, path.basename(sourcefile_ispc) .. header_extension)
    else
        headersfile = path.join(headersdir, path.filename(sourcefile_ispc) .. ".h")
    end

    local arch = target:extraconf("rules", "generate.ispc", "arch")
    if arch then
        table.insert(flags, "--arch=" .. arch)
    end

    -- `target_list` is optional; join with commas without a trailing one.
    local target_list = target:extraconf("rules", "generate.ispc", "target_list")
    if target_list and #target_list > 0 then
        table.insert(flags, "--target=" .. table.concat(target_list, ","))
    end

    table.insert(flags, "-o")
    table.insert(flags, path(objectfile))
    table.insert(flags, "-h")
    table.insert(flags, path(headersfile))
    table.insert(flags, "-I")
    table.insert(flags, os.projectdir())
    table.insert(flags, path(sourcefile_ispc))

    -- Flags may hold path objects, so stringify each one before joining.
    local printable = {}
    for _, flag in ipairs(flags) do
        table.insert(printable, tostring(flag))
    end
    local compile_flags = "ispc " .. table.concat(printable, " ") .. " "
    print("try building ispc file: %s", compile_flags)

    batchcmds:show_progress(opt.progress, "${color.build.object}compiling.ispc %s", sourcefile_ispc)
    batchcmds:mkdir(objectdir)
    batchcmds:vrunv(ispc.program, flags)

    table.insert(target:objectfiles(), objectfile)
    if target_list and #target_list > 1 then
        for _, target_item in ipairs(target_list) do
            -- The object suffix is the ISA part of the target name, i.e.
            -- everything before the first "." or "-" ("sse4.1-i32x4" gives
            -- "sse4"), with "avx1" spelled "avx". A name without a
            -- separator is used as a whole.
            local sep = string.find(target_item, "[.-]")
            local obj_suffix = sep and string.sub(target_item, 1, sep - 1) or target_item
            obj_suffix = string.gsub(obj_suffix, "avx1", "avx")
            table.insert(target:objectfiles(), target:objectfile(sourcefile_ispc .. "_" .. obj_suffix))
        end
    end

    batchcmds:add_depfiles(sourcefile_ispc, headersfile)
    batchcmds:set_depmtime(os.mtime(objectfile))
    batchcmds:set_depcache(target:dependfile(objectfile))
end)
|||
Loading…
Reference in new issue