16#ifndef TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER
17#define TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER
33namespace sort_strings_detail {
38template <
typename Type>
40std::string
to_binary(Type v,
const size_t width = (8 *
sizeof(Type))) {
41 std::string str(width,
' ');
42 for (
size_t i = 0; i < width; i++) {
43 str[width - i - 1] = (v & 1) ?
'1' :
'0';
50template <
size_t TreeBits>
52 static const bool debug =
false;
63 int hi =
treebits - 32 + clz<uint32_t>(
id);
66 unsigned int bkt = ((
id << (hi + 1)) & bitmask) | (1 << hi);
78 int lo = ctz<uint32_t>(
id) + 1;
81 unsigned int bkt = ((
id >> lo) & bitmask) | (1 << (
treebits - lo));
121template <
typename key_type,
size_t num_splitters>
130 key_type tree[num_splitters + 1],
131 unsigned char splitter_lcp[num_splitters + 1],
132 const key_type* samples,
size_t samplesize)
135 key_type sentinel = 0;
136 recurse(samples, samples + samplesize, 1, sentinel);
138 assert(
splitter_ == splitter + num_splitters);
139 assert(
lcp_iter_ == splitter_lcp + num_splitters);
141 splitter_lcp[0] &= 0x80;
143 splitter_lcp[num_splitters] = 0;
146 ptrdiff_t
snum(
const key_type* s)
const {
147 return static_cast<ptrdiff_t
>(s -
samples_);
150 key_type
recurse(
const key_type* lo,
const key_type* hi,
151 unsigned int treeidx, key_type& rec_prevkey) {
153 <<
"rec_buildtree(" <<
snum(lo) <<
"," <<
snum(hi)
154 <<
", treeidx=" << treeidx <<
")";
157 const key_type* mid = lo +
static_cast<ptrdiff_t
>(hi - lo) / 2;
160 <<
"tree[" << treeidx <<
"] = samples[" <<
snum(mid) <<
"] = "
163 key_type mykey =
tree_[treeidx] = *mid;
165 const key_type* midlo = mid;
166 while (lo < midlo && *(midlo - 1) == mykey) midlo--;
168 const key_type* midhi = mid;
169 while (midhi + 1 < hi && *midhi == mykey) midhi++;
171 if (midhi - midlo > 1)
174 const key_type* midlo = mid, * midhi = mid + 1;
176 if (2 * treeidx < num_splitters)
178 key_type prevkey =
recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
180 key_type xorSplit = prevkey ^ mykey;
186 <<
" - " <<
clz(xorSplit)
187 <<
" bits = " <<
clz(xorSplit) / 8
193 (
clz(xorSplit) / 8) |
195 ((mykey & 0xFF) ? 0 : 0x80);
197 return recurse(midhi, hi, 2 * treeidx + 1, mykey);
201 key_type xorSplit = rec_prevkey ^ mykey;
207 <<
" - " <<
clz(xorSplit)
208 <<
" bits = " <<
clz(xorSplit) / 8
214 (
clz(xorSplit) / 8) |
216 ((mykey & 0xFF) ? 0 : 0x80);
231template <
typename key_type,
size_t num_splitters>
239 unsigned char splitter_lcp[num_splitters + 1],
240 const key_type* samples,
size_t samplesize)
244 key_type sentinel = 0;
245 recurse(samples, samples + samplesize, 1, sentinel);
247 assert(
lcp_iter_ == splitter_lcp + num_splitters);
249 splitter_lcp[0] &= 0x80;
251 splitter_lcp[num_splitters] = 0;
254 ptrdiff_t
snum(
const key_type* s)
const {
255 return static_cast<ptrdiff_t
>(s -
samples_);
258 key_type
recurse(
const key_type* lo,
const key_type* hi,
259 unsigned int treeidx, key_type& rec_prevkey) {
261 <<
"rec_buildtree(" <<
snum(lo) <<
"," <<
snum(hi)
262 <<
", treeidx=" << treeidx <<
")";
265 const key_type* mid = lo +
static_cast<ptrdiff_t
>(hi - lo) / 2;
268 <<
"tree[" << treeidx <<
"] = samples[" <<
snum(mid) <<
"] = "
271 key_type mykey =
tree_[treeidx] = *mid;
273 const key_type* midlo = mid;
274 while (lo < midlo && *(midlo - 1) == mykey) midlo--;
276 const key_type* midhi = mid;
277 while (midhi + 1 < hi && *midhi == mykey) midhi++;
279 if (midhi - midlo > 1)
282 const key_type* midlo = mid, * midhi = mid + 1;
284 if (2 * treeidx < num_splitters)
286 key_type prevkey =
recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
288 key_type xorSplit = prevkey ^ mykey;
294 <<
" - " <<
clz(xorSplit)
295 <<
" bits = " <<
clz(xorSplit) / 8
299 (
clz(xorSplit) / 8) |
301 ((mykey & 0xFF) ? 0 : 0x80);
303 return recurse(midhi, hi, 2 * treeidx + 1, mykey);
307 key_type xorSplit = rec_prevkey ^ mykey;
313 <<
" - " <<
clz(xorSplit)
314 <<
" bits = " <<
clz(xorSplit) / 8
318 (
clz(xorSplit) / 8) |
320 ((mykey & 0xFF) ? 0 : 0x80);
335template <
typename key_type,
size_t TreeBits,
size_t Rollout = 4>
343 void build(key_type* samples,
size_t samplesize,
344 unsigned char* splitter_lcp) {
347 samples, samplesize);
368#define TLX_CLASSIFY_TREE_STEP \
369 for (size_t u = 0; u < Rollout; ++u) { \
370 i[u] = 2 * i[u] + (key[u] <= splitter_tree_[i[u]] ? 0 : 1); \
372 TLX_ATTRIBUTE_FALLTHROUGH;
377 const key_type key[Rollout], uint16_t obkt[Rollout])
const {
380 unsigned int i[Rollout];
381 std::fill(i, i + Rollout, 1u);
421 for (
size_t u = 0; u < Rollout; ++u)
424 for (
size_t u = 0; u < Rollout; ++u) {
429 for (
size_t u = 0; u < Rollout; ++u) {
436#undef TLX_CLASSIFY_TREE_STEP
439 template <
typename StringSet>
442 const StringSet& strset,
443 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
444 uint16_t* bktout,
size_t depth)
const {
447 if (begin + Rollout <= end)
449 key_type key[Rollout];
450 for (
size_t u = 0; u < Rollout; ++u)
451 key[u] = get_key<key_type>(strset, begin[u], depth);
455 begin += Rollout, bktout += Rollout;
459 key_type key = get_key<key_type>(strset, *begin++, depth);
477template <
typename key_type,
size_t TreeBits>
485 void build(key_type* samples,
size_t samplesize,
unsigned char* splitter_lcp) {
490#define TLX_CLASSIFY_TREE_STEP \
491 if (TLX_UNLIKELY(key == splitter_tree_[i])) { \
493 2 * PerfectTreeCalculations<treebits>::level_to_preorder(i) - 1; \
495 i = 2 * i + (key < splitter_tree_[i] ? 0 : 1); \
496 TLX_ATTRIBUTE_FALLTHROUGH;
546#undef TLX_CLASSIFY_TREE_STEP
549 template <
typename StringSet>
551 const StringSet& strset,
552 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
553 uint16_t* bktout,
size_t depth)
const {
556 key_type key = get_key<key_type>(strset, *begin++, depth);
575template <
typename key_type,
size_t TreeBits,
unsigned Rollout = 4>
583 void build(key_type* samples,
size_t samplesize,
584 unsigned char* splitter_lcp) {
613#define TLX_CLASSIFY_TREE_STEP \
614 for (size_t u = 0; u < Rollout; ++u) { \
615 i[u] = 2 * i[u] + (key[u] <= splitter_tree_[i[u]] ? 0 : 1); \
617 TLX_ATTRIBUTE_FALLTHROUGH;
622 const key_type key[Rollout], uint16_t obkt[Rollout])
const {
625 unsigned int i[Rollout];
626 std::fill(i + 0, i + Rollout, 1);
667 for (
unsigned u = 0; u < Rollout; ++u)
670 for (
unsigned u = 0; u < Rollout; ++u) {
675 for (
unsigned u = 0; u < Rollout; ++u) {
682#undef TLX_CLASSIFY_TREE_STEP
685 template <
typename StringSet>
688 const StringSet& strset,
689 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
690 uint16_t* bktout,
size_t depth)
const {
693 if (begin + Rollout <= end)
695 key_type key[Rollout];
696 for (
size_t u = 0; u < Rollout; ++u)
697 key[u] = get_key<key_type>(strset, begin[u], depth);
701 begin += Rollout, bktout += Rollout;
705 key_type key = get_key<key_type>(strset, *begin++, depth);
Sample Sort Classification Tree Unrolled with Equal Comparisons.
key_type splitter_tree_[num_splitters+1]
void build(key_type *samples, size_t samplesize, unsigned char *splitter_lcp)
build tree and splitter array from sample
static const size_t treebits
key_type get_splitter(unsigned int i) const
return a splitter
unsigned int find_bkt(const key_type &key) const
binary search on splitter array for bucket number
static const size_t num_splitters
void classify(const StringSet &strset, typename StringSet::Iterator begin, typename StringSet::Iterator end, uint16_t *bktout, size_t depth) const
classify all strings in area by walking tree and saving bucket id
Sample Sort Classification Tree Unrolled, Interleaved, and with Perfect Tree Index Calculations.
key_type splitter_tree_[num_splitters+1]
void build(key_type *samples, size_t samplesize, unsigned char *splitter_lcp)
build tree and splitter array from sample
static const size_t treebits
key_type get_splitter(unsigned int i) const
return a splitter
unsigned int find_bkt(const key_type &key) const
binary search on splitter array for bucket number
static const size_t num_splitters
void find_bkt_unroll(const key_type key[Rollout], uint16_t obkt[Rollout]) const
search in splitter tree for bucket number, unrolled for Rollout keys at once.
void classify(const StringSet &strset, typename StringSet::Iterator begin, typename StringSet::Iterator end, uint16_t *bktout, size_t depth) const
classify all strings in area by walking tree and saving bucket id
Sample Sort Classification Tree Unrolled and Interleaved.
key_type splitter_tree_[num_splitters+1]
void build(key_type *samples, size_t samplesize, unsigned char *splitter_lcp)
build tree and splitter array from sample
static const size_t treebits
key_type splitter_[num_splitters]
key_type get_splitter(unsigned int i) const
return a splitter
unsigned int find_bkt(const key_type &key) const
binary search on splitter array for bucket number
static const size_t num_splitters
void find_bkt_unroll(const key_type key[Rollout], uint16_t obkt[Rollout]) const
search in splitter tree for bucket number, unrolled for Rollout keys at once.
void classify(const StringSet &strset, typename StringSet::Iterator begin, typename StringSet::Iterator end, uint16_t *bktout, size_t depth) const
classify all strings in area by walking tree and saving bucket id
Recursive TreeBuilder for full-descent and unrolled variants, constructs only a level-order binary tr...
ptrdiff_t snum(const key_type *s) const
SSTreeBuilderLevelOrder(key_type tree[num_splitters], unsigned char splitter_lcp[num_splitters+1], const key_type *samples, size_t samplesize)
build tree, sizes: splitter_tree[num_splitters + 1] and
key_type recurse(const key_type *lo, const key_type *hi, unsigned int treeidx, key_type &rec_prevkey)
unsigned char * lcp_iter_
const key_type * samples_
static const bool debug_splitter
Recursive TreeBuilder for full-descent and unrolled variants, constructs a both a pre-order and level...
ptrdiff_t snum(const key_type *s) const
key_type recurse(const key_type *lo, const key_type *hi, unsigned int treeidx, key_type &rec_prevkey)
unsigned char * lcp_iter_
SSTreeBuilderPreAndLevelOrder(key_type splitter[num_splitters], key_type tree[num_splitters+1], unsigned char splitter_lcp[num_splitters+1], const key_type *samples, size_t samplesize)
const key_type * samples_
static const bool debug_splitter
#define tlx_die_unequal(X, Y)
Check that X == Y or die miserably, but output the values of X and Y for better debugging.
std::string hexdump_type(const Type &t)
Dump a (binary) item as a sequence of uppercase hexadecimal pairs.
#define TLX_LOGC(cond)
Explicitly specify the condition for logging.
#define TLX_LOG
Default logging method: output if the local debug variable is true.
#define TLX_LOG0
Override default output: never or always output log.
static void perfect_tree_calculations_self_verify()
static std::string to_binary(Type v, const size_t width=(8 *sizeof(Type)))
represent binary digits of large integer datatypes
Class to transform in-order to level-order indexes in a perfect binary tree.
static const size_t num_nodes
static unsigned int level_to_preorder(unsigned int id)
static const size_t treebits
static unsigned int pre_to_levelorder(unsigned int id)
static void self_verify()