properly handle duplicate page ranges

silverweed 2024-08-09 11:44:55 +02:00
parent d1548a467b
commit dcf6e13fa5

@@ -87,6 +87,7 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl
     fprintf(stderr, "Loading pages...\n");
     u64 n_pages = 0;
+    u64 n_duplicate_page_ranges = 0;
     u64 n_elems = 0;
     u64 tot_page_comp_size = 0;
     Page_Info_Node *pinfo_head = nullptr, *pinfo_tail = nullptr;
@@ -124,6 +125,8 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl
             pinfo->is_first_in_cluster = true;
         }
+        b8 duplicate = false;
         if (UNLIKELY(!pinfo_head)) {
             // first node inserted
             assert(!pinfo_tail);
@@ -146,6 +149,11 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl
             b8 pinfo_is_after_last = pinfo->range.start >= last_inserted_pinfo->range.end();
             if (pinfo_is_after_last) {
                 for (Page_Info_Node *node = last_inserted_pinfo->next; node; node = node->next) {
+                    // sanity check for duplicate pages
+                    if (pinfo->range.start == node->range.start) {
+                        duplicate = true;
+                        break;
+                    }
                     // check if `pinfo` fits right before the node we're looking at
                     if (pinfo->range.end() <= node->range.start) {
                         Page_Info_Node *prev = node->prev;
@@ -163,6 +171,11 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl
                 }
             } else {
                 for (Page_Info_Node *node = last_inserted_pinfo; node; node = node->prev) {
+                    // sanity check for duplicate pages
+                    if (pinfo->range.start == node->range.start) {
+                        duplicate = true;
+                        break;
+                    }
                     // check if `pinfo` fits right before the node we're looking at
                     if (pinfo->range.end() <= node->range.start) {
                         Page_Info_Node *prev = node->prev;
@@ -180,7 +193,12 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl
                 }
             }
-            assert(inserted);
+            assert(inserted != duplicate);
         }
+        if (duplicate) {
+            ++n_duplicate_page_ranges;
+            continue;
+        }
         last_inserted_pinfo = pinfo;
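
The added checks above short-circuit the sorted-list insertion as soon as an incoming page range starts at an offset that is already present, instead of linking the node in a second time. Below is a minimal, self-contained C++ sketch of the idea; the Range and Node types, the insert_sorted helper, and the driver in main are simplified stand-ins invented for illustration, not the viewer's actual definitions.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Simplified stand-ins for the viewer's Page_Info_Node and its range
// (assumed shapes, not the actual definitions).
struct Range {
    uint64_t start, len;
    uint64_t end() const { return start + len; }
};

struct Node {
    Range range;
    Node *prev = nullptr, *next = nullptr;
};

// Insert `n` into a list kept sorted by range.start, scanning forward from
// `from`. Returns false and leaves the list untouched when a node with the
// same start offset already exists -- the duplicate case the commit guards
// against before linking the node in.
static bool insert_sorted(Node *&head, Node *&tail, Node *from, Node *n) {
    for (Node *node = from; node; node = node->next) {
        // sanity check for duplicate pages
        if (n->range.start == node->range.start)
            return false;
        // `n` fits right before the node we're looking at
        if (n->range.end() <= node->range.start) {
            n->prev = node->prev;
            n->next = node;
            if (node->prev) node->prev->next = n;
            else head = n;
            node->prev = n;
            return true;
        }
    }
    // Reached the end of the list: append.
    n->prev = tail;
    n->next = nullptr;
    if (tail) tail->next = n;
    tail = n;
    if (!head) head = n;
    return true;
}

int main() {
    Node *head = nullptr, *tail = nullptr;
    uint64_t n_duplicate_page_ranges = 0;
    // The third range duplicates the first one's start offset.
    Node pages[] = {{{100, 10}}, {{200, 10}}, {{100, 10}}};
    for (Node &p : pages) {
        if (!insert_sorted(head, tail, head, &p)) {
            ++n_duplicate_page_ranges; // count and skip, as the commit does
            continue;
        }
    }
    fprintf(stderr, "%lu duplicates\n", (unsigned long)n_duplicate_page_ranges);
    return 0;
}

The real code differs in two ways: it scans forward or backward starting from `last_inserted_pinfo` rather than from the head, presumably because incoming pages are mostly ordered already, and it records the outcome in a `duplicate` flag so that `assert(inserted != duplicate)` can verify that exactly one of the two cases occurred.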
@@ -195,7 +213,8 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl
     chr::time_point end_t = chr::high_resolution_clock::now();
     u64 time_spent_ms = chr::duration_cast<chr::milliseconds>(end_t - start_t).count();
-    fprintf(stderr, "Loaded %lu pages in %lu ms.\nGenerating groups...\n", n_pages, time_spent_ms);
+    fprintf(stderr, "Loaded %lu pages in %lu ms (%lu duplicates).\nGenerating groups...\n",
+        n_pages, time_spent_ms, n_duplicate_page_ranges);
     // Create page groups and chunks.
     // Each page group is a grouping of GROUP_SIZE page infos whose range is equal to the combined ranges
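
As a side note on the reporting change in the last hunk: the elapsed-time computation follows the standard std::chrono pattern. A small sketch, assuming `namespace chr = std::chrono` (the alias is an assumption inferred from the `chr::` prefix in the diff):

#include <chrono>
#include <cstdint>
#include <cstdio>

namespace chr = std::chrono; // assumed alias, inferred from the diff's `chr::`

int main() {
    // CTAD (C++17) deduces the time_point template arguments, as in the diff.
    chr::time_point start_t = chr::high_resolution_clock::now();

    uint64_t n_pages = 0, n_duplicate_page_ranges = 0;
    // ... load pages here, counting n_pages and n_duplicate_page_ranges ...

    chr::time_point end_t = chr::high_resolution_clock::now();
    uint64_t time_spent_ms =
        chr::duration_cast<chr::milliseconds>(end_t - start_t).count();
    fprintf(stderr, "Loaded %lu pages in %lu ms (%lu duplicates).\nGenerating groups...\n",
            n_pages, time_spent_ms, n_duplicate_page_ranges);
    return 0;
}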