diff --git a/src/rntuple.cpp b/src/rntuple.cpp index 50075cc..720370e 100644 --- a/src/rntuple.cpp +++ b/src/rntuple.cpp @@ -87,6 +87,7 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl fprintf(stderr, "Loading pages...\n"); u64 n_pages = 0; + u64 n_duplicate_page_ranges = 0; u64 n_elems = 0; u64 tot_page_comp_size = 0; Page_Info_Node *pinfo_head = nullptr, *pinfo_tail = nullptr; @@ -124,6 +125,8 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl pinfo->is_first_in_cluster = true; } + b8 duplicate = false; + if (UNLIKELY(!pinfo_head)) { // first node inserted assert(!pinfo_tail); @@ -146,6 +149,11 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl b8 pinfo_is_after_last = pinfo->range.start >= last_inserted_pinfo->range.end(); if (pinfo_is_after_last) { for (Page_Info_Node *node = last_inserted_pinfo->next; node; node = node->next) { + // sanity check for duplicate pages + if (pinfo->range.start == node->range.start) { + duplicate = true; + break; + } // check if `pinfo` fits right before the node we're looking at if (pinfo->range.end() <= node->range.start) { Page_Info_Node *prev = node->prev; @@ -163,6 +171,11 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl } } else { for (Page_Info_Node *node = last_inserted_pinfo; node; node = node->prev) { + // sanity check for duplicate pages + if (pinfo->range.start == node->range.start) { + duplicate = true; + break; + } // check if `pinfo` fits right before the node we're looking at if (pinfo->range.end() <= node->range.start) { Page_Info_Node *prev = node->prev; @@ -180,7 +193,12 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl } } - assert(inserted); + assert(inserted != duplicate); + } + + if (duplicate) { + ++n_duplicate_page_ranges; + continue; } last_inserted_pinfo = pinfo; @@ -195,7 +213,8 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl chr::time_point end_t = chr::high_resolution_clock::now(); u64 time_spent_ms = chr::duration_cast(end_t - start_t).count(); - fprintf(stderr, "Loaded %lu pages in %lu ms.\nGenerating groups...\n", n_pages, time_spent_ms); + fprintf(stderr, "Loaded %lu pages in %lu ms (%lu duplicates).\nGenerating groups...\n", + n_pages, time_spent_ms, n_duplicate_page_ranges); // Create page groups and chunks. // Each page group is a grouping of GROUP_SIZE page infos whose range is equal to the combined ranges