From cae82ef66fb6b346efd2c5df423e0d052715d95b Mon Sep 17 00:00:00 2001 From: silverweed Date: Fri, 20 Sep 2024 11:53:53 +0200 Subject: [PATCH] make rntviewer more resilient to corrupted files --- src/hover.cpp | 11 +++-- src/rntuple.cpp | 112 +++++++++++++++++++++++++++--------------------- 2 files changed, 68 insertions(+), 55 deletions(-) diff --git a/src/hover.cpp b/src/hover.cpp index 36ff211..88db4d4 100644 --- a/src/hover.cpp +++ b/src/hover.cpp @@ -719,13 +719,12 @@ Sec_Hover_Info get_section_hover_info(Arena *arena, Section section, u64 off, co // so they're always occupying only 8 bytes. || hover.field_le("Flags: 0x%" PRIX64) || hover.field_le("Header checksum: 0x%" PRIX64) + || hover.frame_header() || hover.schema_description("Schema Extension") - // TODO: - // - list of column group record frames - || hover.frame_header() - // - list of cluster group record frames - || hover.frame_header() - || hover.cluster_group() + // - list of column group record frames (TODO) + || hover.frame_header("Column Groups") + // - list of cluster group record frames (TODO) + || hover.frame_header("Cluster Groups") || hover.range("Payload", section.range.len - hover.cur_field_off) || hover.field_le("Checksum: 0x%" PRIX64) ; diff --git a/src/rntuple.cpp b/src/rntuple.cpp index f484a08..5803210 100644 --- a/src/rntuple.cpp +++ b/src/rntuple.cpp @@ -34,8 +34,12 @@ ROOT::Experimental::RNTupleDescriptor create_descriptor(Arena *arena, RMicroFile // Deserialize header+footer RNTupleDescriptorBuilder desc_builder; - RNTupleSerializer::DeserializeHeader(header, anchor.fLenHeader, desc_builder); - RNTupleSerializer::DeserializeFooter(footer, anchor.fLenFooter, desc_builder); + try { + RNTupleSerializer::DeserializeHeader(header, anchor.fLenHeader, desc_builder); + RNTupleSerializer::DeserializeFooter(footer, anchor.fLenFooter, desc_builder); + } catch (...) { + fprintf(stderr, "Failed to deserialize header/footer!\n"); + } RNTupleDescriptor descriptor = desc_builder.MoveDescriptor(); for (const RClusterGroupDescriptor &cgdesc : descriptor.GetClusterGroupIterable()) { @@ -262,65 +266,75 @@ void gather_ntuple_metadata(Arena *arena, RMicroFileReader &reader, const RNTupl // that an offset belongs to. // A page chunk is a grouping of adjacent pages, used to quickly determine if an offset is part // of a page or not. - assert(pinfo_head); - const u64 GROUP_SIZE = 500; - Page_Info_Group *groups = arena_push_array_nozero(arena, n_pages / GROUP_SIZE + 1); - u64 n_groups = 1; - groups->first = pinfo_head; - groups->range.start = pinfo_head->range.start; + Page_Info_Group *groups = nullptr; + Page_Info_Chunk *chunks_head = nullptr, *chunks_tail = nullptr; + u64 n_groups = 0; + u64 n_chunks = 0; + u64 idx = 0; + // NOTE: pinfo_head may be null if we failed to load any page (which may happen e.g. if the rntuple + // is corrupted) + if (pinfo_head) { + const u64 GROUP_SIZE = 500; + groups = arena_push_array_nozero(arena, n_pages / GROUP_SIZE + 1); + n_groups = 1; + groups->first = pinfo_head; + groups->range.start = pinfo_head->range.start; - Page_Info_Chunk *chunks_head = arena_push(arena); - Page_Info_Chunk *chunks_tail = chunks_head; - chunks_head->range = pinfo_head->range; - u64 n_chunks = 1; + chunks_head = arena_push(arena); + chunks_tail = chunks_head; + chunks_head->range = pinfo_head->range; + n_chunks = 1; - u64 idx = 1; - [[maybe_unused]] Page_Info_Node *prev = pinfo_head; - for (Page_Info_Node *pinfo = pinfo_head->next; pinfo; pinfo = pinfo->next) { - assert(prev->range.end() <= pinfo->range.start); - prev = pinfo; + idx = 1; + [[maybe_unused]] Page_Info_Node *prev = pinfo_head; + for (Page_Info_Node *pinfo = pinfo_head->next; pinfo; pinfo = pinfo->next) { + assert(prev->range.end() <= pinfo->range.start); + prev = pinfo; - if (pinfo->range.start != chunks_tail->range.end()) { - // close current chunk and open new one - Page_Info_Chunk *chunk = arena_push(arena); - chunk->range.start = pinfo->range.start; - chunk->first_group = n_groups - 1; - chunks_tail->next = chunk; - chunks_tail = chunk; - ++n_chunks; + if (pinfo->range.start != chunks_tail->range.end()) { + // close current chunk and open new one + Page_Info_Chunk *chunk = arena_push(arena); + chunk->range.start = pinfo->range.start; + chunk->first_group = n_groups - 1; + chunks_tail->next = chunk; + chunks_tail = chunk; + ++n_chunks; + } + chunks_tail->range.len += pinfo->range.len; + + // while we're at it, set the first_page_idx information on the page's parent cluster + // Note that the first page won't update its cluster's `first_page_idx` (since we loop + // from idx = 1) but that's fine because that idx is by definition 0. + if (pinfo->is_first_in_cluster) + clusters[pinfo->cluster_id].first_page_idx = idx; + + if (idx++ % GROUP_SIZE != 0) + continue; + + // Create a new group every GROUP_SIZE page infos + + Page_Info_Group &cur_group = groups[n_groups]; + cur_group.first = pinfo; + cur_group.range.start = pinfo->range.start; + Page_Info_Group &prev_group = groups[n_groups - 1]; + prev_group.range.len = cur_group.range.start - prev_group.range.start; + + ++n_groups; } - chunks_tail->range.len += pinfo->range.len; - - // while we're at it, set the first_page_idx information on the page's parent cluster - // Note that the first page won't update its cluster's `first_page_idx` (since we loop - // from idx = 1) but that's fine because that idx is by definition 0. - if (pinfo->is_first_in_cluster) - clusters[pinfo->cluster_id].first_page_idx = idx; - - if (idx++ % GROUP_SIZE != 0) - continue; - - // Create a new group every GROUP_SIZE page infos - - Page_Info_Group &cur_group = groups[n_groups]; - cur_group.first = pinfo; - cur_group.range.start = pinfo->range.start; - Page_Info_Group &prev_group = groups[n_groups - 1]; - prev_group.range.len = cur_group.range.start - prev_group.range.start; - - ++n_groups; } // verify that we added all pages to chunks assert(idx == n_pages); - Page_Info_Group &last_group = groups[n_groups - 1]; - last_group.range.len = pinfo_tail->range.end() - last_group.range.start; + if (n_groups) { + Page_Info_Group &last_group = groups[n_groups - 1]; + last_group.range.len = pinfo_tail->range.end() - last_group.range.start; + } fprintf(stderr, "Generated %" PRIu64 " groups and %" PRIu64 " chunks.\n", n_groups, n_chunks); - assert(!chunks_tail->next); - assert(!pinfo_tail->next); + assert(!chunks_tail || !chunks_tail->next); + assert(!pinfo_tail || !pinfo_tail->next); rndata.pages = pinfo_head; rndata.page_groups = groups;