import re

import utils
from cardlib import Card


# Format a list of rows of data into nice columns.
# Note that it's the columns that are nice, not this code.
def padrows(l):
    """Render rows of fields as strings with aligned columns.

    Every field is converted with ``str`` and left-justified to the width
    of the widest field found at the same position in any row. Each field
    is followed by a single space, so rendered rows end with a space.

    Args:
        l: a sequence of rows, each row a sequence of fields. It is
            traversed twice, so it must not be a one-shot iterator. Rows
            may have different lengths.

    Returns:
        A list with one rendered string per row.
    """
    # First pass: widest rendering seen in each column.
    widths = []
    for row in l:
        for i, field in enumerate(row):
            width = len(str(field))
            if i < len(widths):
                widths[i] = max(width, widths[i])
            else:
                widths.append(width)

    # Second pass: pad every field out to its column width.
    return [
        ''.join(str(field).ljust(widths[i]) + ' '
                for i, field in enumerate(row))
        for row in l
    ]


def printrows(l):
    """Print every element of `l` on its own line."""
    for row in l:
        print(row)


# index management helpers
def index_size(d):
    """Return the total number of members over all keys of index `d`."""
    return sum(len(d[k]) for k in d)


def inc(d, k, obj):
    """Add `obj` to the bucket stored under key `k` of index `d`.

    Falsy keys (None, '', empty collections) are silently skipped; the
    integer 0 is the one falsy key that is still indexed. The first `obj`
    stored under a key is kept as-is (not copied); later ones are added
    to it with ``+=``.
    """
    if k or k == 0:
        if k in d:
            d[k] += obj
        else:
            d[k] = obj


# thanks gleemax
def plimit(s, mlen = 1000):
    """Truncate `s` to at most `mlen` characters, marking the cut.

    Args:
        s: the string to limit.
        mlen: maximum number of characters of `s` to keep.

    Returns:
        `s` unchanged when it fits, otherwise its first `mlen` characters
        followed by '[...]'.
    """
    if len(s) > mlen:
        # Cut at mlen; a hard-coded 1000 here would ignore the parameter.
        return s[:mlen] + '[...]'
    return s


def _most_populated(index, limit):
    """Return up to `limit` keys of `index`, those with most members first."""
    return sorted(index, key=lambda k: len(index[k]), reverse=True)[:limit]


def _print_horizontal(index, keys):
    """Print a two-row table: `keys` on top, their member counts below."""
    keys = list(keys)
    printrows(padrows([keys, [len(index[k]) for k in keys]]))


def _print_vertical(index, keys, label=None):
    """Print one '<key> <member count>' line per key, optionally relabeled."""
    printrows(padrows([[label(k) if label else k, len(index[k])]
                       for k in keys]))


def _print_longest_key(index, title, noun):
    """Print the longest key of `index`, or a notice when it is empty."""
    if index:
        longest = max(index, key=len)
        print(title + ': (' + str(len(longest)) + ')')
        print(' ' + longest)
    else:
        print('No cards indexed by ' + noun + '?')


class Datamine:
    """Indexes a collection of cards by many of their properties.

    Each index is a dict mapping a property value to the list of cards
    having that value. All indices are also reachable by name through
    `self.indices`.
    """

    # build the global indices
    def __init__(self, card_srcs):
        """Build a `Card` from every non-empty source and index it.

        Args:
            card_srcs: iterable of card sources, each passed to `Card`.
                Falsy entries are skipped.
        """
        # global card pools
        self.unparsed_cards = []
        self.invalid_cards = []
        self.cards = []
        self.allcards = []

        # global indices
        self.by_name = {}
        self.by_type = {}
        self.by_type_inclusive = {}
        self.by_supertype = {}
        self.by_supertype_inclusive = {}
        self.by_subtype = {}
        self.by_subtype_inclusive = {}
        self.by_color = {}
        self.by_color_inclusive = {}
        self.by_color_count = {}
        self.by_cmc = {}
        self.by_cost = {}
        self.by_power = {}
        self.by_toughness = {}
        self.by_pt = {}
        self.by_loyalty = {}
        self.by_textlines = {}
        self.by_textlen = {}

        self.indices = {
            'by_name' : self.by_name,
            'by_type' : self.by_type,
            'by_type_inclusive' : self.by_type_inclusive,
            'by_supertype' : self.by_supertype,
            'by_supertype_inclusive' : self.by_supertype_inclusive,
            'by_subtype' : self.by_subtype,
            'by_subtype_inclusive' : self.by_subtype_inclusive,
            'by_color' : self.by_color,
            'by_color_inclusive' : self.by_color_inclusive,
            'by_color_count' : self.by_color_count,
            'by_cmc' : self.by_cmc,
            'by_cost' : self.by_cost,
            'by_power' : self.by_power,
            'by_toughness' : self.by_toughness,
            'by_pt' : self.by_pt,
            'by_loyalty' : self.by_loyalty,
            'by_textlines' : self.by_textlines,
            'by_textlen' : self.by_textlen,
        }

        for card_src in card_srcs:
            # the empty card is not interesting
            if not card_src:
                continue
            card = Card(card_src)

            # Sort the card into its pool; unparsed cards are kept out of
            # allcards and out of every index.
            if card.valid:
                self.cards.append(card)
                self.allcards.append(card)
            elif card.parsed:
                self.invalid_cards.append(card)
                self.allcards.append(card)
            else:
                self.unparsed_cards.append(card)

            if not card.parsed:
                continue

            inc(self.by_name, card.name, [card])

            # Each type family is indexed twice: by the exact combination
            # (joined with spaces) and by every individual member.
            inc(self.by_type, ' '.join(card.types), [card])
            for t in card.types:
                inc(self.by_type_inclusive, t, [card])
            inc(self.by_supertype, ' '.join(card.supertypes), [card])
            for t in card.supertypes:
                inc(self.by_supertype_inclusive, t, [card])
            inc(self.by_subtype, ' '.join(card.subtypes), [card])
            for t in card.subtypes:
                inc(self.by_subtype_inclusive, t, [card])

            colors = card.cost.colors
            if colors:
                inc(self.by_color, colors, [card])
                for c in colors:
                    inc(self.by_color_inclusive, c, [card])
                inc(self.by_color_count, len(colors), [card])
            else:
                # colorless, still want to include in these tables
                inc(self.by_color, 'A', [card])
                inc(self.by_color_inclusive, 'A', [card])
                inc(self.by_color_count, 0, [card])

            inc(self.by_cmc, card.cost.cmc, [card])
            # An empty encoded cost would be dropped by inc(), so file it
            # under the placeholder key 'none' instead.
            inc(self.by_cost, card.cost.encode() or 'none', [card])

            inc(self.by_power, card.pt_p, [card])
            inc(self.by_toughness, card.pt_t, [card])
            inc(self.by_pt, card.pt, [card])

            inc(self.by_loyalty, card.loyalty, [card])

            inc(self.by_textlines, len(card.text_lines), [card])
            inc(self.by_textlen, len(card.text.encode()), [card])

    # summarize the indices
    # Yes, this printing code is pretty terrible.
    def summarize(self, hsize = 10, vsize = 10, cmcsize = 20):
        """Print a statistical overview of the indexed cards.

        Args:
            hsize: maximum number of columns in the horizontal type and
                supertype breakdowns.
            vsize: maximum number of lines in each "popular ..." listing.
            cmcsize: maximum number of columns in the CMC breakdown.
        """
        print('====================')
        print('{} valid cards, {} invalid cards.'.format(
            len(self.cards), len(self.invalid_cards)))
        print('{} cards parsed, {} failed to parse'.format(
            len(self.allcards), len(self.unparsed_cards)))
        print('--------------------')
        print('{} unique card names'.format(len(self.by_name)))
        print('--------------------')

        print("{} represented colors (including colorless as 'A'), "
              "{} combinations".format(len(self.by_color_inclusive),
                                       len(self.by_color)))
        print('Breakdown by color:')
        _print_horizontal(self.by_color_inclusive, self.by_color_inclusive)
        print('Breakdown by number of colors:')
        _print_horizontal(self.by_color_count, self.by_color_count)
        print('--------------------')

        print('{} unique card types, {} combinations'.format(
            len(self.by_type_inclusive), len(self.by_type)))
        print('Breakdown by type:')
        _print_horizontal(self.by_type_inclusive,
                          _most_populated(self.by_type_inclusive, hsize))
        print('--------------------')

        print('{} unique subtypes, {} combinations'.format(
            len(self.by_subtype_inclusive), len(self.by_subtype)))
        print('-- Popular subtypes: --')
        _print_vertical(self.by_subtype_inclusive,
                        _most_populated(self.by_subtype_inclusive, vsize))
        print('-- Top combinations: --')
        _print_vertical(self.by_subtype,
                        _most_populated(self.by_subtype, vsize))
        print('--------------------')

        print('{} unique supertypes, {} combinations'.format(
            len(self.by_supertype_inclusive), len(self.by_supertype)))
        print('Breakdown by supertype:')
        _print_horizontal(self.by_supertype_inclusive,
                          _most_populated(self.by_supertype_inclusive, hsize))
        print('--------------------')

        print('{} different CMCs, {} unique mana costs'.format(
            len(self.by_cmc), len(self.by_cost)))
        print('Breakdown by CMC:')
        # CMCs are listed in ascending order rather than by popularity.
        _print_horizontal(self.by_cmc, sorted(self.by_cmc)[:cmcsize])
        print('-- Popular mana costs: --')
        _print_vertical(self.by_cost, _most_populated(self.by_cost, vsize),
                        label=utils.from_mana)
        print('--------------------')

        print('{} unique p/t combinations'.format(len(self.by_pt)))
        if self.by_power and self.by_toughness:
            # Reported value is the longest key's length minus one;
            # presumably each key carries one non-counting marker
            # character -- TODO confirm against the Card encoding.
            print('Largest power: {}, largest toughness: {}'.format(
                max(map(len, self.by_power)) - 1,
                max(map(len, self.by_toughness)) - 1))
        print('-- Popular p/t values: --')
        _print_vertical(self.by_pt, _most_populated(self.by_pt, vsize),
                        label=utils.from_unary)
        print('--------------------')

        print('Loyalty values:')
        _print_vertical(self.by_loyalty,
                        _most_populated(self.by_loyalty, vsize),
                        label=utils.from_unary)
        print('--------------------')

        if self.by_textlen and self.by_textlines:
            print('Card text ranges from {} to {} characters in length'.format(
                min(self.by_textlen), max(self.by_textlen)))
            print('Card text ranges from {} to {} lines'.format(
                min(self.by_textlines), max(self.by_textlines)))
        print('-- Line counts by frequency: --')
        _print_vertical(self.by_textlines,
                        _most_populated(self.by_textlines, vsize))
        print('====================')

    # describe outliers in the indices
    def outliers(self, hsize=10, vsize=10, dump_invalid=False):
        """Print the extreme entries of the indices and the rejected cards.

        Args:
            hsize: accepted for symmetry with `summarize`; not used here.
            vsize: maximum number of names examined for the duplicated
                names listing.
            dump_invalid: when true, print the `fields` of every invalid
                and every unparsed card instead of only counting them.
        """
        print('********************')
        print('Overview of indices:')
        rows = [['Index Name', 'Keys', 'Total Members']]
        for name in self.indices:
            index = self.indices[name]
            rows.append([name, len(index), index_size(index)])
        printrows(padrows(rows))
        print('********************')

        if self.by_name:
            # min/max with a key return the first extreme element, the
            # same one a stable sort would place at position 0.
            scardname = min(self.by_name, key=len)
            print('Shortest Cardname: (' + str(len(scardname)) + ')')
            print(' ' + scardname)
            lcardname = max(self.by_name, key=len)
            print('Longest Cardname: (' + str(len(lcardname)) + ')')
            print(' ' + lcardname)
            # Only names that actually occur more than once are listed.
            duplicated = [k for k in _most_populated(self.by_name, vsize)
                          if len(self.by_name[k]) > 1]
            if not duplicated:
                print('No duplicated cardnames')
            else:
                print('-- Most duplicated names: --')
                _print_vertical(self.by_name, duplicated)
        else:
            print('No cards indexed by name?')
        print('--------------------')

        _print_longest_key(self.by_type, 'Longest card type', 'type')
        _print_longest_key(self.by_subtype, 'Longest subtype', 'subtype')
        _print_longest_key(self.by_supertype, 'Longest supertype', 'supertype')
        print('--------------------')

        if self.by_cost:
            lcost = max(self.by_cost, key=len)
            print('Longest mana cost: (' + str(len(lcost)) + ')')
            print(' ' + utils.from_mana(lcost))
            print('\n' + plimit(self.by_cost[lcost][0].encode()) + '\n')
        else:
            print('No cards indexed by cost?')
        if self.by_cmc:
            lcmc = max(self.by_cmc)
            print('Largest cmc: (' + str(lcmc) + ')')
            print(' ' + str(self.by_cmc[lcmc][0].cost))
            print('\n' + plimit(self.by_cmc[lcmc][0].encode()))
        else:
            print('No cards indexed by cmc?')
        print('--------------------')

        if self.by_power:
            # "Largest" means the longest key, not a numeric comparison.
            lpower = max(self.by_power, key=len)
            print('Largest creature power: ' + utils.from_unary(lpower))
            print('\n' + plimit(self.by_power[lpower][0].encode()) + '\n')
        else:
            print('No cards indexed by power?')
        if self.by_toughness:
            ltoughness = max(self.by_toughness, key=len)
            print('Largest creature toughness: ' + utils.from_unary(ltoughness))
            print('\n' + plimit(self.by_toughness[ltoughness][0].encode()))
        else:
            print('No cards indexed by toughness?')
        print('--------------------')

        if self.by_textlines:
            llines = max(self.by_textlines)
            print('Most lines of text in a card: ' + str(llines))
            print('\n' + plimit(self.by_textlines[llines][0].encode()) + '\n')
        else:
            print('No cards indexed by line count?')
        if self.by_textlen:
            ltext = max(self.by_textlen)
            print('Most chars in a card text: ' + str(ltext))
            print('\n' + plimit(self.by_textlen[ltext][0].encode()))
        else:
            print('No cards indexed by char count?')
        print('--------------------')

        print('There were ' + str(len(self.invalid_cards)) + ' invalid cards.')
        if dump_invalid:
            for card in self.invalid_cards:
                print('\n' + repr(card.fields))
        elif self.invalid_cards:
            print('Not summarizing.')
        print('--------------------')

        print('There were ' + str(len(self.unparsed_cards)) + ' unparsed cards.')
        if dump_invalid:
            for card in self.unparsed_cards:
                print('\n' + repr(card.fields))
        elif self.unparsed_cards:
            print('Not summarizing.')
        print('====================')