import re import codecs import sys import random import utils # Format a list of rows of data into nice columns. # Note that it's the columns that are nice, not this code. def padrows(l): # get length for each field lens = [] for ll in l: for i, field in enumerate(ll): if i < len(lens): lens[i] = max(len(str(field)), lens[i]) else: lens += [len(str(field))] # now pad out to that length padded = [] for ll in l: padded += [''] for i, field in enumerate(ll): s = str(field) pad = ' ' * (lens[i] - len(s)) padded[-1] += (s + pad + ' ') return padded def printrows(l): for row in l: print row def randomize_all_mana(text): manastrs = re.findall(utils.mana_regex, text) newtext = text for manastr in sorted(manastrs, lambda x,y: cmp(len(x), len(y)), reverse = True): newtext = newtext.replace(manastr, utils.reserved_marker) for manastr in manastrs: newtext = newtext.replace(utils.reserved_marker, Manacost(manastr).reencode(randomize = True), 1) return newtext # so this stuff still needs to be cleaned up punctuation_chars = r'[+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~%\[\]]' creature_keywords = [ # evergreen 'deathtouch', 'defender', 'double strike', 'first strike', 'flash', 'flying', 'haste', 'hexproof', 'indestructible', 'lifelink', 'menace', 'prowess', 'reach', 'trample', 'vigilance', # no longer evergreen 'banding', 'fear', 'shroud', 'intimidate', # expert level keywords 'absorb', 'amplify', 'annihilator', 'battle cry', 'bolster', 'bloodthirst', 'bushido', 'changeling', 'convoke', 'devour', 'evolve', 'exalted', 'extort', 'fading', 'flanking', 'frenzy', 'graft', 'haunt', 'horsemanship', 'infect', 'modular', #'morph', #'ninjutsu', 'persist', 'poisonous', 'provoke', #'prowl', 'rampage', 'ripple', #'scavenge', 'shadow', 'soulbond', 'soulshift', 'split second', 'sunburst', 'undying', #'unearth', 'unleash', 'vanishing', 'wither', ] # there are other keywords out there, these are just easy to detect # data aggregating classes class Manacost: '''mana cost representation with data''' # hardcoded to be dependent on the symbol structure... ah well def get_colors(self): colors = '' for sym in self.symbols: if self.symbols[sym] > 0: symcolors = re.sub(r'2|P|S|X', '', sym) for symcolor in symcolors: if symcolor not in colors: colors += symcolor # sort so the order is always consistent return ''.join(sorted(colors)) def check_colors(self, symbolstring): for sym in symbolstring: if not sym in self.colors: return False return True def __init__(self, text): self.raw = text self.cmc = 0 self.colorless = 0 self.sequence = [] self.symbols = {sym : 0 for sym in utils.mana_syms} self.allsymbols = {sym : 0 for sym in utils.mana_symall} if text == '': self._parsed = True self._valid = True self.none = True self.inner = '' elif not (len(self.raw) >= 2 and self.raw[0] == '{' and self.raw[-1] == '}'): self._parsed = False self._valid = False self.none = False else: self._parsed = True self._valid = True self.none = False self.inner = self.raw[1:-1] # structure mirrors the decoding in utils, but we pull out different data here idx = 0 while idx < len(self.inner): # taking this branch is an infinite loop if unary_marker is empty if (len(utils.mana_unary_marker) > 0 and self.inner[idx:idx+len(utils.mana_unary_marker)] == utils.mana_unary_marker): idx += len(utils.mana_unary_marker) self.sequence += [utils.mana_unary_marker] elif self.inner[idx:idx+len(utils.mana_unary_counter)] == utils.mana_unary_counter: idx += len(utils.mana_unary_counter) self.sequence += [utils.mana_unary_counter] self.colorless += 1 self.cmc += 1 else: old_idx = idx for symlen in range(utils.mana_symlen_min, utils.mana_symlen_max + 1): encoded_sym = self.inner[idx:idx+symlen] if encoded_sym in utils.mana_symall_decode: idx += symlen # leave the sequence encoded for convenience self.sequence += [encoded_sym] sym = utils.mana_symall_decode[encoded_sym] self.allsymbols[sym] += 1 if sym in utils.mana_symalt: self.symbols[utils.mana_alt(sym)] += 1 else: self.symbols[sym] += 1 if sym == utils.mana_X: self.cmc += 0 elif utils.mana_2 in sym: self.cmc += 2 else: self.cmc += 1 break # otherwise we'll go into an infinite loop if we see a symbol we don't know if idx == old_idx: idx += 1 self._valid = False self.colors = self.get_colors() def __str__(self): return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence) + utils.mana_close_delimiter) def format(self, for_forum): return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence, for_forum) + utils.mana_close_delimiter) def reencode(self, randomize = False): if self.none: return '' elif randomize: # so this won't work very well if mana_unary_marker isn't empty return (utils.mana_open_delimiter + ''.join(random.sample(self.sequence, len(self.sequence))) + utils.mana_close_delimiter) else: return utils.mana_open_delimiter + ''.join(self.sequence) + utils.mana_close_delimiter class Card: '''card representation with data''' def __init__(self, text): self.raw = text self._parsed = True self._valid = True if '\n' in self.raw: halves = self.raw.split('\n') if not len(halves) == 2: self._parsed = False self._valid = False self.fields = halves return else: self.raw = halves[0] self.bside = Card(halves[1]) if not self.bside._valid: self._valid = False else: self.bside = None fields = self.raw.split(utils.fieldsep) if not len(fields) >= 10: self._parsed = False self._valid = False self.fields = fields else: if not fields[1] == '': self.name = fields[1] else: self.name = '' self._valid = False if not fields[2] == '': self.supertypes = fields[2].split(' ') else: self.supertypes = [] if not fields[3] == '': self.types = fields[3].split(' ') else: self.types = [] self._valid = False if not fields[4] == '': self.loyalty = fields[4] try: self.loyalty_value = int(self.loyalty) except ValueError: self.loyalty_value = None # strictly speaking, '* where * is something' is valid... # self._valid = False else: self.loyalty = None self.loyalty_value = None if not fields[5] == '': self.subtypes = fields[5].split(' ') if 'creature' in self.types: self.creaturetypes = self.subtypes else: self.creaturetypes = [] else: self.subtypes = [] self.creaturetypes = [] if not fields[6] == '': self.pt = fields[6] self.power = None self.power_value = None self.toughness = None self.toughness_value = None p_t = self.pt.split('/') if len(p_t) == 2: self.power = p_t[0] try: self.power_value = int(self.power) except ValueError: self.power_value = None self.toughness = p_t[1] try: self.toughness_value = int(self.toughness) except ValueError: self.toughness_value = None else: self._valid = False else: self.pt = None self.power = None self.power_value = None self.toughness = None self.toughness_value = None # if there's no cost (lands) then cost.none will be True self.cost = Manacost(fields[7]) if not fields[8] == '': self.text = fields[8] self.text_lines = self.text.split(utils.newline) self.text_words = re.sub(punctuation_chars, ' ', self.text).split() self.creature_words = [] # SUPER HACK if 'creature' in self.types: for line in self.text_lines: orig_line = line guess = [] for keyword in creature_keywords: if keyword in line: guess += [keyword] line = line.replace(keyword, '') # yeah, I said it was a hack if re.sub(punctuation_chars, ' ', line).split() == [] or 'protect' in line or 'walk' in line or 'sliver creatures' in line or 'you control have' in line: for word in guess: if word not in self.creature_words: self.creature_words += [word] # elif len(guess) > 0 and len(line) < 30: # print orig_line else: self.text = '' self.text_lines = [] self.text_words = [] self.creature_words = [] def __str__(self): return ''.join([ utils.fieldsep, self.name, utils.fieldsep, (' ' + utils.dash_marker + ' ').join([' '.join(self.supertypes + self.types), ' '.join(self.subtypes)]), utils.fieldsep, str(self.cost.cmc) if self.cost.colors == '' else str(self.cost.cmc) + ', ' + self.cost.colors, utils.fieldsep, ]) def reencode(self, randomize = False): return ''.join([ utils.fieldsep, self.name, utils.fieldsep, ' '.join(self.supertypes), utils.fieldsep, ' '.join(self.types), utils.fieldsep, self.loyalty if self.loyalty else '', utils.fieldsep, ' '.join(self.subtypes), utils.fieldsep, self.pt if self.pt else '', utils.fieldsep, self.cost.reencode(randomize) if not self.cost.none else '', utils.fieldsep, self.text if not randomize else randomize_all_mana(self.text), utils.fieldsep, utils.bsidesep + self.bside.reencode(randomize) if self.bside else '', ]) # global card pools unparsed_cards = [] invalid_cards = [] cards = [] allcards = [] # global indices by_name = {} by_type = {} by_type_inclusive = {} by_supertype = {} by_supertype_inclusive = {} by_subtype = {} by_subtype_inclusive = {} by_color = {} by_color_inclusive = {} by_color_count = {} by_cmc = {} by_cost = {} by_power = {} by_toughness = {} by_pt = {} by_loyalty = {} by_textlines = {} by_textlen = {} indices = { 'by_name' : by_name, 'by_type' : by_type, 'by_type_inclusive' : by_type_inclusive, 'by_supertype' : by_supertype, 'by_supertype_inclusive' : by_supertype_inclusive, 'by_subtype' : by_subtype, 'by_subtype_inclusive' : by_subtype_inclusive, 'by_color' : by_color, 'by_color_inclusive' : by_color_inclusive, 'by_color_count' : by_color_count, 'by_cmc' : by_cmc, 'by_cost' : by_cost, 'by_power' : by_power, 'by_toughness' : by_toughness, 'by_pt' : by_pt, 'by_loyalty' : by_loyalty, 'by_textlines' : by_textlines, 'by_textlen' : by_textlen, } def index_size(d): return sum(map(lambda k: len(d[k]), d)) def inc(d, k, obj): if k or k == 0: if k in d: d[k] += obj else: d[k] = obj # build the global indices def analyze(cardtexts): global unparsed_cards, invalid_cards, cards, allcards for cardtext in cardtexts: # the empty card is not interesting if not cardtext: continue card = Card(cardtext) if card._valid: cards += [card] allcards += [card] elif card._parsed: invalid_cards += [card] allcards += [card] else: unparsed_cards += [card] if card._parsed: inc(by_name, card.name, [card]) inc(by_type, ' '.join(card.types), [card]) for t in card.types: inc(by_type_inclusive, t, [card]) inc(by_supertype, ' '.join(card.supertypes), [card]) for t in card.supertypes: inc(by_supertype_inclusive, t, [card]) inc(by_subtype, ' '.join(card.subtypes), [card]) for t in card.subtypes: inc(by_subtype_inclusive, t, [card]) if card.cost.colors: inc(by_color, card.cost.colors, [card]) for c in card.cost.colors: inc(by_color_inclusive, c, [card]) inc(by_color_count, len(card.cost.colors), [card]) else: # colorless, still want to include in these tables inc(by_color, 'A', [card]) inc(by_color_inclusive, 'A', [card]) inc(by_color_count, 0, [card]) inc(by_cmc, card.cost.cmc, [card]) inc(by_cost, card.cost.reencode() if card.cost.reencode() else 'none', [card]) inc(by_power, card.power, [card]) inc(by_toughness, card.toughness, [card]) inc(by_pt, card.pt, [card]) inc(by_loyalty, card.loyalty, [card]) inc(by_textlines, len(card.text_lines), [card]) inc(by_textlen, len(card.text), [card]) # summarize the indices # Yes, this printing code is pretty terrible. def summarize(hsize = 10, vsize = 10, cmcsize = 20): print '====================' print str(len(cards)) + ' valid cards, ' + str(len(invalid_cards)) + ' invalid cards.' print str(len(allcards)) + ' cards parsed, ' + str(len(unparsed_cards)) + ' failed to parse' print '--------------------' print str(len(by_name)) + ' unique card names' print '--------------------' print (str(len(by_color_inclusive)) + ' represented colors (including colorless as \'A\'), ' + str(len(by_color)) + ' combinations') print 'Breakdown by color:' rows = [by_color_inclusive.keys()] rows += [[len(by_color_inclusive[k]) for k in rows[0]]] printrows(padrows(rows)) print 'Breakdown by number of colors:' rows = [by_color_count.keys()] rows += [[len(by_color_count[k]) for k in rows[0]]] printrows(padrows(rows)) print '--------------------' print str(len(by_type_inclusive)) + ' unique card types, ' + str(len(by_type)) + ' combinations' print 'Breakdown by type:' d = sorted(by_type_inclusive, lambda x,y: cmp(len(by_type_inclusive[x]), len(by_type_inclusive[y])), reverse = True) rows = [[k for k in d[:hsize]]] rows += [[len(by_type_inclusive[k]) for k in rows[0]]] printrows(padrows(rows)) print '--------------------' print (str(len(by_subtype_inclusive)) + ' unique subtypes, ' + str(len(by_subtype)) + ' combinations') print '-- Popular subtypes: --' d = sorted(by_subtype_inclusive, lambda x,y: cmp(len(by_subtype_inclusive[x]), len(by_subtype_inclusive[y])), reverse = True) rows = [] for k in d[0:vsize]: rows += [[k, len(by_subtype_inclusive[k])]] printrows(padrows(rows)) print '-- Top combinations: --' d = sorted(by_subtype, lambda x,y: cmp(len(by_subtype[x]), len(by_subtype[y])), reverse = True) rows = [] for k in d[0:vsize]: rows += [[k, len(by_subtype[k])]] printrows(padrows(rows)) print '--------------------' print (str(len(by_supertype_inclusive)) + ' unique supertypes, ' + str(len(by_supertype)) + ' combinations') print 'Breakdown by supertype:' d = sorted(by_supertype_inclusive, lambda x,y: cmp(len(by_supertype_inclusive[x]),len(by_supertype_inclusive[y])), reverse = True) rows = [[k for k in d[:hsize]]] rows += [[len(by_supertype_inclusive[k]) for k in rows[0]]] printrows(padrows(rows)) print '--------------------' print str(len(by_cmc)) + ' different CMCs, ' + str(len(by_cost)) + ' unique mana costs' print 'Breakdown by CMC:' d = sorted(by_cmc, reverse = False) rows = [[k for k in d[:cmcsize]]] rows += [[len(by_cmc[k]) for k in rows[0]]] printrows(padrows(rows)) print '-- Popular mana costs: --' d = sorted(by_cost, lambda x,y: cmp(len(by_cost[x]), len(by_cost[y])), reverse = True) rows = [] for k in d[0:vsize]: rows += [[utils.from_mana(k), len(by_cost[k])]] printrows(padrows(rows)) print '--------------------' print str(len(by_pt)) + ' unique p/t combinations' print ('Largest power: ' + str(max(map(len, by_power)) - 1) + ', largest toughness: ' + str(max(map(len, by_toughness)) - 1)) print '-- Popular p/t values: --' d = sorted(by_pt, lambda x,y: cmp(len(by_pt[x]), len(by_pt[y])), reverse = True) rows = [] for k in d[0:vsize]: rows += [[utils.from_unary(k), len(by_pt[k])]] printrows(padrows(rows)) print '--------------------' print 'Loyalty values:' d = sorted(by_loyalty, lambda x,y: cmp(len(by_loyalty[x]), len(by_loyalty[y])), reverse = True) rows = [] for k in d[0:vsize]: rows += [[utils.from_unary(k), len(by_loyalty[k])]] printrows(padrows(rows)) print '--------------------' print('Card text ranges from ' + str(min(by_textlen)) + ' to ' + str(max(by_textlen)) + ' characters in length') print('Card text ranges from ' + str(min(by_textlines)) + ' to ' + str(max(by_textlines)) + ' lines') print '-- Line counts by frequency: --' d = sorted(by_textlines, lambda x,y: cmp(len(by_textlines[x]), len(by_textlines[y])), reverse = True) rows = [] for k in d[0:vsize]: rows += [[k, len(by_textlines[k])]] printrows(padrows(rows)) print '====================' # describe outliers in the indices def outliers(hsize = 10, vsize = 10, dump_invalid = False): print '********************' print 'Overview of indices:' rows = [['Index Name', 'Keys', 'Total Members']] for index in indices: rows += [[index, len(indices[index]), index_size(indices[index])]] printrows(padrows(rows)) print '********************' if len(by_name) > 0: scardname = sorted(by_name, lambda x,y: cmp(len(x), len(y)), reverse = False)[0] print 'Shortest Cardname: (' + str(len(scardname)) + ')' print ' ' + scardname lcardname = sorted(by_name, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Longest Cardname: (' + str(len(lcardname)) + ')' print ' ' + lcardname d = sorted(by_name, lambda x,y: cmp(len(by_name[x]), len(by_name[y])), reverse = True) rows = [] for k in d[0:vsize]: if len(by_name[k]) > 1: rows += [[k, len(by_name[k])]] if rows == []: print('No duplicated cardnames') else: print '-- Most duplicated names: --' printrows(padrows(rows)) else: print 'No cards indexed by name?' print '--------------------' if len(by_type) > 0: ltypes = sorted(by_type, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Longest card type: (' + str(len(ltypes)) + ')' print ' ' + ltypes else: print 'No cards indexed by type?' if len(by_subtype) > 0: lsubtypes = sorted(by_subtype, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Longest subtype: (' + str(len(lsubtypes)) + ')' print ' ' + lsubtypes else: print 'No cards indexed by subtype?' if len(by_supertype) > 0: lsupertypes = sorted(by_supertype, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Longest supertype: (' + str(len(lsupertypes)) + ')' print ' ' + lsupertypes else: print 'No cards indexed by supertype?' print '--------------------' if len(by_cost) > 0: lcost = sorted(by_cost, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Longest mana cost: (' + str(len(lcost)) + ')' print ' ' + utils.from_mana(lcost) print '\n' + by_cost[lcost][0].reencode() + '\n' else: print 'No cards indexed by cost?' if len(by_cmc) > 0: lcmc = sorted(by_cmc, reverse = True)[0] print 'Largest cmc: (' + str(lcmc) + ')' print ' ' + str(by_cmc[lcmc][0].cost) print '\n' + by_cmc[lcmc][0].reencode() else: print 'No cards indexed by cmc?' print '--------------------' if len(by_power) > 0: lpower = sorted(by_power, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Largest creature power: ' + utils.from_unary(lpower) print '\n' + by_power[lpower][0].reencode() + '\n' else: print 'No cards indexed by power?' if len(by_toughness) > 0: ltoughness = sorted(by_toughness, lambda x,y: cmp(len(x), len(y)), reverse = True)[0] print 'Largest creature toughness: ' + utils.from_unary(ltoughness) print '\n' + by_toughness[ltoughness][0].reencode() else: print 'No cards indexed by toughness?' print '--------------------' if len(by_textlines) > 0: llines = sorted(by_textlines, reverse = True)[0] print 'Most lines of text in a card: ' + str(llines) print '\n' + by_textlines[llines][0].reencode() + '\n' else: print 'No cards indexed by line count?' if len(by_textlen) > 0: ltext = sorted(by_textlen, reverse = True)[0] print 'Most chars in a card text: ' + str(ltext) print '\n' + by_textlen[ltext][0].reencode() else: print 'No cards indexed by char count?' print '--------------------' print 'There were ' + str(len(invalid_cards)) + ' invalid cards.' if dump_invalid: for card in invalid_cards: print '\n' + card.raw elif len(invalid_cards) > 0: print 'Not summarizing.' print '--------------------' print 'There were ' + str(len(unparsed_cards)) + ' unparsed cards.' if dump_invalid: for card in unparsed_cards: print '\n' + card.raw elif len(unparsed_cards) > 0: print 'Not summarizing.' print '====================' def main(fname, oname = None, verbose = False): if verbose: print 'Opening encoded card file: ' + fname with open(fname, 'rt') as f: text = f.read() cardtexts = text.split(utils.cardsep) analyze(cardtexts) summarize() outliers(dump_invalid = False) if __name__ == '__main__': import sys if len(sys.argv) == 2: main(sys.argv[1]) elif len(sys.argv) == 3: main(sys.argv[1], oname = sys.argv[2]) else: print 'Usage: ' + sys.argv[0] + ' ' + ' [output filename]' exit(1)