From 40fc6958268b3f08e72543dada4e2eb22a8a5e66 Mon Sep 17 00:00:00 2001 From: Bill Zorn Date: Tue, 14 Jul 2015 23:27:21 -0700 Subject: [PATCH] Card now has flexible input from encoded formats. Data mining code updated. Unfortunately, python does not do import in a nice way without using the full bore module system, I'll deal with that another day. --- datamine.py | 395 ------------------------------------------------- lib/cardlib.py | 93 ++++++++++-- lib/datalib.py | 380 +++++++++++++++++++++++++++++++++++++++++++++++ summarize.py | 33 +++++ 4 files changed, 492 insertions(+), 409 deletions(-) delete mode 100644 datamine.py create mode 100644 lib/datalib.py create mode 100644 summarize.py diff --git a/datamine.py b/datamine.py deleted file mode 100644 index c853166..0000000 --- a/datamine.py +++ /dev/null @@ -1,395 +0,0 @@ -import re -import codecs -import sys -import random - -import lib.utils as utils -from lib.card import Card -from lib.mana import Manacost - -# Format a list of rows of data into nice columns. -# Note that it's the columns that are nice, not this code. -def padrows(l): - # get length for each field - lens = [] - for ll in l: - for i, field in enumerate(ll): - if i < len(lens): - lens[i] = max(len(str(field)), lens[i]) - else: - lens += [len(str(field))] - # now pad out to that length - padded = [] - for ll in l: - padded += [''] - for i, field in enumerate(ll): - s = str(field) - pad = ' ' * (lens[i] - len(s)) - padded[-1] += (s + pad + ' ') - return padded -def printrows(l): - for row in l: - print row - -# global card pools -unparsed_cards = [] -invalid_cards = [] -cards = [] -allcards = [] - -# global indices -by_name = {} -by_type = {} -by_type_inclusive = {} -by_supertype = {} -by_supertype_inclusive = {} -by_subtype = {} -by_subtype_inclusive = {} -by_color = {} -by_color_inclusive = {} -by_color_count = {} -by_cmc = {} -by_cost = {} -by_power = {} -by_toughness = {} -by_pt = {} -by_loyalty = {} -by_textlines = {} -by_textlen = {} - -indices = { - 'by_name' : by_name, - 'by_type' : by_type, - 'by_type_inclusive' : by_type_inclusive, - 'by_supertype' : by_supertype, - 'by_supertype_inclusive' : by_supertype_inclusive, - 'by_subtype' : by_subtype, - 'by_subtype_inclusive' : by_subtype_inclusive, - 'by_color' : by_color, - 'by_color_inclusive' : by_color_inclusive, - 'by_color_count' : by_color_count, - 'by_cmc' : by_cmc, - 'by_cost' : by_cost, - 'by_power' : by_power, - 'by_toughness' : by_toughness, - 'by_pt' : by_pt, - 'by_loyalty' : by_loyalty, - 'by_textlines' : by_textlines, - 'by_textlen' : by_textlen, -} - -def index_size(d): - return sum(map(lambda k: len(d[k]), d)) - -def inc(d, k, obj): - if k or k == 0: - if k in d: - d[k] += obj - else: - d[k] = obj - -# build the global indices -def analyze(cardtexts): - global unparsed_cards, invalid_cards, cards, allcards - for cardtext in cardtexts: - # the empty card is not interesting - if not cardtext: - continue - card = Card(cardtext) - if card._valid: - cards += [card] - allcards += [card] - elif card._parsed: - invalid_cards += [card] - allcards += [card] - else: - unparsed_cards += [card] - - if card._parsed: - inc(by_name, card.name, [card]) - - inc(by_type, ' '.join(card.types), [card]) - for t in card.types: - inc(by_type_inclusive, t, [card]) - inc(by_supertype, ' '.join(card.supertypes), [card]) - for t in card.supertypes: - inc(by_supertype_inclusive, t, [card]) - inc(by_subtype, ' '.join(card.subtypes), [card]) - for t in card.subtypes: - inc(by_subtype_inclusive, t, [card]) - - if card.cost.colors: - inc(by_color, card.cost.colors, [card]) - for c in card.cost.colors: - inc(by_color_inclusive, c, [card]) - inc(by_color_count, len(card.cost.colors), [card]) - else: - # colorless, still want to include in these tables - inc(by_color, 'A', [card]) - inc(by_color_inclusive, 'A', [card]) - inc(by_color_count, 0, [card]) - - inc(by_cmc, card.cost.cmc, [card]) - inc(by_cost, card.cost.reencode() if card.cost.reencode() else 'none', [card]) - - inc(by_power, card.power, [card]) - inc(by_toughness, card.toughness, [card]) - inc(by_pt, card.pt, [card]) - - inc(by_loyalty, card.loyalty, [card]) - - inc(by_textlines, len(card.text_lines), [card]) - inc(by_textlen, len(card.text), [card]) - -# summarize the indices -# Yes, this printing code is pretty terrible. -def summarize(hsize = 10, vsize = 10, cmcsize = 20): - print '====================' - print str(len(cards)) + ' valid cards, ' + str(len(invalid_cards)) + ' invalid cards.' - print str(len(allcards)) + ' cards parsed, ' + str(len(unparsed_cards)) + ' failed to parse' - print '--------------------' - print str(len(by_name)) + ' unique card names' - print '--------------------' - print (str(len(by_color_inclusive)) + ' represented colors (including colorless as \'A\'), ' - + str(len(by_color)) + ' combinations') - print 'Breakdown by color:' - rows = [by_color_inclusive.keys()] - rows += [[len(by_color_inclusive[k]) for k in rows[0]]] - printrows(padrows(rows)) - print 'Breakdown by number of colors:' - rows = [by_color_count.keys()] - rows += [[len(by_color_count[k]) for k in rows[0]]] - printrows(padrows(rows)) - print '--------------------' - print str(len(by_type_inclusive)) + ' unique card types, ' + str(len(by_type)) + ' combinations' - print 'Breakdown by type:' - d = sorted(by_type_inclusive, - lambda x,y: cmp(len(by_type_inclusive[x]), len(by_type_inclusive[y])), - reverse = True) - rows = [[k for k in d[:hsize]]] - rows += [[len(by_type_inclusive[k]) for k in rows[0]]] - printrows(padrows(rows)) - print '--------------------' - print (str(len(by_subtype_inclusive)) + ' unique subtypes, ' - + str(len(by_subtype)) + ' combinations') - print '-- Popular subtypes: --' - d = sorted(by_subtype_inclusive, - lambda x,y: cmp(len(by_subtype_inclusive[x]), len(by_subtype_inclusive[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - rows += [[k, len(by_subtype_inclusive[k])]] - printrows(padrows(rows)) - print '-- Top combinations: --' - d = sorted(by_subtype, - lambda x,y: cmp(len(by_subtype[x]), len(by_subtype[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - rows += [[k, len(by_subtype[k])]] - printrows(padrows(rows)) - print '--------------------' - print (str(len(by_supertype_inclusive)) + ' unique supertypes, ' - + str(len(by_supertype)) + ' combinations') - print 'Breakdown by supertype:' - d = sorted(by_supertype_inclusive, - lambda x,y: cmp(len(by_supertype_inclusive[x]),len(by_supertype_inclusive[y])), - reverse = True) - rows = [[k for k in d[:hsize]]] - rows += [[len(by_supertype_inclusive[k]) for k in rows[0]]] - printrows(padrows(rows)) - print '--------------------' - print str(len(by_cmc)) + ' different CMCs, ' + str(len(by_cost)) + ' unique mana costs' - print 'Breakdown by CMC:' - d = sorted(by_cmc, reverse = False) - rows = [[k for k in d[:cmcsize]]] - rows += [[len(by_cmc[k]) for k in rows[0]]] - printrows(padrows(rows)) - print '-- Popular mana costs: --' - d = sorted(by_cost, - lambda x,y: cmp(len(by_cost[x]), len(by_cost[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - rows += [[utils.from_mana(k), len(by_cost[k])]] - printrows(padrows(rows)) - print '--------------------' - print str(len(by_pt)) + ' unique p/t combinations' - print ('Largest power: ' + str(max(map(len, by_power)) - 1) + - ', largest toughness: ' + str(max(map(len, by_toughness)) - 1)) - print '-- Popular p/t values: --' - d = sorted(by_pt, - lambda x,y: cmp(len(by_pt[x]), len(by_pt[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - rows += [[utils.from_unary(k), len(by_pt[k])]] - printrows(padrows(rows)) - print '--------------------' - print 'Loyalty values:' - d = sorted(by_loyalty, - lambda x,y: cmp(len(by_loyalty[x]), len(by_loyalty[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - rows += [[utils.from_unary(k), len(by_loyalty[k])]] - printrows(padrows(rows)) - print '--------------------' - print('Card text ranges from ' + str(min(by_textlen)) + ' to ' - + str(max(by_textlen)) + ' characters in length') - print('Card text ranges from ' + str(min(by_textlines)) + ' to ' - + str(max(by_textlines)) + ' lines') - print '-- Line counts by frequency: --' - d = sorted(by_textlines, - lambda x,y: cmp(len(by_textlines[x]), len(by_textlines[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - rows += [[k, len(by_textlines[k])]] - printrows(padrows(rows)) - print '====================' - - -# describe outliers in the indices -def outliers(hsize = 10, vsize = 10, dump_invalid = False): - print '********************' - print 'Overview of indices:' - rows = [['Index Name', 'Keys', 'Total Members']] - for index in indices: - rows += [[index, len(indices[index]), index_size(indices[index])]] - printrows(padrows(rows)) - print '********************' - if len(by_name) > 0: - scardname = sorted(by_name, - lambda x,y: cmp(len(x), len(y)), - reverse = False)[0] - print 'Shortest Cardname: (' + str(len(scardname)) + ')' - print ' ' + scardname - lcardname = sorted(by_name, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Longest Cardname: (' + str(len(lcardname)) + ')' - print ' ' + lcardname - d = sorted(by_name, - lambda x,y: cmp(len(by_name[x]), len(by_name[y])), - reverse = True) - rows = [] - for k in d[0:vsize]: - if len(by_name[k]) > 1: - rows += [[k, len(by_name[k])]] - if rows == []: - print('No duplicated cardnames') - else: - print '-- Most duplicated names: --' - printrows(padrows(rows)) - else: - print 'No cards indexed by name?' - print '--------------------' - if len(by_type) > 0: - ltypes = sorted(by_type, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Longest card type: (' + str(len(ltypes)) + ')' - print ' ' + ltypes - else: - print 'No cards indexed by type?' - if len(by_subtype) > 0: - lsubtypes = sorted(by_subtype, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Longest subtype: (' + str(len(lsubtypes)) + ')' - print ' ' + lsubtypes - else: - print 'No cards indexed by subtype?' - if len(by_supertype) > 0: - lsupertypes = sorted(by_supertype, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Longest supertype: (' + str(len(lsupertypes)) + ')' - print ' ' + lsupertypes - else: - print 'No cards indexed by supertype?' - print '--------------------' - if len(by_cost) > 0: - lcost = sorted(by_cost, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Longest mana cost: (' + str(len(lcost)) + ')' - print ' ' + utils.from_mana(lcost) - print '\n' + by_cost[lcost][0].reencode() + '\n' - else: - print 'No cards indexed by cost?' - if len(by_cmc) > 0: - lcmc = sorted(by_cmc, reverse = True)[0] - print 'Largest cmc: (' + str(lcmc) + ')' - print ' ' + str(by_cmc[lcmc][0].cost) - print '\n' + by_cmc[lcmc][0].reencode() - else: - print 'No cards indexed by cmc?' - print '--------------------' - if len(by_power) > 0: - lpower = sorted(by_power, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Largest creature power: ' + utils.from_unary(lpower) - print '\n' + by_power[lpower][0].reencode() + '\n' - else: - print 'No cards indexed by power?' - if len(by_toughness) > 0: - ltoughness = sorted(by_toughness, - lambda x,y: cmp(len(x), len(y)), - reverse = True)[0] - print 'Largest creature toughness: ' + utils.from_unary(ltoughness) - print '\n' + by_toughness[ltoughness][0].reencode() - else: - print 'No cards indexed by toughness?' - print '--------------------' - if len(by_textlines) > 0: - llines = sorted(by_textlines, reverse = True)[0] - print 'Most lines of text in a card: ' + str(llines) - print '\n' + by_textlines[llines][0].reencode() + '\n' - else: - print 'No cards indexed by line count?' - if len(by_textlen) > 0: - ltext = sorted(by_textlen, reverse = True)[0] - print 'Most chars in a card text: ' + str(ltext) - print '\n' + by_textlen[ltext][0].reencode() - else: - print 'No cards indexed by char count?' - print '--------------------' - print 'There were ' + str(len(invalid_cards)) + ' invalid cards.' - if dump_invalid: - for card in invalid_cards: - print '\n' + card.raw - elif len(invalid_cards) > 0: - print 'Not summarizing.' - print '--------------------' - print 'There were ' + str(len(unparsed_cards)) + ' unparsed cards.' - if dump_invalid: - for card in unparsed_cards: - print '\n' + card.raw - elif len(unparsed_cards) > 0: - print 'Not summarizing.' - print '====================' - -def main(fname, oname = None, verbose = False): - if verbose: - print 'Opening encoded card file: ' + fname - - with open(fname, 'rt') as f: - text = f.read() - - cardtexts = text.split(utils.cardsep) - analyze(cardtexts) - summarize() - outliers(dump_invalid = False) - -if __name__ == '__main__': - import sys - if len(sys.argv) == 2: - main(sys.argv[1]) - elif len(sys.argv) == 3: - main(sys.argv[1], oname = sys.argv[2]) - else: - print 'Usage: ' + sys.argv[0] + ' ' + ' [output filename]' - exit(1) diff --git a/lib/cardlib.py b/lib/cardlib.py index 12e69e2..d6f9b69 100644 --- a/lib/cardlib.py +++ b/lib/cardlib.py @@ -202,9 +202,74 @@ def fields_from_json(src_json): # we don't need to worry about bsides because we handle that in the constructor return parsed, valid and fields_check_valid(fields), fields -def fields_from_format(src_text, fmt_ordered, fmt_labeled, fieldsep): - pass +def fields_from_format(src_text, fmt_ordered, fmt_labeled, fieldsep): + parsed = True + valid = True + fields = {} + + if fmt_labeled: + labels = {fmt_labeled[k] : k for k in fmt_labeled} + field_label_regex = '[' + ''.join(labels.keys()) + ']' + def addf(fields, fkey, fval): + if fkey in fields: + fields[fkey] += [fval] + else: + fields[fkey] = [fval] + + textfields = src_text.split(fieldsep) + idx = 0 + true_idx = 0 + for textfield in textfields: + # ignore leading or trailing empty fields due to seps + if textfield == '': + if true_idx == 0 or true_idx == len(textfields) - 1: + true_idx += 1 + continue + # count the field index for other empty fields but don't add them + else: + idx += 1 + true_idx += 1 + continue + + lab = None + if fmt_labeled: + labs = re.findall(field_label_regex, textfield) + # use the first label if we saw any at all + if len(labs) > 0: + lab = labs[0] + # try to use the field label if we got one + if lab and lab in labels: + fname = labels[lab] + # fall back to the field order specified + elif idx < len(fmt_ordered): + fname = fmt_ordered[idx] + # we don't know what to do with this field: call it other + else: + fname = field_other + parsed = False + valid = False + + # specialized handling + if fname in [field_cost]: + fval = Manacost(textfield) + parsed = parsed and fval.parsed + valid = valid and fval.valid + addf(fields, fname, (idx, fval)) + elif fname in [field_text]: + fval = Manatext(textfield) + valid = valid and fval.valid + addf(fields, fname, (idx, fval)) + elif fname in [field_supertypes, field_types, field_subtypes]: + addf(fields, fname, (idx, textfield.split())) + else: + addf(fields, fname, (idx, textfield)) + + idx += 1 + true_idx += 1 + + # again, bsides are handled by the constructor + return parsed, valid and fields_check_valid(fields), fields # Here's the actual Card class that other files should use. @@ -272,8 +337,8 @@ class Card: if self.fields: for field in self.fields: # look for a specialized set function - if '_set_' + field in self.__dict__: - self.__dict__['_set_' + field](self.fields[field]) + if hasattr(self, '_set_' + field): + getattr(self, '_set_' + field)(self.fields[field]) # otherwise use the default one elif field in self.__dict__: self.set_field_default(field, self.fields[field]) @@ -330,16 +395,16 @@ class Card: break # only use the first one... def _set_text(self, values): - mtext = '' for idx, value in values: mtext = value - self.__dict__[field_text] = mtext - fulltext = mtext.encode() - if fulltext: - self.__dict__[field_text + '_lines'] = map(Manatext, fulltext.split(utils.newline)) - self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex, - ' ', - fulltext).split() + self.__dict__[field_text] = mtext + fulltext = mtext.encode() + if fulltext: + self.__dict__[field_text + '_lines'] = map(Manatext, fulltext.split(utils.newline)) + self.__dict__[field_text + '_words'] = re.sub(utils.unletters_regex, + ' ', + fulltext).split() + break # only use the first one... def _set_other(self, values): # just record these, we could do somthing unset valid if we really wanted @@ -358,8 +423,8 @@ class Card: for field in fmt_ordered: if field in self.__dict__: - if self.__dict__[field]: - outfield = self.__dict__[field] + outfield = self.__dict__[field] + if outfield: # specialized field handling for the ones that aren't strings (sigh) if isinstance(outfield, list): outfield_str = ' '.join(outfield) diff --git a/lib/datalib.py b/lib/datalib.py new file mode 100644 index 0000000..a8e3edf --- /dev/null +++ b/lib/datalib.py @@ -0,0 +1,380 @@ +import re +import sys + +import utils +from cardlib import Card + +# Format a list of rows of data into nice columns. +# Note that it's the columns that are nice, not this code. +def padrows(l): + # get length for each field + lens = [] + for ll in l: + for i, field in enumerate(ll): + if i < len(lens): + lens[i] = max(len(str(field)), lens[i]) + else: + lens += [len(str(field))] + # now pad out to that length + padded = [] + for ll in l: + padded += [''] + for i, field in enumerate(ll): + s = str(field) + pad = ' ' * (lens[i] - len(s)) + padded[-1] += (s + pad + ' ') + return padded +def printrows(l): + for row in l: + print row + +# index management helpers +def index_size(d): + return sum(map(lambda k: len(d[k]), d)) + +def inc(d, k, obj): + if k or k == 0: + if k in d: + d[k] += obj + else: + d[k] = obj + +# thanks gleemax +def plimit(s, mlen = 1000): + if len(s) > mlen: + return s[:1000] + '[...]' + else: + return s + +class Datamine: + # build the global indices + def __init__(self, card_srcs): + # global card pools + self.unparsed_cards = [] + self.invalid_cards = [] + self.cards = [] + self.allcards = [] + + # global indices + self.by_name = {} + self.by_type = {} + self.by_type_inclusive = {} + self.by_supertype = {} + self.by_supertype_inclusive = {} + self.by_subtype = {} + self.by_subtype_inclusive = {} + self.by_color = {} + self.by_color_inclusive = {} + self.by_color_count = {} + self.by_cmc = {} + self.by_cost = {} + self.by_power = {} + self.by_toughness = {} + self.by_pt = {} + self.by_loyalty = {} + self.by_textlines = {} + self.by_textlen = {} + + self.indices = { + 'by_name' : self.by_name, + 'by_type' : self.by_type, + 'by_type_inclusive' : self.by_type_inclusive, + 'by_supertype' : self.by_supertype, + 'by_supertype_inclusive' : self.by_supertype_inclusive, + 'by_subtype' : self.by_subtype, + 'by_subtype_inclusive' : self.by_subtype_inclusive, + 'by_color' : self.by_color, + 'by_color_inclusive' : self.by_color_inclusive, + 'by_color_count' : self.by_color_count, + 'by_cmc' : self.by_cmc, + 'by_cost' : self.by_cost, + 'by_power' : self.by_power, + 'by_toughness' : self.by_toughness, + 'by_pt' : self.by_pt, + 'by_loyalty' : self.by_loyalty, + 'by_textlines' : self.by_textlines, + 'by_textlen' : self.by_textlen, + } + + for card_src in card_srcs: + # the empty card is not interesting + if not card_src: + continue + card = Card(card_src) + if card.valid: + self.cards += [card] + self.allcards += [card] + elif card.parsed: + self.invalid_cards += [card] + self.allcards += [card] + else: + self.unparsed_cards += [card] + + if card.parsed: + inc(self.by_name, card.name, [card]) + + inc(self.by_type, ' '.join(card.types), [card]) + for t in card.types: + inc(self.by_type_inclusive, t, [card]) + inc(self.by_supertype, ' '.join(card.supertypes), [card]) + for t in card.supertypes: + inc(self.by_supertype_inclusive, t, [card]) + inc(self.by_subtype, ' '.join(card.subtypes), [card]) + for t in card.subtypes: + inc(self.by_subtype_inclusive, t, [card]) + + if card.cost.colors: + inc(self.by_color, card.cost.colors, [card]) + for c in card.cost.colors: + inc(self.by_color_inclusive, c, [card]) + inc(self.by_color_count, len(card.cost.colors), [card]) + else: + # colorless, still want to include in these tables + inc(self.by_color, 'A', [card]) + inc(self.by_color_inclusive, 'A', [card]) + inc(self.by_color_count, 0, [card]) + + inc(self.by_cmc, card.cost.cmc, [card]) + inc(self.by_cost, card.cost.encode() if card.cost.encode() else 'none', [card]) + + inc(self.by_power, card.pt_p, [card]) + inc(self.by_toughness, card.pt_t, [card]) + inc(self.by_pt, card.pt, [card]) + + inc(self.by_loyalty, card.loyalty, [card]) + + inc(self.by_textlines, len(card.text_lines), [card]) + inc(self.by_textlen, len(card.text.encode()), [card]) + + # summarize the indices + # Yes, this printing code is pretty terrible. + def summarize(self, hsize = 10, vsize = 10, cmcsize = 20): + print '====================' + print str(len(self.cards)) + ' valid cards, ' + str(len(self.invalid_cards)) + ' invalid cards.' + print str(len(self.allcards)) + ' cards parsed, ' + str(len(self.unparsed_cards)) + ' failed to parse' + print '--------------------' + print str(len(self.by_name)) + ' unique card names' + print '--------------------' + print (str(len(self.by_color_inclusive)) + ' represented colors (including colorless as \'A\'), ' + + str(len(self.by_color)) + ' combinations') + print 'Breakdown by color:' + rows = [self.by_color_inclusive.keys()] + rows += [[len(self.by_color_inclusive[k]) for k in rows[0]]] + printrows(padrows(rows)) + print 'Breakdown by number of colors:' + rows = [self.by_color_count.keys()] + rows += [[len(self.by_color_count[k]) for k in rows[0]]] + printrows(padrows(rows)) + print '--------------------' + print str(len(self.by_type_inclusive)) + ' unique card types, ' + str(len(self.by_type)) + ' combinations' + print 'Breakdown by type:' + d = sorted(self.by_type_inclusive, + lambda x,y: cmp(len(self.by_type_inclusive[x]), len(self.by_type_inclusive[y])), + reverse = True) + rows = [[k for k in d[:hsize]]] + rows += [[len(self.by_type_inclusive[k]) for k in rows[0]]] + printrows(padrows(rows)) + print '--------------------' + print (str(len(self.by_subtype_inclusive)) + ' unique subtypes, ' + + str(len(self.by_subtype)) + ' combinations') + print '-- Popular subtypes: --' + d = sorted(self.by_subtype_inclusive, + lambda x,y: cmp(len(self.by_subtype_inclusive[x]), len(self.by_subtype_inclusive[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + rows += [[k, len(self.by_subtype_inclusive[k])]] + printrows(padrows(rows)) + print '-- Top combinations: --' + d = sorted(self.by_subtype, + lambda x,y: cmp(len(self.by_subtype[x]), len(self.by_subtype[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + rows += [[k, len(self.by_subtype[k])]] + printrows(padrows(rows)) + print '--------------------' + print (str(len(self.by_supertype_inclusive)) + ' unique supertypes, ' + + str(len(self.by_supertype)) + ' combinations') + print 'Breakdown by supertype:' + d = sorted(self.by_supertype_inclusive, + lambda x,y: cmp(len(self.by_supertype_inclusive[x]),len(self.by_supertype_inclusive[y])), + reverse = True) + rows = [[k for k in d[:hsize]]] + rows += [[len(self.by_supertype_inclusive[k]) for k in rows[0]]] + printrows(padrows(rows)) + print '--------------------' + print str(len(self.by_cmc)) + ' different CMCs, ' + str(len(self.by_cost)) + ' unique mana costs' + print 'Breakdown by CMC:' + d = sorted(self.by_cmc, reverse = False) + rows = [[k for k in d[:cmcsize]]] + rows += [[len(self.by_cmc[k]) for k in rows[0]]] + printrows(padrows(rows)) + print '-- Popular mana costs: --' + d = sorted(self.by_cost, + lambda x,y: cmp(len(self.by_cost[x]), len(self.by_cost[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + rows += [[utils.from_mana(k), len(self.by_cost[k])]] + printrows(padrows(rows)) + print '--------------------' + print str(len(self.by_pt)) + ' unique p/t combinations' + if len(self.by_power) > 0 and len(self.by_toughness) > 0: + print ('Largest power: ' + str(max(map(len, self.by_power)) - 1) + + ', largest toughness: ' + str(max(map(len, self.by_toughness)) - 1)) + print '-- Popular p/t values: --' + d = sorted(self.by_pt, + lambda x,y: cmp(len(self.by_pt[x]), len(self.by_pt[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + rows += [[utils.from_unary(k), len(self.by_pt[k])]] + printrows(padrows(rows)) + print '--------------------' + print 'Loyalty values:' + d = sorted(self.by_loyalty, + lambda x,y: cmp(len(self.by_loyalty[x]), len(self.by_loyalty[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + rows += [[utils.from_unary(k), len(self.by_loyalty[k])]] + printrows(padrows(rows)) + print '--------------------' + if len(self.by_textlen) > 0 and len(self.by_textlines) > 0: + print('Card text ranges from ' + str(min(self.by_textlen)) + ' to ' + + str(max(self.by_textlen)) + ' characters in length') + print('Card text ranges from ' + str(min(self.by_textlines)) + ' to ' + + str(max(self.by_textlines)) + ' lines') + print '-- Line counts by frequency: --' + d = sorted(self.by_textlines, + lambda x,y: cmp(len(self.by_textlines[x]), len(self.by_textlines[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + rows += [[k, len(self.by_textlines[k])]] + printrows(padrows(rows)) + print '====================' + + + # describe outliers in the indices + def outliers(self, hsize = 10, vsize = 10, dump_invalid = False): + print '********************' + print 'Overview of indices:' + rows = [['Index Name', 'Keys', 'Total Members']] + for index in self.indices: + rows += [[index, len(self.indices[index]), index_size(self.indices[index])]] + printrows(padrows(rows)) + print '********************' + if len(self.by_name) > 0: + scardname = sorted(self.by_name, + lambda x,y: cmp(len(x), len(y)), + reverse = False)[0] + print 'Shortest Cardname: (' + str(len(scardname)) + ')' + print ' ' + scardname + lcardname = sorted(self.by_name, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Longest Cardname: (' + str(len(lcardname)) + ')' + print ' ' + lcardname + d = sorted(self.by_name, + lambda x,y: cmp(len(self.by_name[x]), len(self.by_name[y])), + reverse = True) + rows = [] + for k in d[0:vsize]: + if len(self.by_name[k]) > 1: + rows += [[k, len(self.by_name[k])]] + if rows == []: + print('No duplicated cardnames') + else: + print '-- Most duplicated names: --' + printrows(padrows(rows)) + else: + print 'No cards indexed by name?' + print '--------------------' + if len(self.by_type) > 0: + ltypes = sorted(self.by_type, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Longest card type: (' + str(len(ltypes)) + ')' + print ' ' + ltypes + else: + print 'No cards indexed by type?' + if len(self.by_subtype) > 0: + lsubtypes = sorted(self.by_subtype, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Longest subtype: (' + str(len(lsubtypes)) + ')' + print ' ' + lsubtypes + else: + print 'No cards indexed by subtype?' + if len(self.by_supertype) > 0: + lsupertypes = sorted(self.by_supertype, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Longest supertype: (' + str(len(lsupertypes)) + ')' + print ' ' + lsupertypes + else: + print 'No cards indexed by supertype?' + print '--------------------' + if len(self.by_cost) > 0: + lcost = sorted(self.by_cost, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Longest mana cost: (' + str(len(lcost)) + ')' + print ' ' + utils.from_mana(lcost) + print '\n' + plimit(self.by_cost[lcost][0].encode()) + '\n' + else: + print 'No cards indexed by cost?' + if len(self.by_cmc) > 0: + lcmc = sorted(self.by_cmc, reverse = True)[0] + print 'Largest cmc: (' + str(lcmc) + ')' + print ' ' + str(self.by_cmc[lcmc][0].cost) + print '\n' + plimit(self.by_cmc[lcmc][0].encode()) + else: + print 'No cards indexed by cmc?' + print '--------------------' + if len(self.by_power) > 0: + lpower = sorted(self.by_power, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Largest creature power: ' + utils.from_unary(lpower) + print '\n' + plimit(self.by_power[lpower][0].encode()) + '\n' + else: + print 'No cards indexed by power?' + if len(self.by_toughness) > 0: + ltoughness = sorted(self.by_toughness, + lambda x,y: cmp(len(x), len(y)), + reverse = True)[0] + print 'Largest creature toughness: ' + utils.from_unary(ltoughness) + print '\n' + plimit(self.by_toughness[ltoughness][0].encode()) + else: + print 'No cards indexed by toughness?' + print '--------------------' + if len(self.by_textlines) > 0: + llines = sorted(self.by_textlines, reverse = True)[0] + print 'Most lines of text in a card: ' + str(llines) + print '\n' + plimit(self.by_textlines[llines][0].encode()) + '\n' + else: + print 'No cards indexed by line count?' + if len(self.by_textlen) > 0: + ltext = sorted(self.by_textlen, reverse = True)[0] + print 'Most chars in a card text: ' + str(ltext) + print '\n' + plimit(self.by_textlen[ltext][0].encode()) + else: + print 'No cards indexed by char count?' + print '--------------------' + print 'There were ' + str(len(self.invalid_cards)) + ' invalid cards.' + if dump_invalid: + for card in self.invalid_cards: + print '\n' + repr(card.fields) + elif len(self.invalid_cards) > 0: + print 'Not summarizing.' + print '--------------------' + print 'There were ' + str(len(self.unparsed_cards)) + ' unparsed cards.' + if dump_invalid: + for card in self.unparsed_cards: + print '\n' + repr(card.fields) + elif len(self.unparsed_cards) > 0: + print 'Not summarizing.' + print '====================' diff --git a/summarize.py b/summarize.py new file mode 100644 index 0000000..8a81d99 --- /dev/null +++ b/summarize.py @@ -0,0 +1,33 @@ +import sys + +import lib.utils as utils +import lib.jdecode as jdecode +from lib.datalib import Datamine + +def main(fname, verbose = True): + if fname[-5:] == '.json': + if verbose: + print 'This looks like a json file: ' + fname + json_srcs = jdecode.mtg_open_json(fname, verbose) + card_srcs = [] + for json_cardname in json_srcs: + if len(json_srcs[json_cardname]) > 0: + card_srcs += [json_srcs[json_cardname][0]] + else: + if verbose: + print 'Opening encoded card file: ' + fname + with open(fname, 'rt') as f: + text = f.read() + card_srcs = text.split(utils.cardsep) + + mine = Datamine(card_srcs) + mine.summarize() + mine.outliers(dump_invalid = False) + +if __name__ == '__main__': + import sys + if len(sys.argv) == 2: + main(sys.argv[1]) + else: + print 'Usage: ' + sys.argv[0] + ' ' + '' + exit(1)