Datamining code is in a working state, randomize_mana.py is a good demo

of a really specific feature. Other improvements are included as well. Note that changing the keys used during JSON decoding reordered everything in output.txt, but there shouldn't be any other major changes.
2015-07-08 00:22:54 -07:00 · 2015-07-08 00:22:54 -07:00 · cbf8ac34e5
commit cbf8ac34e5
parent 01c78549f5
5 changed files with 28985 additions and 28744 deletions
--- a/datamine.py
+++ b/datamine.py
@ -5,7 +5,8 @@ import random

 import utils

-# format a list of rows of data into nice columns
+# Format a list of rows of data into nice columns.
+# Note that it's the columns that are nice, not this code.
 def padrows(l):
    # get length for each field
    lens = []
@ -28,6 +29,17 @@ def printrows(l):
    for row in l:
        print row

+def randomize_all_mana(text):
+    # Return a copy of text in which every mana cost matched by utils.mana_regex
+    # has been replaced by Manacost(...).reencode(randomize = True).
+    manastrs = re.findall(utils.mana_regex, text)
+    newtext = text
+    # First pass: swap every match for utils.reserved_marker, longest strings first.
+    # NOTE(review): presumably longest-first keeps a short cost from clobbering
+    # part of a longer one -- confirm.
+    for manastr in sorted(manastrs, lambda x,y: cmp(len(x), len(y)), reverse = True):
+        newtext = newtext.replace(manastr, utils.reserved_marker)
+    # Second pass: fill the markers back in one at a time (count of 1), walking the
+    # matches in the order findall returned them.
+    for manastr in manastrs:
+        newtext = newtext.replace(utils.reserved_marker, 
+                                  Manacost(manastr).reencode(randomize = True),
+                                  1)
+    return newtext
+
 # so this stuff still needs to be cleaned up
 punctuation_chars = r'[+\-*",.:;WUBRGPV/XTQ|\\&^\{\}@ \n=~%\[\]]'
 creature_keywords = [
@ -184,13 +196,17 @@ class Manacost:
        self.colors = self.get_colors()

    def __str__(self):
-        return utils.mana_untranslate(''.join(self.sequence))
+        # Wrap the joined sequence in the mana delimiters before passing it to
+        # utils.mana_untranslate.
+        return utils.mana_untranslate(utils.mana_open_delimiter + ''.join(self.sequence)
+                                      + utils.mana_close_delimiter)

    def format(self, for_forum):
-        return utils.mana_untranslate(''.join(self.sequence, for_forum))
+        # Render this cost through utils.mana_untranslate, forwarding for_forum.
+        # for_forum must go to mana_untranslate, not to str.join: join accepts
+        # exactly one iterable and raises TypeError when given a second argument.
+        # NOTE(review): assumes mana_untranslate takes for_forum as its second
+        # positional parameter -- confirm against utils.py.
+        manastr = (utils.mana_open_delimiter + ''.join(self.sequence)
+                   + utils.mana_close_delimiter)
+        return utils.mana_untranslate(manastr, for_forum)

    def reencode(self, randomize = False):
-        if randomize:
+        if self.none:
+            return ''
+        elif randomize:
            # so this won't work very well if mana_unary_marker isn't empty
            return (utils.mana_open_delimiter 
                    + ''.join(random.sample(self.sequence, len(self.sequence)))
@ -353,7 +369,7 @@ class Card:
            utils.fieldsep,
            self.cost.reencode(randomize) if not self.cost.none else '',
            utils.fieldsep,
-            self.text,
+            self.text if not randomize else randomize_all_mana(self.text),
            utils.fieldsep,
            utils.bsidesep + self.bside.reencode(randomize) if self.bside else '',
        ])
@ -374,6 +390,7 @@ by_subtype = {}
 by_subtype_inclusive = {}
 by_color = {}
 by_color_inclusive = {}
+by_color_count = {}
 by_cmc = {}
 by_cost = {}
 by_power = {}
@ -383,8 +400,32 @@ by_loyalty = {}
 by_textlines = {}
 by_textlen = {}

+# Name -> dict lookup table covering every by_* index defined above, so code can
+# iterate over all of the indices at once (outliers() uses it for its overview).
+indices = {
+    'by_name' : by_name,
+    'by_type' : by_type,
+    'by_type_inclusive' : by_type_inclusive,
+    'by_supertype' : by_supertype,
+    'by_supertype_inclusive' : by_supertype_inclusive,
+    'by_subtype' : by_subtype,
+    'by_subtype_inclusive' : by_subtype_inclusive,
+    'by_color' : by_color,
+    'by_color_inclusive' : by_color_inclusive,
+    'by_color_count' : by_color_count,
+    'by_cmc' : by_cmc,
+    'by_cost' : by_cost,
+    'by_power' : by_power,
+    'by_toughness' : by_toughness,
+    'by_pt' : by_pt,
+    'by_loyalty' : by_loyalty,
+    'by_textlines' : by_textlines,
+    'by_textlen' : by_textlen,
+}
+
+def index_size(d):
+    # Total number of members stored in index d, summed over all of its keys.
+    return sum(len(members) for members in d.values())
+
 def inc(d, k, obj):
-    if k:
+    if k or k == 0:
        if k in d:
            d[k] += obj
        else:
@ -424,46 +465,51 @@ def analyze(cardtexts):
                inc(by_color, card.cost.colors, [card])
                for c in card.cost.colors:
                    inc(by_color_inclusive, c, [card])
+                inc(by_color_count, len(card.cost.colors), [card])
            else:
                # colorless, still want to include in these tables
                inc(by_color, 'A', [card])
                inc(by_color_inclusive, 'A', [card])
+                inc(by_color_count, 0, [card])

            inc(by_cmc, card.cost.cmc, [card])
-            inc(by_cost, card.cost.reencode(), [card])
-
+            inc(by_cost, card.cost.reencode() if card.cost.reencode() else 'none', [card])

            inc(by_power, card.power, [card])
            inc(by_toughness, card.toughness, [card])
            inc(by_pt, card.pt, [card])

-
            inc(by_loyalty, card.loyalty, [card])
            
            inc(by_textlines, len(card.text_lines), [card])
            inc(by_textlen, len(card.text), [card])

 # summarize the indices
-def summarize():
+# Yes, this printing code is pretty terrible.
+def summarize(hsize = 10, vsize = 10, cmcsize = 20):
    print '===================='
    print str(len(cards)) + ' valid cards, ' + str(len(invalid_cards)) + ' invalid cards.'
    print str(len(allcards)) + ' cards parsed, ' + str(len(unparsed_cards)) + ' failed to parse'
    print '--------------------'
    print str(len(by_name)) + ' unique card names'
    print '--------------------'
-    print (str(len(by_color)) + ' represented colors (including colorless as \'A\'), ' 
-           + str(len(by_color_inclusive)) + ' combinations')
+    print (str(len(by_color_inclusive)) + ' represented colors (including colorless as \'A\'), ' 
+           + str(len(by_color)) + ' combinations')
    print 'Breakdown by color:'
    rows = [by_color_inclusive.keys()]
    rows += [[len(by_color_inclusive[k]) for k in rows[0]]]
    printrows(padrows(rows))
+    print 'Breakdown by number of colors:'
+    rows = [by_color_count.keys()]
+    rows += [[len(by_color_count[k]) for k in rows[0]]]
+    printrows(padrows(rows))
    print '--------------------'
    print str(len(by_type_inclusive)) + ' unique card types, ' + str(len(by_type)) + ' combinations'
    print 'Breakdown by type:'
    d = sorted(by_type_inclusive, 
-                    lambda x,y: cmp(len(by_type_inclusive[x]), len(by_type_inclusive[y])), 
-                    reverse = True)
-    rows = [[k for k in d[:10]]]
+               lambda x,y: cmp(len(by_type_inclusive[x]), len(by_type_inclusive[y])), 
+               reverse = True)
+    rows = [[k for k in d[:hsize]]]
    rows += [[len(by_type_inclusive[k]) for k in rows[0]]]
    printrows(padrows(rows))
    print '--------------------'
@ -471,18 +517,18 @@ def summarize():
           + str(len(by_subtype)) + ' combinations')
    print '-- Popular subtypes: --'
    d = sorted(by_subtype_inclusive, 
-                    lambda x,y: cmp(len(by_subtype_inclusive[x]), len(by_subtype_inclusive[y])), 
-                    reverse = True)
+               lambda x,y: cmp(len(by_subtype_inclusive[x]), len(by_subtype_inclusive[y])), 
+               reverse = True)
    rows = []
-    for k in d[0:10]:
+    for k in d[0:vsize]:
        rows += [[k, len(by_subtype_inclusive[k])]]
    printrows(padrows(rows))
    print '-- Top combinations: --'
    d = sorted(by_subtype, 
-                    lambda x,y: cmp(len(by_subtype[x]), len(by_subtype[y])), 
-                    reverse = True)
+               lambda x,y: cmp(len(by_subtype[x]), len(by_subtype[y])), 
+               reverse = True)
    rows = []
-    for k in d[0:10]:
+    for k in d[0:vsize]:
        rows += [[k, len(by_subtype[k])]]
    printrows(padrows(rows))
    print '--------------------'
@ -490,17 +536,184 @@ def summarize():
           + str(len(by_supertype)) + ' combinations')
    print 'Breakdown by supertype:'
    d = sorted(by_supertype_inclusive, 
-                    lambda x,y: cmp(len(by_supertype_inclusive[x]),len(by_supertype_inclusive[y])), 
-                    reverse = True)
-    rows = [[k for k in d]]
+               lambda x,y: cmp(len(by_supertype_inclusive[x]),len(by_supertype_inclusive[y])), 
+               reverse = True)
+    rows = [[k for k in d[:hsize]]]
    rows += [[len(by_supertype_inclusive[k]) for k in rows[0]]]
    printrows(padrows(rows))
+    print '--------------------'
+    print str(len(by_cmc)) + ' different CMCs, ' + str(len(by_cost)) + ' unique mana costs'
+    print 'Breakdown by CMC:'
+    d = sorted(by_cmc, reverse = False)
+    rows = [[k for k in d[:cmcsize]]]
+    rows += [[len(by_cmc[k]) for k in rows[0]]]
+    printrows(padrows(rows))
+    print '-- Popular mana costs: --'
+    d = sorted(by_cost, 
+               lambda x,y: cmp(len(by_cost[x]), len(by_cost[y])), 
+               reverse = True)
+    rows = []
+    for k in d[0:vsize]:
+        rows += [[utils.from_mana(k), len(by_cost[k])]]
+    printrows(padrows(rows))
+    print '--------------------'
+    print str(len(by_pt)) + ' unique p/t combinations'
+    print ('Largest power: ' + str(max(map(len, by_power)) - 1) + 
+           ', largest toughness: ' + str(max(map(len, by_toughness)) - 1))
+    print '-- Popular p/t values: --'
+    d = sorted(by_pt, 
+               lambda x,y: cmp(len(by_pt[x]), len(by_pt[y])), 
+               reverse = True)
+    rows = []
+    for k in d[0:vsize]:
+        rows += [[utils.from_unary(k), len(by_pt[k])]]
+    printrows(padrows(rows))
+    print '--------------------'
+    print 'Loyalty values:'
+    d = sorted(by_loyalty, 
+               lambda x,y: cmp(len(by_loyalty[x]), len(by_loyalty[y])), 
+               reverse = True)
+    rows = []
+    for k in d[0:vsize]:
+        rows += [[utils.from_unary(k), len(by_loyalty[k])]]
+    printrows(padrows(rows))
+    print '--------------------'
+    print('Card text ranges from ' + str(min(by_textlen)) + ' to ' 
+          + str(max(by_textlen)) + ' characters in length')
+    print('Card text ranges from ' + str(min(by_textlines)) + ' to '
+          + str(max(by_textlines)) + ' lines')
+    print '-- Line counts by frequency: --'
+    d = sorted(by_textlines, 
+               lambda x,y: cmp(len(by_textlines[x]), len(by_textlines[y])), 
+               reverse = True)
+    rows = []
+    for k in d[0:vsize]:
+        rows += [[k, len(by_textlines[k])]]
+    printrows(padrows(rows))
    print '===================='
-    # TODO: more to come
+

 # describe outliers in the indices
-def outliers():
-    pass
+def outliers(hsize = 10, vsize = 10, dump_invalid = False):
+    # Print a report of extreme entries in the module-level indices: an overview
+    # table of every index, then the shortest/longest/largest keys of selected
+    # indices (with a sample card for several of them), then the invalid and
+    # unparsed card counts.
+    #   hsize        - accepted but not used anywhere in this function
+    #   vsize        - how many of the most-populated names are checked for duplicates
+    #   dump_invalid - when true, print the raw text of every invalid and
+    #                  unparsed card instead of just the counts
+    print '********************'
+    print 'Overview of indices:'
+    rows = [['Index Name', 'Keys', 'Total Members']]
+    for index in indices:
+        rows += [[index, len(indices[index]), index_size(indices[index])]]
+    printrows(padrows(rows))
+    print '********************'
+    if len(by_name) > 0:
+        # sorting by key length and taking element 0 yields the shortest
+        # (reverse = False) or longest (reverse = True) key
+        scardname =  sorted(by_name, 
+                            lambda x,y: cmp(len(x), len(y)), 
+                            reverse = False)[0]
+        print 'Shortest Cardname: (' + str(len(scardname)) + ')'
+        print '  ' + scardname
+        lcardname =  sorted(by_name, 
+                            lambda x,y: cmp(len(x), len(y)), 
+                            reverse = True)[0]
+        print 'Longest Cardname: (' + str(len(lcardname)) + ')'
+        print '  ' + lcardname
+        # names ordered by how many cards share them, most shared first
+        d = sorted(by_name, 
+                   lambda x,y: cmp(len(by_name[x]), len(by_name[y])), 
+                   reverse = True)
+        rows = []
+        for k in d[0:vsize]:
+            if len(by_name[k]) > 1:
+                rows += [[k, len(by_name[k])]]
+        if rows == []:
+            print('No duplicated cardnames')
+        else:
+            print '-- Most duplicated names: --'
+            printrows(padrows(rows))
+    else:
+        print 'No cards indexed by name?'
+    print '--------------------'
+    if len(by_type) > 0:
+        ltypes = sorted(by_type, 
+                        lambda x,y: cmp(len(x), len(y)), 
+                        reverse = True)[0]
+        print 'Longest card type: (' + str(len(ltypes)) + ')'
+        print '  ' + ltypes
+    else:
+        print 'No cards indexed by type?'
+    if len(by_subtype) > 0:
+        lsubtypes = sorted(by_subtype, 
+                           lambda x,y: cmp(len(x), len(y)), 
+                           reverse = True)[0]
+        print 'Longest subtype: (' + str(len(lsubtypes)) + ')'
+        print '  ' + lsubtypes
+    else:
+        print 'No cards indexed by subtype?'
+    if len(by_supertype) > 0:
+        lsupertypes = sorted(by_supertype, 
+                        lambda x,y: cmp(len(x), len(y)), 
+                             reverse = True)[0]
+        print 'Longest supertype: (' + str(len(lsupertypes)) + ')'
+        print '  ' + lsupertypes
+    else:
+        print 'No cards indexed by supertype?'
+    print '--------------------'
+    if len(by_cost) > 0:
+        # "longest" here means the cost key with the most characters
+        lcost = sorted(by_cost, 
+                       lambda x,y: cmp(len(x), len(y)), 
+                       reverse = True)[0]
+        print 'Longest mana cost: (' + str(len(lcost)) + ')'
+        print '  ' + utils.from_mana(lcost)
+        print '\n' + by_cost[lcost][0].reencode() + '\n'
+    else:
+        print 'No cards indexed by cost?'
+    if len(by_cmc) > 0:
+        lcmc = sorted(by_cmc, reverse = True)[0]
+        print 'Largest cmc: (' + str(lcmc) + ')'
+        print '  ' + str(by_cmc[lcmc][0].cost)
+        print '\n' + by_cmc[lcmc][0].reencode()
+    else:
+        print 'No cards indexed by cmc?'
+    print '--------------------'
+    if len(by_power) > 0:
+        # power/toughness keys are ranked by string length, not numeric value
+        # NOTE(review): presumably the keys are unary-encoded, so a longer key
+        # means a larger number -- confirm against utils.from_unary.
+        lpower = sorted(by_power, 
+                        lambda x,y: cmp(len(x), len(y)), 
+                        reverse = True)[0]
+        print 'Largest creature power: ' + utils.from_unary(lpower)
+        print '\n' + by_power[lpower][0].reencode() + '\n'
+    else: 
+        print 'No cards indexed by power?'
+    if len(by_toughness) > 0:
+        ltoughness = sorted(by_toughness, 
+                        lambda x,y: cmp(len(x), len(y)), 
+                        reverse = True)[0]
+        print 'Largest creature toughness: ' + utils.from_unary(ltoughness)
+        print '\n' + by_toughness[ltoughness][0].reencode()
+    else: 
+        print 'No cards indexed by toughness?'
+    print '--------------------'
+    if len(by_textlines) > 0:
+        llines = sorted(by_textlines, reverse = True)[0]
+        print 'Most lines of text in a card: ' + str(llines)
+        print '\n' + by_textlines[llines][0].reencode() + '\n'
+    else: 
+        print 'No cards indexed by line count?'
+    if len(by_textlen) > 0:
+        ltext = sorted(by_textlen, reverse = True)[0]
+        print 'Most chars in a card text: ' + str(ltext)
+        print '\n' + by_textlen[ltext][0].reencode()
+    else: 
+        print 'No cards indexed by char count?'
+    print '--------------------'
+    print 'There were ' + str(len(invalid_cards)) + ' invalid cards.'
+    if dump_invalid:
+        for card in invalid_cards:
+            print '\n' + card.raw
+    elif len(invalid_cards) > 0:
+        print 'Not summarizing.'
+    print '--------------------'
+    print 'There were ' + str(len(unparsed_cards)) + ' unparsed cards.'
+    if dump_invalid:
+        for card in unparsed_cards:
+            print '\n' + card.raw
+    elif len(unparsed_cards) > 0:
+        print 'Not summarizing.'
+    print '===================='

 def main(fname, oname = None, verbose = False):
    if verbose:
@ -512,7 +725,7 @@ def main(fname, oname = None, verbose = False):
    cardtexts = text.split(utils.cardsep)
    analyze(cardtexts)
    summarize()
-    outliers()
+    outliers(dump_invalid = False)

 if __name__ == '__main__':
    import sys
@ -523,4 +736,3 @@ if __name__ == '__main__':
    else:
        print 'Usage: ' + sys.argv[0] + ' ' + '<encoded file> [output filename]'
        exit(1)
-
--- a/encode.py
+++ b/encode.py
@ -513,7 +513,7 @@ def main(fname, oname = None, verbose = True):
            if oname == None:
                print val + '\n'
            else:
-                ofile.write(val + '\n\n')
+                ofile.write(val + cardsep)
        
    # print len(badwords)
    # for word in badwords:
--- a/jdecode.py
+++ b/jdecode.py
@ -2,7 +2,7 @@ import json

 # to allow filtering of sets like un sets, etc...
 def legal_set(set):
-    return not set['type'] == 'un'
+    # A set is rejected when its 'type' is 'un' or its 'name' is 'Celebration';
+    # every other set is accepted.
+    return not (set['type'] == 'un' or set['name'] == 'Celebration')

 def mtg_open_json(fname, verbose = False):

@ -25,7 +25,8 @@ def mtg_open_json(fname, verbose = False):
                cardnumber = None
                if 'number' in card:
                    cardnumber = card['number']
-                cardname = card['name']
+                # the lower avoids duplication of at least one card (Will-o/O'-the-Wisp)
+                cardname = card['name'].lower()

                uid = set['code']
                if cardnumber == None:
@ -46,8 +47,6 @@ def mtg_open_json(fname, verbose = False):
                if uid[-1:] == 'b':
                    bsides[uid] = card

-        #break
-
    for uid in bsides:
        aside_uid = uid[:-1] + 'a'
        if aside_uid in asides:
--- a/output.txt
+++ b/output.txt
--- a/randomize_mana.py
+++ b/randomize_mana.py
@ -0,0 +1,44 @@
+import utils
+import datamine
+import random
+
+def main(fname, oname = None, verbose = True):
+    # Read an encoded card file, re-encode every card several times with
+    # randomize = True, shuffle the resulting card texts, and emit them.
+    #   fname   - path of the encoded card file to read
+    #   oname   - optional output path; when omitted the cards are printed
+    #   verbose - print progress messages
+    if verbose:
+        print 'Opening encoded card file: ' + fname
+
+    with open(fname, 'rt') as f:
+        text = f.read()
+
+    cardtexts = text.split(utils.cardsep)
+    
+    # overkill
+    datamine.analyze(cardtexts)
+
+    multicards = []
+    reps = 5
+
+    for card in datamine.cards:
+        for i in range(reps):
+            multicards += [card.reencode(randomize = True)]
+            
+    random.shuffle(multicards)
+
+    if oname:
+        if verbose:
+            print 'Writing output to: ' + oname
+        # The write must happen whether or not verbose is set; only the
+        # progress message above is optional.
+        with open(oname, 'w') as ofile:
+            for textcard in multicards:
+                ofile.write(textcard + utils.cardsep)
+    else:
+        for textcard in multicards:
+            print textcard + '\n'
+
+if __name__ == '__main__':
+    # Command-line entry point: <encoded file> [output filename]
+    import sys
+    if len(sys.argv) == 2:
+        main(sys.argv[1])
+    elif len(sys.argv) == 3:
+        main(sys.argv[1], oname = sys.argv[2])
+    else:
+        print 'Usage: ' + sys.argv[0] + ' ' + '<encoded file> [output filename]'
+        # sys.exit rather than the site-provided exit(), which is meant for
+        # interactive use and is absent when the site module is not loaded.
+        sys.exit(1)