code changes for encoding updates and sanity checker script that found them

2016-05-06 18:36:35 -07:00 · 2016-05-06 18:36:35 -07:00 · d4b5ef2104
commit d4b5ef2104
parent 6421c02f60
3 changed files with 177 additions and 12 deletions
--- a/lib/cardlib.py
+++ b/lib/cardlib.py
@ -257,8 +257,11 @@ def fields_from_json(src_json, linetrans = True):
        parsed = False
    if 'subtypes' in src_json:
-        fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower()), 
+        fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower())
                                           # urza's lands...
                                           .replace('"', "'").replace('-', utils.dash_marker), 
                                           src_json['subtypes']))]
    if 'rarity' in src_json:
        if src_json['rarity'] in utils.json_rarity_map:
--- a/lib/transforms.py
+++ b/lib/transforms.py
@ -94,6 +94,19 @@ def text_pass_2_cardname(s, name):
    for override in overrides:
        s = s.replace(override, this_marker)
    # stupid planeswalker abilities
    s = s.replace('to him.', 'to ' + this_marker + '.')
    s = s.replace('to him this', 'to ' + this_marker + ' this')
    s = s.replace('to himself', 'to itself')
    s = s.replace("he's", this_marker + ' is')
    # sometimes we actually don't want to do this replacement
    s = s.replace('named ' + this_marker, 'named ' + name)
    s = s.replace('name is still ' + this_marker, 'name is still ' + name)
    s = s.replace('named keeper of ' + this_marker, 'named keeper of ' + name)
    s = s.replace('named kobolds of ' + this_marker, 'named kobolds of ' + name)
    s = s.replace('named sword of kaldra, ' + this_marker, 'named sword of kaldra, ' + name)
    return s
@ -133,9 +146,12 @@ def text_pass_4b_x(s):
    s = s.replace(u'x\u2014', x_marker + u'\u2014')
    s = s.replace('x.', x_marker + '.')
    s = s.replace('x,', x_marker + ',')
    s = s.replace('x is', x_marker + ' is')
    s = s.replace('x can\'t', x_marker + ' can\'t')
    s = s.replace('x/x', x_marker + '/' + x_marker)
    s = s.replace('x target', x_marker + ' target')
    s = s.replace('si' + x_marker + ' target', 'six target')
    s = s.replace('avara' + x_marker, 'avarax')
    # there's also some stupid ice age card that wants -x/-y
    s = s.replace('/~', '/-')
    return s
@ -469,25 +485,27 @@ def text_pass_11_linetrans(s):
 # randomize the order of the lines
 # not a text pass, intended to be invoked dynamically when encoding a card
 # call this on fully encoded text, with mana symbols expanded
-def randomize_lines(text):
+def separate_lines(text):
    # forget about level up, ignore empty text too while we're at it
    if text == '' or 'level up' in text:
-        return [],[],[],[]
+        return [],[],[],[],[]
    preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
-    postline_search = [
+    costline_search = [
-        'countertype', 'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
+        'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
-        'buyback', 'champion', 'dash', 'entwine', 'evoke', 'fading', 'flashback',
+        'buyback', 'dash', 'entwine', 'evoke', 'flashback',
        'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload',
        'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice',
-        'surge', 'unearth', 'transmute', 'transfigure', 'vanishing', 'tribute',
+        'surge', 'unearth', 'transmute', 'transfigure',
    ]
    # cycling is a special case to handle the variants
    postline_search = ['countertype']
    keyline_search = ['cumulative']
    prelines = []
    keylines = []
    mainlines = []
    costlines = []
    postlines = []
    lines = text.split(utils.newline)
@ -496,26 +514,28 @@ def randomize_lines(text):
        if not '.' in line:
            if any(line.startswith(s) for s in preline_search):
                prelines.append(line)
-            elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
+            elif any(line.startswith(s) for s in postline_search):
                postlines.append(line)
            elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
                costlines.append(line)
            else:
                keylines.append(line)
        elif (utils.dash_marker in line and not 
              (' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)):
            if any(line.startswith(s) for s in preline_search):
                prelines.append(line)
-            elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
+            elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
-                postlines.append(line)
+                costlines.append(line)
            elif any(line.startswith(s) for s in keyline_search):
                keylines.append(line)
            else:
                mainlines.append(line)
        elif ': monstrosity' in line:
-            postlines.append(line)
+            costlines.append(line)
        else:
            mainlines.append(line)
-    return prelines, keylines, mainlines, postlines
+    return prelines, keylines, mainlines, costlines, postlines
 # Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything
--- a/scripts/sanity.py
+++ b/scripts/sanity.py
@ -0,0 +1,142 @@
 #!/usr/bin/env python
 import sys
 import os
 import re
 libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
 sys.path.append(libdir)
 import utils
 import jdecode
 import transforms
 def check_lines(fname):
    """Report every distinct line that separate_lines() files under each category.

    Opens fname with jdecode.mtg_open_file, encodes the text of each card
    (plus the text of its b-side, when it has one) with randomize=False and
    splits the result with transforms.separate_lines. The distinct lines seen
    in each of the five categories are collected, a summary of the category
    sizes is printed, and then each category is dumped in sorted order.

    Whenever a category contains a blank line, the card's name and text are
    printed immediately so the offending card can be inspected.

    fname -- path handed straight to jdecode.mtg_open_file
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    # Prefixes of keyword lines that are already understood. Lines starting
    # with one of these are collapsed into the single entry 'known' for the
    # categories listed in collapse_known below, which shortens the report.
    known = ['enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
             'suspend', 'echo', 'awaken', 'bestow', 'buyback',
             'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
             'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
             'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
             'splice', 'surge', 'unearth', 'transfigure', 'transmute',
    ]
    # The collapsing is switched off by emptying the list, so every line is
    # reported verbatim. Remove this line to turn the collapsing back on.
    known = []

    # Order in which a card's categories are scanned; it decides the order of
    # the blank-line diagnostics printed for that card.
    scan_order = ('prelines', 'postlines', 'keylines', 'mainlines', 'costlines')
    # Order of the sections in the final report.
    report_order = ('prelines', 'postlines', 'costlines', 'keylines', 'mainlines')
    # Main and cost lines are always reported verbatim, never collapsed.
    collapse_known = ('prelines', 'postlines', 'keylines')

    seen = dict((label, set()) for label in scan_order)

    for card in cards:
        prel, keyl, mainl, costl, postl = transforms.separate_lines(
            card.text.encode(randomize=False))
        if card.bside:
            prel2, keyl2, mainl2, costl2, postl2 = transforms.separate_lines(
                card.bside.text.encode(randomize=False))
            prel = prel + prel2
            keyl = keyl + keyl2
            mainl = mainl + mainl2
            costl = costl + costl2
            postl = postl + postl2
        by_label = {
            'prelines': prel,
            'keylines': keyl,
            'mainlines': mainl,
            'costlines': costl,
            'postlines': postl,
        }
        for label in scan_order:
            collapsible = label in collapse_known
            for line in by_label[label]:
                if line.strip() == '':
                    print(card.name, card.text.text)
                if collapsible and any(line.startswith(s) for s in known):
                    line = 'known'
                seen[label].add(line)

    # The summary now includes the cost lines, which were collected and
    # printed below but missing from the counts.
    print('prel: {:d}, keyl: {:d}, mainl: {:d}, costl: {:d}, postl {:d}'
          .format(len(seen['prelines']), len(seen['keylines']),
                  len(seen['mainlines']), len(seen['costlines']),
                  len(seen['postlines'])))

    for label in report_order:
        print('\n' + label)
        for line in sorted(seen[label]):
            print(line)
 def check_vocab(fname):
    """Print word frequencies of the vectorized card text, then rare-word samples.

    Opens fname with jdecode.mtg_open_file and counts every whitespace
    separated token of each card's vectorized text (b-side text included when
    the card has one). The counts are printed from most to least frequent.
    Afterwards, for every card containing a word that occurs at most n times
    (n = 3), the first such word and the encoded card are printed.

    fname -- path handed straight to jdecode.mtg_open_file
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    def card_words(card):
        # all tokens of the card's vectorized text, b-side appended if present
        words = card.text.vectorize().split()
        if card.bside:
            words += card.bside.text.vectorize().split()
        return words

    vocab = {}
    for card in cards:
        for word in card_words(card):
            vocab[word] = vocab.get(word, 0) + 1

    # Sort with key= rather than a cmp function: the positional cmp argument
    # and the cmp() builtin only exist on Python 2, while key= works on both
    # and yields the same stable ordering.
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    # Show one example card per rare word.
    # NOTE(review): cards is walked a second time here, which relies on
    # mtg_open_file returning a re-iterable collection -- confirm.
    n = 3
    for card in cards:
        for word in card_words(card):
            if vocab[word] <= n:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                # one sample per card is enough
                break
 if __name__ == '__main__':
    # Command-line entry point: run the requested sanity checks on one input
    # file. With neither -lines nor -vocab given, nothing is checked.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', default=os.path.join(libdir, '../data/output.txt'),
                        help='encoded card file or json corpus to process')
    parser.add_argument('-lines', action='store_true',
                        help='show behavior of line separation')
    parser.add_argument('-vocab', action='store_true',
                        help='show vocabulary counts from encoded card text')
    args = parser.parse_args()

    if args.lines:
        check_lines(args.infile)
    if args.vocab:
        check_vocab(args.infile)

    # sys.exit rather than the bare exit(): the latter is a helper injected by
    # the site module for interactive use and is not guaranteed to exist.
    sys.exit(0)