code changes for encoding updates and sanity checker script that found them

2016-05-06 18:36:35 -07:00 · 2016-05-06 18:36:35 -07:00 · d4b5ef2104
commit d4b5ef2104
parent 6421c02f60
3 changed files with 177 additions and 12 deletions
--- a/lib/cardlib.py
+++ b/lib/cardlib.py
@ -257,8 +257,11 @@ def fields_from_json(src_json, linetrans = True):
        parsed = False

    if 'subtypes' in src_json:
-        fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower()), 
+        fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower())
+                                           # urza's lands...
+                                           .replace('"', "'").replace('-', utils.dash_marker), 
                                           src_json['subtypes']))]
+        

    if 'rarity' in src_json:
        if src_json['rarity'] in utils.json_rarity_map:
--- a/lib/transforms.py
+++ b/lib/transforms.py
@ -94,6 +94,19 @@ def text_pass_2_cardname(s, name):
    for override in overrides:
        s = s.replace(override, this_marker)

+    # stupid planeswalker abilities
+    s = s.replace('to him.', 'to ' + this_marker + '.')
+    s = s.replace('to him this', 'to ' + this_marker + ' this')
+    s = s.replace('to himself', 'to itself')
+    s = s.replace("he's", this_marker + ' is')
+
+    # sometimes we actually don't want to do this replacement
+    s = s.replace('named ' + this_marker, 'named ' + name)
+    s = s.replace('name is still ' + this_marker, 'name is still ' + name)
+    s = s.replace('named keeper of ' + this_marker, 'named keeper of ' + name)
+    s = s.replace('named kobolds of ' + this_marker, 'named kobolds of ' + name)
+    s = s.replace('named sword of kaldra, ' + this_marker, 'named sword of kaldra, ' + name)
+
    return s


@ -133,9 +146,12 @@ def text_pass_4b_x(s):
    s = s.replace(u'x\u2014', x_marker + u'\u2014')
    s = s.replace('x.', x_marker + '.')
    s = s.replace('x,', x_marker + ',')
+    s = s.replace('x is', x_marker + ' is')
+    s = s.replace('x can\'t', x_marker + ' can\'t')
    s = s.replace('x/x', x_marker + '/' + x_marker)
    s = s.replace('x target', x_marker + ' target')
    s = s.replace('si' + x_marker + ' target', 'six target')
+    s = s.replace('avara' + x_marker, 'avarax')
    # there's also some stupid ice age card that wants -x/-y
    s = s.replace('/~', '/-')
    return s
@ -469,25 +485,27 @@ def text_pass_11_linetrans(s):
 # randomize the order of the lines
 # not a text pass, intended to be invoked dynamically when encoding a card
 # call this on fully encoded text, with mana symbols expanded
-def randomize_lines(text):
+def separate_lines(text):
    # forget about level up, ignore empty text too while we're at it
    if text == '' or 'level up' in text:
-        return [],[],[],[]
+        return [],[],[],[],[]
    
    preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
-    postline_search = [
-        'countertype', 'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
-        'buyback', 'champion', 'dash', 'entwine', 'evoke', 'fading', 'flashback',
+    costline_search = [
+        'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
+        'buyback', 'dash', 'entwine', 'evoke', 'flashback',
        'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload',
        'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice',
-        'surge', 'unearth', 'transmute', 'transfigure', 'vanishing', 'tribute',
+        'surge', 'unearth', 'transmute', 'transfigure',
    ]
    # cycling is a special case to handle the variants
+    postline_search = ['countertype']
    keyline_search = ['cumulative']

    prelines = []
    keylines = []
    mainlines = []
+    costlines = []
    postlines = []

    lines = text.split(utils.newline)
@ -496,26 +514,28 @@ def randomize_lines(text):
        if not '.' in line:
            if any(line.startswith(s) for s in preline_search):
                prelines.append(line)
-            elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
+            elif any(line.startswith(s) for s in postline_search):
                postlines.append(line)
+            elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
+                costlines.append(line)
            else:
                keylines.append(line)
        elif (utils.dash_marker in line and not 
              (' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)):
            if any(line.startswith(s) for s in preline_search):
                prelines.append(line)
-            elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
-                postlines.append(line)
+            elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
+                costlines.append(line)
            elif any(line.startswith(s) for s in keyline_search):
                keylines.append(line)
            else:
                mainlines.append(line)
        elif ': monstrosity' in line:
-            postlines.append(line)
+            costlines.append(line)
        else:
            mainlines.append(line)

-    return prelines, keylines, mainlines, postlines
+    return prelines, keylines, mainlines, costlines, postlines


 # Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything
--- a/scripts/sanity.py
+++ b/scripts/sanity.py
@ -0,0 +1,142 @@
+#!/usr/bin/env python
+import sys
+import os
+import re
+
+libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
+sys.path.append(libdir)
+import utils
+import jdecode
+import transforms
+
+def check_lines(fname):
+    """Print the distinct lines separate_lines assigns to each category.
+
+    Opens fname with jdecode.mtg_open_file, runs transforms.separate_lines
+    on the encoded text of every card (plus its b-side, when it has one),
+    and collects the unique lines per category. Any blank line encountered
+    is reported immediately together with the card's name and text. At the
+    end a count summary is printed, followed by the sorted unique lines of
+    each category.
+    """
+    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)
+
+    prelines = set()
+    keylines = set()
+    mainlines = set()
+    costlines = set()
+    postlines = set()
+
+    known = ['enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
+             'suspend', 'echo', 'awaken', 'bestow', 'buyback',
+             'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
+             'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
+             'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
+             'splice', 'surge', 'unearth', 'transfigure', 'transmute',
+    ]
+    # the prefix list above is currently switched off, so no line is
+    # collapsed to 'known'; delete this override to turn collapsing back on
+    known = []
+
+    def collect(card, lines, seen, collapse_known):
+        # add each line to seen, reporting blank lines with their card;
+        # optionally fold lines starting with a known prefix into 'known'
+        for line in lines:
+            if line.strip() == '':
+                print(card.name, card.text.text)
+            if collapse_known and any(line.startswith(s) for s in known):
+                line = 'known'
+            seen.add(line)
+
+    for card in cards:
+        prel, keyl, mainl, costl, postl = transforms.separate_lines(card.text.encode(randomize=False))
+        if card.bside:
+            prel2, keyl2, mainl2, costl2, postl2 = transforms.separate_lines(card.bside.text.encode(randomize=False))
+            prel += prel2
+            keyl += keyl2
+            mainl += mainl2
+            costl += costl2
+            postl += postl2
+
+        # main and cost lines are never collapsed, so every one is listed
+        collect(card, prel, prelines, True)
+        collect(card, postl, postlines, True)
+        collect(card, keyl, keylines, True)
+        collect(card, mainl, mainlines, False)
+        collect(card, costl, costlines, False)
+
+    # the summary covers all five categories, cost lines included
+    print('prel: {:d}, keyl: {:d}, mainl: {:d}, costl: {:d}, postl {:d}'
+          .format(len(prelines), len(keylines), len(mainlines),
+                  len(costlines), len(postlines)))
+
+    print('\nprelines')
+    for line in sorted(prelines):
+        print(line)
+
+    print('\npostlines')
+    for line in sorted(postlines):
+        print(line)
+
+    print('\ncostlines')
+    for line in sorted(costlines):
+        print(line)
+
+    print('\nkeylines')
+    for line in sorted(keylines):
+        print(line)
+
+    print('\nmainlines')
+    for line in sorted(mainlines):
+        print(line)
+
+def check_vocab(fname):
+    """Print word frequencies of the vectorized card text, then rare words.
+
+    Opens fname with jdecode.mtg_open_file and counts every
+    whitespace-separated token of each card's vectorized text (plus the
+    b-side's, when present). Prints the tokens from most to least frequent,
+    then, for each card containing a token seen at most n (3) times, prints
+    the first such token followed by the encoded card.
+    """
+    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)
+
+    def card_words(card):
+        # tokens of the card's vectorized text, b-side included if present
+        words = card.text.vectorize().split()
+        if card.bside:
+            words += card.bside.text.vectorize().split()
+        return words
+
+    vocab = {}
+    for card in cards:
+        for word in card_words(card):
+            vocab[word] = vocab.get(word, 0) + 1
+
+    # key= sorts identically on Python 2 and 3; the cmp-function form that
+    # was used here before only exists on Python 2
+    for word in sorted(vocab, key=vocab.get, reverse=True):
+        print('{:8d} : {:s}'.format(vocab[word], word))
+
+    n = 3
+
+    # NOTE(review): this second pass assumes cards can be iterated twice
+    # (i.e. mtg_open_file returns a list-like object) - confirm
+    for card in cards:
+        for word in card_words(card):
+            if vocab[word] <= n:
+                print('\n{:8d} : {:s}'.format(vocab[word], word))
+                print(card.encode())
+                # one report per card is enough
+                break
+
+if __name__ == '__main__':
+    # command-line entry point: run the checks selected by -lines / -vocab
+    # against infile (defaults to data/output.txt next to the lib directory)
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('infile', nargs='?', default=os.path.join(libdir, '../data/output.txt'),
+                        help='encoded card file or json corpus to process')
+    parser.add_argument('-lines', action='store_true',
+                        help='show behavior of line separation')
+    parser.add_argument('-vocab', action='store_true',
+                        help='show vocabulary counts from encoded card text')
+    args = parser.parse_args()
+
+    if args.lines:
+        check_lines(args.infile)
+    if args.vocab:
+        check_vocab(args.infile)
+
+    # sys.exit rather than the bare exit() helper, which is injected by the
+    # site module and is missing under `python -S` or in frozen builds
+    sys.exit(0)