Code changes for encoding updates, plus the sanity-checker script that found the problems they fix

This commit is contained in:
Bill Zorn 2016-05-06 18:36:35 -07:00
parent 6421c02f60
commit d4b5ef2104
3 changed files with 177 additions and 12 deletions

View file

@ -257,8 +257,11 @@ def fields_from_json(src_json, linetrans = True):
parsed = False parsed = False
if 'subtypes' in src_json: if 'subtypes' in src_json:
fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower()), fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower())
# urza's lands...
.replace('"', "'").replace('-', utils.dash_marker),
src_json['subtypes']))] src_json['subtypes']))]
if 'rarity' in src_json: if 'rarity' in src_json:
if src_json['rarity'] in utils.json_rarity_map: if src_json['rarity'] in utils.json_rarity_map:

View file

@ -94,6 +94,19 @@ def text_pass_2_cardname(s, name):
for override in overrides: for override in overrides:
s = s.replace(override, this_marker) s = s.replace(override, this_marker)
# stupid planeswalker abilities
s = s.replace('to him.', 'to ' + this_marker + '.')
s = s.replace('to him this', 'to ' + this_marker + ' this')
s = s.replace('to himself', 'to itself')
s = s.replace("he's", this_marker + ' is')
# sometimes we actually don't want to do this replacement
s = s.replace('named ' + this_marker, 'named ' + name)
s = s.replace('name is still ' + this_marker, 'name is still ' + name)
s = s.replace('named keeper of ' + this_marker, 'named keeper of ' + name)
s = s.replace('named kobolds of ' + this_marker, 'named kobolds of ' + name)
s = s.replace('named sword of kaldra, ' + this_marker, 'named sword of kaldra, ' + name)
return s return s
@ -133,9 +146,12 @@ def text_pass_4b_x(s):
s = s.replace(u'x\u2014', x_marker + u'\u2014') s = s.replace(u'x\u2014', x_marker + u'\u2014')
s = s.replace('x.', x_marker + '.') s = s.replace('x.', x_marker + '.')
s = s.replace('x,', x_marker + ',') s = s.replace('x,', x_marker + ',')
s = s.replace('x is', x_marker + ' is')
s = s.replace('x can\'t', x_marker + ' can\'t')
s = s.replace('x/x', x_marker + '/' + x_marker) s = s.replace('x/x', x_marker + '/' + x_marker)
s = s.replace('x target', x_marker + ' target') s = s.replace('x target', x_marker + ' target')
s = s.replace('si' + x_marker + ' target', 'six target') s = s.replace('si' + x_marker + ' target', 'six target')
s = s.replace('avara' + x_marker, 'avarax')
# there's also some stupid ice age card that wants -x/-y # there's also some stupid ice age card that wants -x/-y
s = s.replace('/~', '/-') s = s.replace('/~', '/-')
return s return s
@ -469,25 +485,27 @@ def text_pass_11_linetrans(s):
# randomize the order of the lines # randomize the order of the lines
# not a text pass, intended to be invoked dynamically when encoding a card # not a text pass, intended to be invoked dynamically when encoding a card
# call this on fully encoded text, with mana symbols expanded # call this on fully encoded text, with mana symbols expanded
def randomize_lines(text): def separate_lines(text):
# forget about level up, ignore empty text too while we're at it # forget about level up, ignore empty text too while we're at it
if text == '' or 'level up' in text: if text == '' or 'level up' in text:
return [],[],[],[] return [],[],[],[],[]
preline_search = ['equip', 'fortify', 'enchant ', 'bestow'] preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
postline_search = [ costline_search = [
'countertype', 'multikicker', 'kicker', 'suspend', 'echo', 'awaken', 'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
'buyback', 'champion', 'dash', 'entwine', 'evoke', 'fading', 'flashback', 'buyback', 'dash', 'entwine', 'evoke', 'flashback',
'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload', 'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload',
'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice',
'surge', 'unearth', 'transmute', 'transfigure', 'vanishing', 'tribute', 'surge', 'unearth', 'transmute', 'transfigure',
] ]
# cycling is a special case to handle the variants # cycling is a special case to handle the variants
postline_search = ['countertype']
keyline_search = ['cumulative'] keyline_search = ['cumulative']
prelines = [] prelines = []
keylines = [] keylines = []
mainlines = [] mainlines = []
costlines = []
postlines = [] postlines = []
lines = text.split(utils.newline) lines = text.split(utils.newline)
@ -496,26 +514,28 @@ def randomize_lines(text):
if not '.' in line: if not '.' in line:
if any(line.startswith(s) for s in preline_search): if any(line.startswith(s) for s in preline_search):
prelines.append(line) prelines.append(line)
elif any(line.startswith(s) for s in postline_search) or 'cycling' in line: elif any(line.startswith(s) for s in postline_search):
postlines.append(line) postlines.append(line)
elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
costlines.append(line)
else: else:
keylines.append(line) keylines.append(line)
elif (utils.dash_marker in line and not elif (utils.dash_marker in line and not
(' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)): (' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)):
if any(line.startswith(s) for s in preline_search): if any(line.startswith(s) for s in preline_search):
prelines.append(line) prelines.append(line)
elif any(line.startswith(s) for s in postline_search) or 'cycling' in line: elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
postlines.append(line) costlines.append(line)
elif any(line.startswith(s) for s in keyline_search): elif any(line.startswith(s) for s in keyline_search):
keylines.append(line) keylines.append(line)
else: else:
mainlines.append(line) mainlines.append(line)
elif ': monstrosity' in line: elif ': monstrosity' in line:
postlines.append(line) costlines.append(line)
else: else:
mainlines.append(line) mainlines.append(line)
return prelines, keylines, mainlines, postlines return prelines, keylines, mainlines, costlines, postlines
# Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything # Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything

142
scripts/sanity.py Executable file
View file

@ -0,0 +1,142 @@
#!/usr/bin/env python
import sys
import os
import re
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
sys.path.append(libdir)
import utils
import jdecode
import transforms
# Keyword prefixes that can be passed as ``known`` to check_lines() so that
# already-understood keyword lines collapse into a single 'known' entry.
KNOWN_LINE_PREFIXES = (
    'enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
    'suspend', 'echo', 'awaken', 'bestow', 'buyback',
    'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
    'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
    'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
    'splice', 'surge', 'unearth', 'transfigure', 'transmute',
)


def check_lines(fname, known=()):
    """Report how transforms.separate_lines() categorizes card text lines.

    Every card loaded from *fname* (and its b-side, when it has one) is
    encoded without randomization and passed through
    transforms.separate_lines(). The distinct lines seen in each of the five
    returned categories are collected, then printed as a count summary
    followed by one sorted listing per category.

    Whenever a blank (whitespace-only) line turns up in any category, the
    card's name and text are printed immediately so the offender can be found.

    Args:
        fname: path handed to jdecode.mtg_open_file().
        known: optional iterable of line prefixes. A pre-, post- or key-line
            starting with one of them is recorded as the literal 'known'
            instead of its full text. Main and cost lines are never
            collapsed. The default (empty) reports every distinct line;
            pass KNOWN_LINE_PREFIXES to hide the usual keywords.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)
    known = tuple(known)

    prelines = set()
    keylines = set()
    mainlines = set()
    costlines = set()
    postlines = set()

    for card in cards:
        prel, keyl, mainl, costl, postl = transforms.separate_lines(
            card.text.encode(randomize=False))
        if card.bside:
            prel2, keyl2, mainl2, costl2, postl2 = transforms.separate_lines(
                card.bside.text.encode(randomize=False))
            prel += prel2
            keyl += keyl2
            mainl += mainl2
            costl += costl2
            postl += postl2

        # (lines from this card, accumulator, whether 'known' collapsing applies)
        # The order fixes the order in which blank-line reports are printed.
        categories = (
            (prel, prelines, True),
            (postl, postlines, True),
            (keyl, keylines, True),
            (mainl, mainlines, False),
            (costl, costlines, False),
        )
        for lines, seen, collapse in categories:
            for line in lines:
                if line.strip() == '':
                    print(card.name, card.text.text)
                # startswith() with an empty tuple is simply False
                if collapse and line.startswith(known):
                    line = 'known'
                seen.add(line)

    # Counts are listed in the order separate_lines() returns the categories.
    print('prel: {:d}, keyl: {:d}, mainl: {:d}, costl: {:d}, postl: {:d}'
          .format(len(prelines), len(keylines), len(mainlines),
                  len(costlines), len(postlines)))

    listings = (
        ('prelines', prelines),
        ('postlines', postlines),
        ('costlines', costlines),
        ('keylines', keylines),
        ('mainlines', mainlines),
    )
    for title, seen in listings:
        print('\n' + title)
        for line in sorted(seen):
            print(line)
def check_vocab(fname, max_count=3):
    """Print vocabulary frequencies for the cards in *fname*.

    The vocabulary is built from the whitespace-separated tokens of each
    card's ``text.vectorize()`` output (plus the b-side's, when present).
    Two reports are printed:

    1. every token with its occurrence count, most frequent first;
    2. for each card containing a rare token (count <= *max_count*), that
       token with its count followed by ``card.encode()``. Only the first
       rare token found on a card is reported.

    Args:
        fname: path handed to jdecode.mtg_open_file().
        max_count: highest occurrence count still treated as rare.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    def card_words(card):
        # Tokens of the card's vectorized text, including its b-side.
        words = card.text.vectorize().split()
        if card.bside:
            words += card.bside.text.vectorize().split()
        return words

    vocab = {}
    for card in cards:
        for word in card_words(card):
            vocab[word] = vocab.get(word, 0) + 1

    # key= works on Python 2 and 3 alike (cmp-style sorting is Python 2
    # only) and, the sort being stable, yields the same order.
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    for card in cards:
        for word in card_words(card):
            if vocab[word] <= max_count:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                break
# Command-line entry point: run the requested sanity checks on one card file.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    # With no positional argument, fall back to data/output.txt relative to lib.
    parser.add_argument('infile', nargs='?',
                        default=os.path.join(libdir, '../data/output.txt'),
                        help='encoded card file or json corpus to process')
    parser.add_argument('-lines', action='store_true',
                        help='show behavior of line separation')
    parser.add_argument('-vocab', action='store_true',
                        help='show vocabulary counts from encoded card text')
    args = parser.parse_args()

    # The two checks are independent; both run when both flags are given.
    if args.lines:
        check_lines(args.infile)
    if args.vocab:
        check_vocab(args.infile)

    # sys.exit() instead of the bare exit() helper: the latter is injected by
    # the site module for interactive use and is missing under `python -S`.
    sys.exit(0)