code changes for encoding updates and sanity checker script that found them
This commit is contained in:
parent
6421c02f60
commit
d4b5ef2104
3 changed files with 177 additions and 12 deletions
|
@ -257,8 +257,11 @@ def fields_from_json(src_json, linetrans = True):
|
|||
parsed = False
|
||||
|
||||
if 'subtypes' in src_json:
|
||||
fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower()),
|
||||
fields[field_subtypes] = [(-1, map(lambda s: utils.to_ascii(s.lower())
|
||||
# urza's lands...
|
||||
.replace('"', "'").replace('-', utils.dash_marker),
|
||||
src_json['subtypes']))]
|
||||
|
||||
|
||||
if 'rarity' in src_json:
|
||||
if src_json['rarity'] in utils.json_rarity_map:
|
||||
|
|
|
@ -94,6 +94,19 @@ def text_pass_2_cardname(s, name):
|
|||
for override in overrides:
|
||||
s = s.replace(override, this_marker)
|
||||
|
||||
# stupid planeswalker abilities
|
||||
s = s.replace('to him.', 'to ' + this_marker + '.')
|
||||
s = s.replace('to him this', 'to ' + this_marker + ' this')
|
||||
s = s.replace('to himself', 'to itself')
|
||||
s = s.replace("he's", this_marker + ' is')
|
||||
|
||||
# sometimes we actually don't want to do this replacement
|
||||
s = s.replace('named ' + this_marker, 'named ' + name)
|
||||
s = s.replace('name is still ' + this_marker, 'name is still ' + name)
|
||||
s = s.replace('named keeper of ' + this_marker, 'named keeper of ' + name)
|
||||
s = s.replace('named kobolds of ' + this_marker, 'named kobolds of ' + name)
|
||||
s = s.replace('named sword of kaldra, ' + this_marker, 'named sword of kaldra, ' + name)
|
||||
|
||||
return s
|
||||
|
||||
|
||||
|
@ -133,9 +146,12 @@ def text_pass_4b_x(s):
|
|||
s = s.replace(u'x\u2014', x_marker + u'\u2014')
|
||||
s = s.replace('x.', x_marker + '.')
|
||||
s = s.replace('x,', x_marker + ',')
|
||||
s = s.replace('x is', x_marker + ' is')
|
||||
s = s.replace('x can\'t', x_marker + ' can\'t')
|
||||
s = s.replace('x/x', x_marker + '/' + x_marker)
|
||||
s = s.replace('x target', x_marker + ' target')
|
||||
s = s.replace('si' + x_marker + ' target', 'six target')
|
||||
s = s.replace('avara' + x_marker, 'avarax')
|
||||
# there's also some stupid ice age card that wants -x/-y
|
||||
s = s.replace('/~', '/-')
|
||||
return s
|
||||
|
@ -469,25 +485,27 @@ def text_pass_11_linetrans(s):
|
|||
# randomize the order of the lines
|
||||
# not a text pass, intended to be invoked dynamically when encoding a card
|
||||
# call this on fully encoded text, with mana symbols expanded
|
||||
def randomize_lines(text):
|
||||
def separate_lines(text):
|
||||
# forget about level up, ignore empty text too while we're at it
|
||||
if text == '' or 'level up' in text:
|
||||
return [],[],[],[]
|
||||
return [],[],[],[],[]
|
||||
|
||||
preline_search = ['equip', 'fortify', 'enchant ', 'bestow']
|
||||
postline_search = [
|
||||
'countertype', 'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
|
||||
'buyback', 'champion', 'dash', 'entwine', 'evoke', 'fading', 'flashback',
|
||||
costline_search = [
|
||||
'multikicker', 'kicker', 'suspend', 'echo', 'awaken',
|
||||
'buyback', 'dash', 'entwine', 'evoke', 'flashback',
|
||||
'madness', 'megamorph', 'morph', 'miracle', 'ninjutsu', 'overload',
|
||||
'prowl', 'recover', 'reinforce', 'replicate', 'scavenge', 'splice',
|
||||
'surge', 'unearth', 'transmute', 'transfigure', 'vanishing', 'tribute',
|
||||
'surge', 'unearth', 'transmute', 'transfigure',
|
||||
]
|
||||
# cycling is a special case to handle the variants
|
||||
postline_search = ['countertype']
|
||||
keyline_search = ['cumulative']
|
||||
|
||||
prelines = []
|
||||
keylines = []
|
||||
mainlines = []
|
||||
costlines = []
|
||||
postlines = []
|
||||
|
||||
lines = text.split(utils.newline)
|
||||
|
@ -496,26 +514,28 @@ def randomize_lines(text):
|
|||
if not '.' in line:
|
||||
if any(line.startswith(s) for s in preline_search):
|
||||
prelines.append(line)
|
||||
elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
|
||||
elif any(line.startswith(s) for s in postline_search):
|
||||
postlines.append(line)
|
||||
elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
|
||||
costlines.append(line)
|
||||
else:
|
||||
keylines.append(line)
|
||||
elif (utils.dash_marker in line and not
|
||||
(' '+utils.dash_marker+' ' in line or 'non'+utils.dash_marker in line)):
|
||||
if any(line.startswith(s) for s in preline_search):
|
||||
prelines.append(line)
|
||||
elif any(line.startswith(s) for s in postline_search) or 'cycling' in line:
|
||||
postlines.append(line)
|
||||
elif any(line.startswith(s) for s in costline_search) or 'cycling' in line:
|
||||
costlines.append(line)
|
||||
elif any(line.startswith(s) for s in keyline_search):
|
||||
keylines.append(line)
|
||||
else:
|
||||
mainlines.append(line)
|
||||
elif ': monstrosity' in line:
|
||||
postlines.append(line)
|
||||
costlines.append(line)
|
||||
else:
|
||||
mainlines.append(line)
|
||||
|
||||
return prelines, keylines, mainlines, postlines
|
||||
return prelines, keylines, mainlines, costlines, postlines
|
||||
|
||||
|
||||
# Text unpasses, for decoding. All assume the text inside a Manatext, so don't do anything
|
||||
|
|
142
scripts/sanity.py
Executable file
142
scripts/sanity.py
Executable file
|
@ -0,0 +1,142 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
libdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib')
|
||||
sys.path.append(libdir)
|
||||
import utils
|
||||
import jdecode
|
||||
import transforms
|
||||
|
||||
def _collect_lines(card, lines, seen, known=()):
    """Add each entry of `lines` to the set `seen`, reporting blank entries.

    A line that is empty after stripping is suspicious, so the owning card's
    name and raw text are printed for inspection. Lines starting with any
    prefix in `known` are recorded under the single placeholder 'known'.
    """
    known = tuple(known)
    for line in lines:
        if line.strip() == '':
            print(card.name, card.text.text)
        # str.startswith with an empty tuple is always False, so an empty
        # `known` leaves every line untouched.
        if line.startswith(known):
            line = 'known'
        seen.add(line)


def check_lines(fname):
    """Print every distinct line of each category produced by
    transforms.separate_lines over all cards in `fname`.

    Cards are loaded with jdecode.mtg_open_file. For each card (and its
    b-side, when present) the encoded, non-randomized text is split into
    pre / key / main / cost / post lines; the distinct lines of each category
    are accumulated and printed in sorted order after a summary of the counts.

    fname -- path of the card file to load.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    prelines = set()
    keylines = set()
    mainlines = set()
    costlines = set()
    postlines = set()

    # Keyword prefixes that can be folded into one 'known' entry to shorten
    # the pre / post / key reports. Folding is switched off (the previous
    # version reset the list to empty right after building it), so every
    # distinct line is reported; flip collapse_known to turn it back on.
    known_prefixes = (
        'enchant ', 'equip', 'countertype', 'multikicker', 'kicker',
        'suspend', 'echo', 'awaken', 'bestow', 'buyback',
        'cumulative', 'dash', 'entwine', 'evoke', 'fortify',
        'flashback', 'madness', 'morph', 'megamorph', 'miracle', 'ninjutsu',
        'overload', 'prowl', 'recover', 'reinforce', 'replicate', 'scavenge',
        'splice', 'surge', 'unearth', 'transfigure', 'transmute',
    )
    collapse_known = False
    known = known_prefixes if collapse_known else ()

    for card in cards:
        prel, keyl, mainl, costl, postl = transforms.separate_lines(
            card.text.encode(randomize=False))
        if card.bside:
            # Treat the b-side's lines as belonging to the same card.
            prel2, keyl2, mainl2, costl2, postl2 = transforms.separate_lines(
                card.bside.text.encode(randomize=False))
            prel += prel2
            keyl += keyl2
            mainl += mainl2
            costl += costl2
            postl += postl2

        _collect_lines(card, prel, prelines, known)
        _collect_lines(card, postl, postlines, known)
        _collect_lines(card, keyl, keylines, known)
        # Main and cost lines are always reported verbatim, never collapsed.
        _collect_lines(card, mainl, mainlines)
        _collect_lines(card, costl, costlines)

    # The summary now includes the cost-line count, which was missing even
    # though cost lines are collected and listed below.
    print('prel: {:d}, keyl: {:d}, mainl: {:d}, costl: {:d}, postl {:d}'
          .format(len(prelines), len(keylines), len(mainlines),
                  len(costlines), len(postlines)))

    report = (
        ('prelines', prelines),
        ('postlines', postlines),
        ('costlines', costlines),
        ('keylines', keylines),
        ('mainlines', mainlines),
    )
    for title, lines in report:
        print('\n' + title)
        for line in sorted(lines):
            print(line)
|
||||
|
||||
def _card_words(card):
    """Return the whitespace-separated tokens of a card's vectorized text,
    followed by those of its b-side when it has one."""
    words = card.text.vectorize().split()
    if card.bside:
        words += card.bside.text.vectorize().split()
    return words


def check_vocab(fname):
    """Print vocabulary counts for the vectorized text of all cards in `fname`.

    First prints every token with its number of occurrences, most frequent
    first. Then, for each card containing a token that occurs at most `n`
    times in the whole corpus, prints the first such token with its count
    followed by the encoded card.

    fname -- path of the card file to load.
    """
    cards = jdecode.mtg_open_file(fname, verbose=True, linetrans=True)

    vocab = {}
    for card in cards:
        for word in _card_words(card):
            vocab[word] = vocab.get(word, 0) + 1

    # Sort with key= rather than a positional cmp function: the cmp form only
    # works on Python 2, while key= yields the same stable order everywhere.
    for word in sorted(vocab, key=vocab.get, reverse=True):
        print('{:8d} : {:s}'.format(vocab[word], word))

    # Tokens seen at most this many times are considered rare.
    n = 3

    for card in cards:
        for word in _card_words(card):
            if vocab[word] <= n:
                print('\n{:8d} : {:s}'.format(vocab[word], word))
                print(card.encode())
                # One report per card is enough; skip its remaining tokens.
                break
|
||||
|
||||
# Command-line entry point: run the checks selected by -lines / -vocab
# against the given card file.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument('infile', nargs='?',
                        default=os.path.join(libdir, '../data/output.txt'),
                        help='encoded card file or json corpus to process')
    parser.add_argument('-lines', action='store_true',
                        help='show behavior of line separation')
    parser.add_argument('-vocab', action='store_true',
                        help='show vocabulary counts from encoded card text')
    args = parser.parse_args()

    if args.lines:
        check_lines(args.infile)
    if args.vocab:
        check_vocab(args.infile)

    # sys.exit instead of the bare exit(): the latter is injected by the site
    # module for interactive use and is not guaranteed to exist in a script.
    sys.exit(0)
|
Loading…
Reference in a new issue